Skip to content

Commit

Permalink
performance optimizations from tuning + roofline
Browse files Browse the repository at this point in the history
  • Loading branch information
mhaseeb123 committed Oct 29, 2022
1 parent 5fbf4ff commit 45ccae6
Show file tree
Hide file tree
Showing 10 changed files with 55 additions and 62 deletions.
3 changes: 3 additions & 0 deletions cmake/Templates/config.hpp.in
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,9 @@
// number of X-axis samples in the histogram
#define Xsamples 128

// y-axis multiplier for experimental spectra data
#define YAXISMULTIPLIER 100

// FUTURE: Only b- and y-ions used in current implementation

// Define the Ion Series and add it to iSERIES
Expand Down
2 changes: 1 addition & 1 deletion source/apps/argp/argp.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -300,7 +300,7 @@ void getParams(gParams &params)
params.min_cpsm = parser.hits;

// Base Intensity x 1000
params.base_int = parser.base_int * 1000;
params.base_int = parser.base_int * YAXISMULTIPLIER;

// Cutoff intensity ratio (add 0.5 for nearest rounding)
params.min_int = static_cast<double_t>(params.base_int) * parser.cutoff + 0.5;
Expand Down
3 changes: 0 additions & 3 deletions source/core/cuda/superstep1/kernel.cu
Original file line number Diff line number Diff line change
Expand Up @@ -730,9 +730,6 @@ __global__ void GenerateFragIonData(uint_t *d_fragIon, pepEntry *d_pepEntry, cha
Spectrum[idx] = (myVal + PROTONS(myCharge)) * scale / myCharge;
}
}
// do we need this?
// __syncthreads();
}
// -------------------------------------------------------------------------------------------- //
Expand Down
2 changes: 1 addition & 1 deletion source/core/cuda/superstep2/kernel.cu
Original file line number Diff line number Diff line change
Expand Up @@ -580,7 +580,7 @@ std::array<int, 2> readAndPreprocess(string_t &filename)
if constexpr (std::is_same<int, spectype_t>::value)
{
mzs[m_idx] = std::atof(mz.c_str()) * params.scale;
intns[m_idx] = std::atof(intn.c_str()) * 1000;
intns[m_idx] = std::atof(intn.c_str()) * YAXISMULTIPLIER;
}
else
{
Expand Down
77 changes: 35 additions & 42 deletions source/core/cuda/superstep3/kernel.cu
Original file line number Diff line number Diff line change
Expand Up @@ -580,19 +580,28 @@ __host__ status_t SearchKernel(spmat_t *mat, Queries<spectype_t> *gWorkPtr, int
auto d_BYC = std::get<0>(pBYC);
auto maxchunk = std::get<1>(pBYC);
// set the shared memory to 48KB
cudaFuncSetAttribute(SpSpGEMM, cudaFuncAttributeMaxDynamicSharedMemorySize, KBYTES(48));
// block size is 512 for open-search
int blocksize = 512;
// set block size to 320 for closed-search
if (params.dM < 50)
blocksize = 320;
// calculate shmembytes needed
int shmembytes = std::max((unsigned long)HISTOGRAM_SIZE, (sizeof(BYC) + sizeof(double)) * blocksize);
// set the shared memory to 16KB
cudaFuncSetAttribute(SpSpGEMM, cudaFuncAttributeMaxDynamicSharedMemorySize, KBYTES(16));
for (int iter = 0 ; iter < niters ; iter++)
{
int nblocks = itersize;
const int blocksize = 320; // block size of 256 or 320 are good for better load factor
// if last iteration, adjust the number of blocks
if (iter == niters - 1)
nblocks = nspectra - iter * itersize;
hcp::gpu::cuda::s3::SpSpGEMM<<<nblocks, blocksize, KBYTES(32), driver->stream[SEARCH_STREAM]>>>(d_WorkPtr->moz, d_WorkPtr->intensity, d_WorkPtr->idx, d_WorkPtr->minlimits, d_WorkPtr->maxlimits, d_bA, d_iA, iter * itersize, d_BYC, maxchunk, d_Scores->survival, d_Scores->cpsms, d_Scores->topscore, params.dF, speclen, params.max_mass, params.scale, params.min_shp, ixx);
hcp::gpu::cuda::s3::SpSpGEMM<<<nblocks, blocksize, shmembytes, driver->stream[SEARCH_STREAM]>>>(d_WorkPtr->moz, d_WorkPtr->intensity, d_WorkPtr->idx, d_WorkPtr->minlimits, d_WorkPtr->maxlimits, d_bA, d_iA, iter * itersize, d_BYC, maxchunk, d_Scores->survival, d_Scores->cpsms, d_Scores->topscore, params.dF, speclen, params.max_mass, params.scale, params.min_shp, ixx);
// synchronize the stream
driver->stream_sync(SEARCH_STREAM);
Expand Down Expand Up @@ -721,26 +730,19 @@ __global__ void SpSpGEMM(spectype_t *dQ_moz, spectype_t *dQ_intensity, uint_t *d
{
uint_t raw = d_iA[ion];
/* Calculate parent peptide ID */
int_t ppid = (raw / speclen);
/* Calculate the residue */
int_t residue = (raw % speclen);
/* Calculate the residue (raw % speclen) and isY */
short isY = (((raw % speclen)) / halfspeclen);
/* Either 0 or 1 */
int isY = residue / halfspeclen;
int isB = 1 - isY;
// key - ppid
myKey = ppid;
// key = parent peptide ID
myKey = (raw / speclen);
// write to keys
keys[threadIdx.x] = myKey;
/* Get the map element */
myVal = &vals[threadIdx.x];
myVal->bc = isB;
myVal->ibc = intn * isB;
myVal->bc = (1 - isY);
myVal->ibc = intn * (1 - isY);
myVal->yc = isY;
myVal->iyc = intn * isY;
}
Expand All @@ -755,7 +757,7 @@ __global__ void SpSpGEMM(spectype_t *dQ_moz, spectype_t *dQ_intensity, uint_t *d
// number of active threads
int activethds = min(blockDim.x, ends - ion + threadIdx.x + 1);
int iters = hcp::gpu::cuda::s1::log2ceil(activethds);
short iters = hcp::gpu::cuda::s1::log2ceil(activethds);
// is this thread a part of a localized group (and not group leader)
bool isGroup = false;
Expand Down Expand Up @@ -853,37 +855,28 @@ __global__ void SpSpGEMM(spectype_t *dQ_moz, spectype_t *dQ_intensity, uint_t *d
// filter by the min shared peaks
if (shpk >= min_shp)
{
// Create a heap cell
dhCell cell;
// get the precomputed log(factorial(x))
double_t h1 = d_lgFact[bcc] + d_lgFact[ycc];
// Fill in the information
cell.hyperscore = h1 + log10f(1 + bycPtr[it].ibc) + log10f(1 + bycPtr[it].iyc) - 6;
double_t hyperscore = d_lgFact[bcc] + d_lgFact[ycc] + log10f(1 + bycPtr[it].ibc) + log10f(1 + bycPtr[it].iyc) - 4;
// hyperscore < 0 means either b- or y- ions were not matched
if (cell.hyperscore > 0)
if (hyperscore > 0)
{
if (cell.hyperscore >= MAX_HYPERSCORE)
cell.hyperscore = MAX_HYPERSCORE - 1;
cell.idxoffset = ixx;
cell.psid = it;
cell.sharedions = shpk;
if (hyperscore >= MAX_HYPERSCORE)
hyperscore = MAX_HYPERSCORE - 1;
// increment local candidate psms by +1
cpss +=1;
// Update the histogram
atomicAdd(&histogram[(int)(cell.hyperscore * 10 + 0.5)], 1);
atomicAdd(&histogram[(int)(hyperscore * 10 + 0.5)], 1);
if (cell.hyperscore > topscores.hyperscore)
if (hyperscore > topscores.hyperscore)
{
topscores.hyperscore = cell.hyperscore;
topscores.psid = cell.psid;
topscores.idxoffset = cell.idxoffset;
topscores.sharedions = cell.sharedions;
topscores.hyperscore = hyperscore;
topscores.psid = it;
topscores.idxoffset = ixx;
topscores.sharedions = shpk;
}
}
}
Expand All @@ -908,23 +901,23 @@ __global__ void SpSpGEMM(spectype_t *dQ_moz, spectype_t *dQ_intensity, uint_t *d
}
// local candidate psms
int cpsms_loc = 0;
int l_cpsms = 0;
// sum the local cpsms to get the count
hcp::gpu::cuda::s3::blockSum(cpss, cpsms_loc);
hcp::gpu::cuda::s3::blockSum(cpss, l_cpsms);
// add to the global memory sum
if (!threadIdx.x)
*cpsms = *cpsms + cpsms_loc;
*cpsms = *cpsms + l_cpsms;
// need this to avoid race conditions. Why?
// need this to avoid race conditions
__syncthreads();
// copy histogram to the global memory
for (int ii = threadIdx.x; ii < HISTOGRAM_SIZE; ii+=blockDim.x)
survival[ii] += histogram[ii];
// need this to avoid race conditions. Why?
// need this to avoid race conditions
__syncthreads();
// reset the bycPtr
Expand Down
16 changes: 8 additions & 8 deletions source/core/cuda/superstep4/kernel.cu
Original file line number Diff line number Diff line change
Expand Up @@ -533,18 +533,18 @@ __host__ status_t processResults(Index *index, Queries<spectype_t> *gWorkPtr, in
auto TailFit_ptr = static_cast<void (*)(double *, int *, dhCell *, double *, short)>(&TailFit);

// IMPORTANT: make sure at least 32KB+ shared memory is available to the TailFit kernel
cudaFuncSetAttribute(*TailFit_ptr, cudaFuncAttributeMaxDynamicSharedMemorySize, KBYTES(32));
cudaFuncSetAttribute(*TailFit_ptr, cudaFuncAttributeMaxDynamicSharedMemorySize, KBYTES(24));

// the tailfit kernel
hcp::gpu::cuda::s4::TailFit<<<numSpecs, blockSize, KBYTES(32), driver->get_stream(SEARCH_STREAM)>>>(d_Scores->survival, d_Scores->cpsms, d_Scores->topscore, d_evalues, (short)params.min_cpsm);
hcp::gpu::cuda::s4::TailFit<<<numSpecs, blockSize, KBYTES(24), driver->get_stream(SEARCH_STREAM)>>>(d_Scores->survival, d_Scores->cpsms, d_Scores->topscore, d_evalues, (short)params.min_cpsm);
#else
// IMPORTANT: make sure at least 32KB+ shared memory is available to the logWeibullfit kernel
//cudaFuncSetAttribute(logWeibullFit, cudaFuncAttributeMaxDynamicSharedMemorySize, KBYTES(32));
//cudaFuncSetAttribute(logWeibullFit, cudaFuncAttributeMaxDynamicSharedMemorySize, KBYTES(24));
// the logWeibullfit kernel
//hcp::gpu::cuda::s4::logWeibullFit<<<numSpecs, blockSize, KBYTES(32), driver->get_stream(SEARCH_STREAM)>>>(d_Scores, d_evalues, min_cpsm);
//hcp::gpu::cuda::s4::logWeibullFit<<<numSpecs, blockSize, KBYTES(24), driver->get_stream(SEARCH_STREAM)>>>(d_Scores, d_evalues, min_cpsm);
#endif // TAILFIT
Expand Down Expand Up @@ -777,16 +777,16 @@ __host__ void processResults(double *h_data, float *h_hyp, int *h_cpsms, double
auto TailFit_ptr = static_cast<void (*)(double *, float *, int *, double *, short)>(&TailFit);
// IMPORTANT: make sure at least 32KB+ shared memory is available to the TailFit kernel
cudaFuncSetAttribute(*TailFit_ptr, cudaFuncAttributeMaxDynamicSharedMemorySize, KBYTES(32));
cudaFuncSetAttribute(*TailFit_ptr, cudaFuncAttributeMaxDynamicSharedMemorySize, KBYTES(24));
// the tailfit kernel
hcp::gpu::cuda::s4::TailFit<<<bsize, blockSize, KBYTES(32), driver->get_stream()>>>(d_data, d_hyp, d_cpsms, d_evalues, (short)params.min_cpsm);
hcp::gpu::cuda::s4::TailFit<<<bsize, blockSize, KBYTES(24), driver->get_stream()>>>(d_data, d_hyp, d_cpsms, d_evalues, (short)params.min_cpsm);
#else
// IMPORTANT: make sure at least 32KB+ shared memory is available to the logWeibullfit kernel
//cudaFuncSetAttribute(logWeibullFit, cudaFuncAttributeMaxDynamicSharedMemorySize, KBYTES(32));
//cudaFuncSetAttribute(logWeibullFit, cudaFuncAttributeMaxDynamicSharedMemorySize, KBYTES(24));
// the logWeibullfit kernel
//hcp::gpu::cuda::s4::logWeibullFit<<<numSpecs, blockSize, KBYTES(32), driver->get_stream(SEARCH_STREAM)>>>(d_Scores, d_evalues, min_cpsm);
//hcp::gpu::cuda::s4::logWeibullFit<<<numSpecs, blockSize, KBYTES(24), driver->get_stream(SEARCH_STREAM)>>>(d_Scores, d_evalues, min_cpsm);
#endif // TAILFIT
// D2H
Expand Down
2 changes: 1 addition & 1 deletion source/core/dslim_query.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -951,7 +951,7 @@ status_t DSLIM_QuerySpectrum(Queries<spectype_t> *ss, Index *index, uint_t idxch
double_t h1 = lgfact[bcc] + lgfact[ycc];

/* Fill in the information */
cell.hyperscore = h1 + log10(1 + bycPtr[it].ibc) + log10(1 + bycPtr[it].iyc) - 6;
cell.hyperscore = h1 + log10(1 + bycPtr[it].ibc) + log10(1 + bycPtr[it].iyc) - 4;

/* hyperscore < 0 means either b- or y- ions were not matched */
if (cell.hyperscore > 0)
Expand Down
2 changes: 1 addition & 1 deletion source/core/include/cuda/superstep3/kernel.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
#include "slmerr.h"
#include "dslim.h"

const int SEARCHINSTANCES = 320;
const int SEARCHINSTANCES = 512;

const int SEARCH_STREAM = 0;
const int DATA_STREAM = 1;
Expand Down
4 changes: 2 additions & 2 deletions source/core/include/slm_dsts.h
Original file line number Diff line number Diff line change
Expand Up @@ -291,8 +291,8 @@ struct BYC
{
short_t bc; // b ion count
short_t yc; // y ion count
long_t ibc; // b ion intensities
long_t iyc; // y ion intensities
uint_t ibc; // b ion intensities
uint_t iyc; // y ion intensities
};

struct DSLIM_Matrix
Expand Down
6 changes: 3 additions & 3 deletions source/core/msquery.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -286,7 +286,7 @@ std::array<int, 2> MSQuery::convertAndprepMS2bin(string *filename)
if constexpr (std::is_same<int, spectype_t>::value)
{
mzs.push_back(std::move(std::atof(mz.c_str()) * params.scale));
intns.push_back(std::move(std::atof(intn.c_str()) * 1000));
intns.push_back(std::move(std::atof(intn.c_str()) * YAXISMULTIPLIER));
}
else
{
Expand Down Expand Up @@ -920,7 +920,7 @@ VOID MSQuery::readMS2spectrum()
}

spectrum.mz[speclen] = (uint_t)((double_t)std::atof(mz.c_str()) * params.scale);
spectrum.intn[speclen] = (uint_t)((double_t)std::atof(intn.c_str()) * 1000);
spectrum.intn[speclen] = (uint_t)((double_t)std::atof(intn.c_str()) * YAXISMULTIPLIER);

speclen++;
}
Expand Down Expand Up @@ -1015,7 +1015,7 @@ VOID MSQuery::readMS2spectrum()
}

spectrum.mz[speclen] = (uint_t)((double_t)std::atof(mz.c_str()) * params.scale);
spectrum.intn[speclen] = (uint_t)((double_t)std::atof(intn.c_str()) * 1000);
spectrum.intn[speclen] = (uint_t)((double_t)std::atof(intn.c_str()) * YAXISMULTIPLIER);

speclen++;
}
Expand Down

0 comments on commit 45ccae6

Please sign in to comment.