Skip to content

Commit

Permalink
fix for USE_MPI=ON and USE_GPU=ON
Browse files (browse the repository at this point in the history)
  • Loading branch information
mhaseeb123 committed Jul 15, 2023
1 parent 8c757fb commit 9b3c41c
Show file tree
Hide file tree
Showing 7 changed files with 100 additions and 89 deletions.
5 changes: 1 addition & 4 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -84,11 +84,8 @@ if(OpenMP_FOUND)

endif()

if (USE_MPI AND NOT USE_GPU)
if (USE_MPI)
find_package(MPI)
else()
message(WARNING "USE_MPI and USE_GPU not fully supported together due to filesystem support.")
message(STATUS "=> Setting USE_MPI=OFF")
endif()

if(MPI_FOUND AND USE_MPI)
Expand Down
21 changes: 11 additions & 10 deletions source/apps/argp/argp.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ auto getcurrtimeanddate()

auto getcurrpath()
{
// COMPILER VERSION GCC 9.1.0+ required
// COMPILER VERSION GCC 9.1.0+ required
#if __GNUC__ > 9 || (__GNUC__ == 9 && (__GNUC_MINOR__ >= 1))
// COMPILER VERSION GCC 9.1.0+ required for std::filesystem calls
static string_t currpath = std::filesystem::current_path();
Expand Down Expand Up @@ -97,7 +97,7 @@ auto sanitize_dM(T &dM)
//
// structure to store parsed params
//
struct params_t : public argparse::Args
struct params_t : public argparse::Args
{

//
Expand Down Expand Up @@ -155,7 +155,7 @@ struct params_t : public argparse::Args
// base intensity x1000
int &base_int = kwarg("base,base_int", "base noramlized peak intensity for MS/MS data x1000").set_default(1000);

// MS/MS peak cut off ratio
// MS/MS peak cut off ratio
double &cutoff = kwarg("cutoff_ratio", "cutoff peak ratio wrt base intensity (e.g. 1% = 0.01)").set_default(0.01);

// m/z axis resolution
Expand All @@ -172,14 +172,14 @@ struct params_t : public argparse::Args

// LBE distribution policy

// DistPolicy_t requires magic_enum submodule.
// DistPolicy_t requires magic_enum submodule.
DistPolicy_t &lbe_policy = kwarg("policy", "LBE Distribution policy (cyclic, chunk, zigzag)").set_default(DistPolicy_t::cyclic);

// scratch pad memory in MB
int &bufferMBs = kwarg("buff,spad_mem", "buffer (scratch pad) RAM memory in MB (recommended: 2048MB+)").set_default(2048);

// this should be an optional parameter
std::optional<std::vector<std::string>> &mods
std::optional<std::vector<std::string>> &mods
= kwarg("m,mods", "list of variable post-translational modifications (PTMs)").multi_argument();

// do not keep the full database index on GPU
Expand Down Expand Up @@ -229,7 +229,7 @@ void getParams(gParams &params)

#if !defined(ARGP_ONLY)

// COMPILER VERSION GCC 9.1.0+ required
// COMPILER VERSION GCC 9.1.0+ required
#if __GNUC__ > 9 || (__GNUC__ == 9 && (__GNUC_MINOR__ >= 1))
// COMPILER VERSION GCC 9.1.0+ required for std::filesystem calls
std::filesystem::create_directory(parser.workspace);
Expand All @@ -256,6 +256,7 @@ void getParams(gParams &params)
params.gputhreads = parser.gputhreads;
#else
params.gputhreads = 0;
params.useGPU = false;
#endif // USE_GPU

// Get the min peptide length
Expand All @@ -281,7 +282,7 @@ void getParams(gParams &params)
params.dM = parser.deltaM;
sanitize_dM(params.dM);

// Get the min mass
// Get the min mass
params.min_mass = parser.minprecmass;

// Get the max mass
Expand Down Expand Up @@ -325,7 +326,7 @@ void getParams(gParams &params)
params.vModInfo.num_vars = modslist.size();
params.modconditions = std::to_string(params.vModInfo.vmods_per_pep);

// process the strings: AA:MASS.0:NUM
// process the strings: AA:MASS.0:NUM
for (auto md = 0; md < params.vModInfo.num_vars; md++)
{
// for each mod string
Expand Down Expand Up @@ -430,7 +431,7 @@ void printParser()
/* Get the max fragment charge */
printVar(parser.maxz);

// Get the m/z axis resolution
// Get the m/z axis resolution
printVar(parser.resolution);

// Get the fragment mass tolerance
Expand All @@ -439,7 +440,7 @@ void printParser()
// Get the precursor mass tolerance
printVar(parser.deltaM);

// Get the min mass
// Get the min mass
printVar(parser.minprecmass);

// Get the max mass
Expand Down
8 changes: 4 additions & 4 deletions source/core/cuda/superstep2/kernel.cu
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ const int BATCHSIZE = 20000;

// -------------------------------------------------------------------------------------------- //

namespace hcp
namespace hcp
{

namespace gpu
Expand Down Expand Up @@ -288,10 +288,10 @@ status_t ArraySort(spectype_t *intns, spectype_t *mzs, int *lens, int &idx, int
hcp::gpu::cuda::error_check(hcp::gpu::cuda::H2D(d_lens, lens, count, driver->stream[2]));

// memory for arraynums
hcp::gpu::cuda::error_check(hcp::gpu::cuda::device_allocate_async(d_arraynums, rawsize, driver->stream[2]));
hcp::gpu::cuda::error_check(hcp::gpu::cuda::device_allocate_async(d_arraynums, rawsize, driver->stream[2]));

// memory for indices
hcp::gpu::cuda::error_check(hcp::gpu::cuda::device_allocate_async(d_indices, rawsize, driver->stream[3]));
hcp::gpu::cuda::error_check(hcp::gpu::cuda::device_allocate_async(d_indices, rawsize, driver->stream[3]));

// memory for processed intensities
hcp::gpu::cuda::error_check(hcp::gpu::cuda::device_allocate_async(d_m_intns, QALEN * BATCHSIZE, driver->stream[3]));
Expand Down Expand Up @@ -500,7 +500,7 @@ std::array<int, 2> readAndPreprocess(string_t &filename)
// flush to the binary file
MSQuery::flushBinaryFile(&filename, m_mzs, m_intns, rtimes, prec_mz, z, lens, count);
count = 0;
m_idx = 0;
largestspec_loc = 0;
Expand Down
42 changes: 23 additions & 19 deletions source/core/cuda/superstep3/kernel.cu
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ extern gParams params;

// -------------------------------------------------------------------------------------------- //

namespace hcp
namespace hcp
{

namespace gpu
Expand Down Expand Up @@ -95,8 +95,8 @@ namespace s3

// -------------------------------------------------------------------------------------------- //

__global__ void SpSpGEMM(spectype_t *dQ_moz, spectype_t *dQ_intensity, uint_t *dQ_idx, int *dQ_minlimits, int *dQ_maxlimits,
uint_t* d_bA, uint_t *d_iA, int iter, BYC *bycP, int maxchunk, double *d_survival, int *d_cpsms,
__global__ void SpSpGEMM(spectype_t *dQ_moz, spectype_t *dQ_intensity, uint_t *dQ_idx, int *dQ_minlimits, int *dQ_maxlimits,
uint_t* d_bA, uint_t *d_iA, int iter, BYC *bycP, int maxchunk, double *d_survival, int *d_cpsms,
dhCell *d_topscore, int dF, int speclen, int maxmass, int scale, short min_shp, int ixx);

// database search kernel host wrapper
Expand Down Expand Up @@ -193,7 +193,7 @@ void dQueries<T>::H2D(Queries<T> *rhs)
{
auto driver = hcp::gpu::cuda::driver::get_instance();
int chunksize = rhs->numSpecs;

this->numSpecs = rhs->numSpecs;
this->numPeaks = rhs->numPeaks;

Expand Down Expand Up @@ -254,7 +254,7 @@ __host__ status_t initialize()
static auto h_lgfact = hcp::utils::lgfact<hcp::utils::maxshp>();

// copy to CUDA constant arrays
hcp::gpu::cuda::error_check(cudaMemcpyToSymbol(d_lgFact, &h_lgfact.val, sizeof(double_t) * hcp::utils::maxshp));
hcp::gpu::cuda::error_check(cudaMemcpyToSymbol(d_lgFact, &h_lgfact.val, sizeof(double_t) * hcp::utils::maxshp));

return SLM_SUCCESS;

Expand Down Expand Up @@ -317,7 +317,7 @@ std::pair<BYC *, int>& getBYC(int chunksize)
void freeBYC()
{
auto driver = hcp::gpu::cuda::driver::get_instance();

auto pBYC = getBYC();

auto d_BYC = std::get<0>(pBYC);
Expand Down Expand Up @@ -354,7 +354,7 @@ dScores *& getScorecard()
void freeScorecard()
{
auto driver = hcp::gpu::cuda::driver::get_instance();

auto&& d_Scores = getScorecard();

if (d_Scores)
Expand All @@ -369,7 +369,7 @@ void freeScorecard()
__host__ dQueries<spectype_t> *& getdQueries()
{
static thread_local dQueries<spectype_t> *dqueries = nullptr;

if (!dqueries)
dqueries = new dQueries<spectype_t>();

Expand Down Expand Up @@ -405,7 +405,7 @@ __host__ status_t search(Queries<spectype_t> *gWorkPtr, Index *index, uint_t idx
static thread_local auto driver = hcp::gpu::cuda::driver::get_instance();

dIndex *d_Index = nullptr;

if (params.gpuindex)
d_Index = hcp::gpu::cuda::s1::getdIndex(index);

Expand Down Expand Up @@ -460,7 +460,7 @@ __host__ status_t search(Queries<spectype_t> *gWorkPtr, Index *index, uint_t idx
hcp::gpu::cuda::error_check(hcp::gpu::cuda::H2D(d_mat->iA, iAPtr, iAsize, driver->stream[DATA_STREAM]));
}

// copy the At rows to device
// copy the At rows to device
d_mat->bA = hcp::gpu::cuda::s1::getbA();
uint_t bAsize = ((uint_t)(params.max_mass * params.scale)) + 1;

Expand All @@ -480,15 +480,19 @@ __host__ status_t search(Queries<spectype_t> *gWorkPtr, Index *index, uint_t idx
}
}

#ifdef USE_MPI

if (params.nodes > 1)
{
#if defined (USE_MPI)
static bool once = [](){ std::cout << "WARNING: Experimental support for GPU+MPI only. Problems expected." << std::endl; return true;}();
status = hcp::gpu::cuda::s4::getIResults(index, gWorkPtr, gpucurrSpecID, CandidatePSMS);
else
#else
std::cerr << "ABORT: params.nodes > 1 without MPI. Build with -DUSE_MPI=ON" << std::endl;
exit(-1);
#endif // USE_MPI
}
else
// combine the results
status = hcp::gpu::cuda::s4::processResults(index, gWorkPtr, gpucurrSpecID);
#endif // USE_MPI

hcp::gpu::cuda::s3::reset_dScores();

Expand Down Expand Up @@ -650,8 +654,8 @@ __global__ void resetdScores(double *survival, int *cpsms, dhCell *topscore)
// -------------------------------------------------------------------------------------------- //
__global__ void SpSpGEMM(spectype_t *dQ_moz, spectype_t *dQ_intensity, uint_t *dQ_idx, int *dQ_minlimits,
int *dQ_maxlimits, uint_t* d_bA, uint_t *d_iA, int iter, BYC *bycP,
int maxchunk, double *d_survival, int *d_cpsms, dhCell *d_topscore, int dF,
int *dQ_maxlimits, uint_t* d_bA, uint_t *d_iA, int iter, BYC *bycP,
int maxchunk, double *d_survival, int *d_cpsms, dhCell *d_topscore, int dF,
int speclen, int maxmass, int scale, short min_shp, int ixx)
{
BYC *bycPtr = &bycP[blockIdx.x * maxchunk];
Expand Down Expand Up @@ -701,7 +705,7 @@ __global__ void SpSpGEMM(spectype_t *dQ_moz, spectype_t *dQ_intensity, uint_t *d
auto data = d_iA + d_bA[bin];
int n = d_bA[bin + 1] - d_bA[bin];
/* If no ions in the bin */
if (n < 1)
continue;
Expand Down Expand Up @@ -750,7 +754,7 @@ __global__ void SpSpGEMM(spectype_t *dQ_moz, spectype_t *dQ_intensity, uint_t *d
__syncthreads();
//
// reduce the BYC elements to avoid
// reduce the BYC elements to avoid
// race conditions and locking
//
Expand Down Expand Up @@ -853,7 +857,7 @@ __global__ void SpSpGEMM(spectype_t *dQ_moz, spectype_t *dQ_intensity, uint_t *d
ushort_t shpk = bcc + ycc;
// filter by the min shared peaks
if (shpk >= min_shp)
if (shpk >= min_shp)
{
// get the precomputed log(factorial(x))
Expand Down
12 changes: 9 additions & 3 deletions source/core/dslim_scproc.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -109,13 +109,19 @@ status_t DSLIM_DistScoreManager()
#endif // USE_TIMEMORY

// use GPU if available
#if defined (USE_GPU)
if (params.useGPU)
{
#if defined (USE_GPU)
static bool once = [](){ std::cout << "WARNING: Experimental support for GPU+MPI only. Problems expected." << std::endl; return true;}();
status = ScoreHandle->GPUCombineResults();
else
#else
status = ScoreHandle->CombineResults();
// Should be unreachable: without USE_GPU, getParams() forces params.useGPU to false, so reaching this point means params was altered afterwards (e.g. tampering).
std::cerr << "ABORT: params.useGPU=true not available. Build with -DUSE_GPU=ON." << std::endl;
exit(-1);
#endif // USE_GPU
}
else
status = ScoreHandle->CombineResults();

#if defined (USE_TIMEMORY)
merge_instr.stop();
Expand Down
10 changes: 5 additions & 5 deletions source/core/ms2prep.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ status_t synchronize()

//
// FUNCTION: get_instance
//
//
MSQuery **& get_instance()
{
static MSQuery** ptrs = new MSQuery*[queryfiles.size()];
Expand All @@ -83,7 +83,7 @@ MSQuery **& get_instance()

//
// FUNCTION: initialize
//
//
status_t initialize(lwqueue<MSQuery *>** qfPtrs, int_t& nBatches, int_t& dssize)
{
status_t status = SLM_SUCCESS;
Expand Down Expand Up @@ -126,7 +126,7 @@ status_t initialize(lwqueue<MSQuery *>** qfPtrs, int_t& nBatches, int_t& dssize)
ms2local[0] = params.myid;

// rest of the files in cyclic order
std::generate(std::begin(ms2local) + 1, std::end(ms2local),
std::generate(std::begin(ms2local) + 1, std::end(ms2local),
[n=params.myid] () mutable { return n += params.nodes; });

#if defined(USE_GPU)
Expand All @@ -143,7 +143,7 @@ status_t initialize(lwqueue<MSQuery *>** qfPtrs, int_t& nBatches, int_t& dssize)

//
// lambda function to initialize the MSQuery instances
//
//
auto workerthread = [&](bool gpu)
{
auto loc_fid = 0;
Expand Down Expand Up @@ -175,7 +175,7 @@ status_t initialize(lwqueue<MSQuery *>** qfPtrs, int_t& nBatches, int_t& dssize)
hcp::gpu::cuda::s2::preprocess(ptrs[loc_fid], queryfiles[loc_fid], loc_fid);
else
ptrs[loc_fid]->initialize(&queryfiles[loc_fid], loc_fid);

// archive the index if using MPI
#ifdef USE_MPI
ptrs[loc_fid]->archive(loc_fid);
Expand Down
Loading

0 comments on commit 9b3c41c

Please sign in to comment.