diff --git a/src/cuttplan.cpp b/src/cuttplan.cpp
index 0bedd96..efd4e8f 100644
--- a/src/cuttplan.cpp
+++ b/src/cuttplan.cpp
@@ -178,6 +178,8 @@ TensorSplit::TensorSplit() {
   volMkBar = 0;
   sizeMbar = 0;
   volMbar = 0;
+  volMmkInCont = 0;
+  volMmkOutCont = 0;
   numSplit = 1;
   splitRank = -1;
   splitDim = 0;
@@ -187,8 +189,9 @@ TensorSplit::TensorSplit() {
 void TensorSplit::print() {
   printf("sizeMm %d sizeMk %d sizeMmk %d sizeMbar %d sizeMkBar %d\n",
     sizeMm, sizeMk, sizeMmk, sizeMbar, sizeMkBar);
-  printf("volMm %d volMk %d volMmk %d volMbar %d volMkBar %d volMmBar %d\n",
-    volMm, volMk, volMmk, volMbar, volMkBar, volMmBar);
+  printf("volMm %d volMk %d volMmk %d volMbar %d volMkBar %d\n",
+    volMm, volMk, volMmk, volMbar, volMkBar);
+  printf("volMmkInCont %d volMmkOutCont %d\n", volMmkInCont, volMmkOutCont);
   if (method == GeneralSplit) printf("numSplit %d splitRank %d\n", numSplit, splitRank);
 }
 
@@ -229,42 +232,100 @@ void TensorSplit::update(const int sizeMm_in, const int sizeMk_in, const int ran
     vol *= dim[i];
   }
 
+  sizeMbar = rank - sizeMmk;
+  volMbar = vol/volMmk;
+
   if (splitRank >= 0) {
     splitDim = dim[splitRank];
     volMmkUnsplit = volMmk / splitDim;
   }
 
-  volMmBar = volMmk / volMk;
-  sizeMbar = rank - sizeMmk;
-  volMbar = vol/volMmk;
+  std::vector<bool> isMmk(rank, false);
+  for (int i=0;i < rank;i++) {
+    if (i < sizeMm) {
+      isMmk[i] = true;
+    }
+    if (i < sizeMk) {
+      int pi = permutation[i];
+      isMmk[pi] = true;
+    }
+  }
+
+  volMmkInCont = 1;
+  for (int i=0;i < rank;i++) {
+    if (!isMmk[i]) break;
+    if (i == splitRank) {
+      volMmkInCont *= splitDim / numSplit + (splitDim % numSplit > 0);
+      break;
+    } else {
+      volMmkInCont *= dim[i];
+    }
+  }
+
+  volMmkOutCont = 1;
+  for (int i=0;i < rank;i++) {
+    int pi = permutation[i];
+    if (!isMmk[pi]) break;
+    if (pi == splitRank) {
+      volMmkOutCont *= splitDim / numSplit + (splitDim % numSplit > 0);
+      break;
+    } else {
+      volMmkOutCont *= dim[pi];
+    }
+  }
+
 }
 
 bool operator==(const TensorSplit& lhs, const TensorSplit& rhs) {
   if (lhs.method != rhs.method) return false;
-  if (lhs.method == General || lhs.method == GeneralSplit) {
-    return
-    (lhs.sizeMmk == rhs.sizeMmk) &&
-    (lhs.volMmk == rhs.volMmk) &&
-    (lhs.sizeMbar == rhs.sizeMbar) &&
-    (lhs.volMbar == rhs.volMbar) &&
-    // (lhs.numActiveBlock == rhs.numActiveBlock) &&
-    (lhs.numSplit == rhs.numSplit);
-  } else {
+  if (lhs.method == Trivial) return true;
+
+  if (lhs.method == TiledSingleRank) {
     return
-    (lhs.sizeMm == rhs.sizeMm) &&
     (lhs.volMm == rhs.volMm) &&
-    (lhs.sizeMk == rhs.sizeMk) &&
     (lhs.volMk == rhs.volMk) &&
-    (lhs.sizeMmk == rhs.sizeMmk) &&
-    (lhs.volMmk == rhs.volMmk) &&
-    (lhs.sizeMkBar == rhs.sizeMkBar) &&
+    (lhs.volMbar == rhs.volMbar);
+  }
+
+  if (lhs.method == TiledLeadVolSame) {
+    return
+    (lhs.volMm == rhs.volMm) &&
     (lhs.volMkBar == rhs.volMkBar) &&
-    (lhs.sizeMbar == rhs.sizeMbar) &&
-    (lhs.volMbar == rhs.volMbar) &&
-    // (lhs.numActiveBlock == rhs.numActiveBlock) &&
-    (lhs.numSplit == rhs.numSplit);
+    (lhs.volMbar == rhs.volMbar);
+  }
+
+  if (lhs.method == General || lhs.method == GeneralSplit) {
+    return
+    (lhs.volMmkInCont == rhs.volMmkInCont) &&
+    (lhs.volMmkOutCont == rhs.volMmkOutCont) &&
+    (lhs.volMmk == rhs.volMmk) &&
+    (lhs.volMbar == rhs.volMbar);
   }
+
+  // if (lhs.method == General || lhs.method == GeneralSplit) {
+  //   return
+  //   (lhs.sizeMmk == rhs.sizeMmk) &&
+  //   (lhs.volMmk == rhs.volMmk) &&
+  //   (lhs.sizeMbar == rhs.sizeMbar) &&
+  //   (lhs.volMbar == rhs.volMbar) &&
+  //   // (lhs.numActiveBlock == rhs.numActiveBlock) &&
+  //   (lhs.numSplit == rhs.numSplit);
+  // } else {
+  //   return
+  //   (lhs.sizeMm == rhs.sizeMm) &&
+  //   (lhs.volMm == rhs.volMm) &&
+  //   (lhs.sizeMk == rhs.sizeMk) &&
+  //   (lhs.volMk == rhs.volMk) &&
+  //   (lhs.sizeMmk == rhs.sizeMmk) &&
+  //   (lhs.volMmk == rhs.volMmk) &&
+  //   (lhs.sizeMkBar == rhs.sizeMkBar) &&
+  //   (lhs.volMkBar == rhs.volMkBar) &&
+  //   (lhs.sizeMbar == rhs.sizeMbar) &&
+  //   (lhs.volMbar == rhs.volMbar) &&
+  //   // (lhs.numActiveBlock == rhs.numActiveBlock) &&
+  //   (lhs.numSplit == rhs.numSplit);
+  // }
 }
 
 //
@@ -398,6 +459,16 @@ size_t TensorSplit::shmemAlloc(int sizeofType) const {
 
   return vol;
 }
 
+//
+// Returns true if the plan with TensorSplit ts already exists
+//
+bool planExists(TensorSplit& ts, std::list<cuttPlan_t>& plans) {
+  for (auto it=plans.begin();it != plans.end();it++) {
+    if (it->tensorSplit == ts) return true;
+  }
+  return false;
+}
+
 bool createTrivialPlans(const int rank, const int* dim, const int* permutation, const size_t sizeofType,
   cudaDeviceProp& prop, std::list<cuttPlan_t>& plans) {
@@ -407,7 +478,7 @@ bool createTrivialPlans(const int rank, const int* dim, const int* permutation,
     ts.update(1, 1, rank, dim, permutation);
     LaunchConfig lc;
     int numActiveBlock = cuttKernelLaunchConfiguration(sizeofType, ts, prop, lc);
-    if (numActiveBlock > 0) {
+    if (numActiveBlock > 0 && !planExists(ts, plans)) {
      cuttPlan_t plan;
      if (!plan.setup(rank, dim, permutation, sizeofType, prop, ts)) return false;
      plans.push_back(plan);
@@ -426,7 +497,7 @@ bool createTiledSingleRankPlans(const int rank, const int* dim, const int* permu
     ts.update(1, 1, rank, dim, permutation);
     LaunchConfig lc;
     int numActiveBlock = cuttKernelLaunchConfiguration(sizeofType, ts, prop, lc);
-    if (numActiveBlock > 0) {
+    if (numActiveBlock > 0 && !planExists(ts, plans)) {
      cuttPlan_t plan;
      if (!plan.setup(rank, dim, permutation, sizeofType, prop, ts)) return false;
      plans.push_back(plan);
@@ -454,7 +525,7 @@ bool createTiledLeadVolSamePlans(const int rank, const int* dim, const int* perm
     }
     LaunchConfig lc;
     int numActiveBlock = cuttKernelLaunchConfiguration(sizeofType, ts, prop, lc);
-    if (numActiveBlock > 0) {
+    if (numActiveBlock > 0 && !planExists(ts, plans)) {
      cuttPlan_t plan;
      if (!plan.setup(rank, dim, permutation, sizeofType, prop, ts)) return false;
      plans.push_back(plan);
@@ -467,8 +538,8 @@ bool createTiledLeadVolSamePlans(const int rank, const int* dim, const int* perm
 bool createGeneralPlans(const int rank, const int* dim, const int* permutation, const size_t sizeofType,
   cudaDeviceProp& prop, std::list<cuttPlan_t>& plans) {
 
-  // Stores TensorSplits that have already been added (in order to avoid duplicates)
-  std::vector<TensorSplit> tsAdded;
+  // // Stores TensorSplits that have already been added (in order to avoid duplicates)
+  // std::vector<TensorSplit> tsAdded;
 
   LaunchConfig lc;
   for (int numMm=1;numMm < rank;numMm++) {
@@ -476,45 +547,88 @@ bool createGeneralPlans(const int rank, const int* dim, const int* permutation,
       TensorSplit ts;
       ts.method = General;
       ts.update(numMm, numMk, rank, dim, permutation);
+      // Amount of shared memory required
+      int shmemsize = ts.shmemAlloc(sizeofType);
+      if (shmemsize > prop.sharedMemPerBlock) {
+        // Does not fit into shared memory, need to split
+        ts.method = GeneralSplit;
+        // Minimum size of split dim allowed
+        const int splitDimMin = 4;
+        // Split the largest dimension
+        int maxDim = 0;
+        for (int i=0;i < ts.sizeMm;i++) {
+          if (dim[i] > maxDim) {
+            maxDim = dim[i];
+            ts.splitRank = i;
+          }
+        }
+        for (int i=0;i < ts.sizeMk;i++) {
+          int pi = permutation[i];
+          if (dim[pi] > maxDim) {
+            maxDim = dim[pi];
+            ts.splitRank = pi;
+          }
+        }
+        //
+        ts.update(numMm, numMk, rank, dim, permutation);
+        int minNumSplit = (ts.splitDim*ts.volMmkUnsplit*sizeofType - 1)/prop.sharedMemPerBlock + 1;
+        int maxNumSplit = std::max(minNumSplit, std::min(ts.splitDim/splitDimMin, minNumSplit + 60));
+        int bestVal = 0;
+        int bestNumSplit = 0;
+        for (ts.numSplit=minNumSplit;ts.numSplit <= maxNumSplit;ts.numSplit++) {
+          int numActiveBlock = cuttKernelLaunchConfiguration(sizeofType, ts, prop, lc);
+          // int val = lc.numthread.x*numActiveBlock;
+          int val = ts.volMmkUsed()*numActiveBlock;
+          // printf("%d volMmk %d nAB %d val %d nthread %d nreg %d\n",
+          //   ts.numSplit, ts.volMmkUsed(), numActiveBlock, val,
+          //   lc.numthread.x, lc.numRegStorage);
+          if (bestVal < val) {
+            bestVal = val;
+            bestNumSplit = ts.numSplit;
+          }
+        }
+        ts.numSplit = bestNumSplit;
+        if (ts.numSplit == 0) break;
+      }
       int numActiveBlock = cuttKernelLaunchConfiguration(sizeofType, ts, prop, lc);
       // If we can't fit to device, break out from inner loop
       if (numActiveBlock == 0) break;
-      // Do not add multiple copies of the same plan
-      bool multiple = false;
-      for (int i=0;i < tsAdded.size();i++) {
-        if (tsAdded[i] == ts) {
-          multiple = true;
-          break;
-        }
+      // // Do not add multiple copies of the same plan
+      // bool multiple = false;
+      // for (int i=0;i < tsAdded.size();i++) {
+      //   if (tsAdded[i] == ts) {
+      //     multiple = true;
+      //     break;
+      //   }
+      // }
+      // if (multiple) continue;
+      // tsAdded.push_back(ts);
+      if (!planExists(ts, plans)) {
+        cuttPlan_t plan;
+        if (!plan.setup(rank, dim, permutation, sizeofType, prop, ts)) return false;
+        plans.push_back(plan);
       }
-      if (multiple) continue;
-      tsAdded.push_back(ts);
-      cuttPlan_t plan;
-      if (!plan.setup(rank, dim, permutation, sizeofType, prop, ts)) return false;
-      plans.push_back(plan);
     }
   }
   return true;
 }
 
+/*
 bool createGeneralSplitPlans(const int rank, const int* dim, const int* permutation, const size_t sizeofType,
   cudaDeviceProp& prop, std::list<cuttPlan_t>& plans) {
 
   // Minimum size of split dim allowed
   const int splitDimMin = 4;
 
-  // Stores TensorSplits that have already been added (in order to avoid duplicates)
-  std::vector<TensorSplit> tsAdded;
+  // // Stores TensorSplits that have already been added (in order to avoid duplicates)
+  // std::vector<TensorSplit> tsAdded;
 
   LaunchConfig lc;
   for (int numMm=1;numMm < rank;numMm++) {
     for (int numMk=1;numMk < rank;numMk++) {
-      // if (numMm != 1 || numMk != 3) continue;
       TensorSplit ts;
       ts.method = GeneralSplit;
-      ts.splitRank = 0;
-      ts.numSplit = 1;
       ts.update(numMm, numMk, rank, dim, permutation);
       // Amount of shared memory required
       int shmemsize = ts.shmemAlloc(sizeofType);
@@ -563,24 +677,29 @@ bool createGeneralSplitPlans(const int rank, const int* dim, const int* permutat
       int numActiveBlock = cuttKernelLaunchConfiguration(sizeofType, ts, prop, lc);
       // If we can't fit to device, break out from inner loop
       if (numActiveBlock == 0) break;
-      // Do not add multiple copies of the same plan
-      bool multiple = false;
-      for (int i=0;i < tsAdded.size();i++) {
-        if (tsAdded[i] == ts) {
-          multiple = true;
-          break;
-        }
+      // // Do not add multiple copies of the same plan
+      // bool multiple = false;
+      // for (int i=0;i < tsAdded.size();i++) {
+      //   if (tsAdded[i] == ts) {
+      //     multiple = true;
+      //     break;
+      //   }
+      // }
+      // if (multiple) continue;
+      // tsAdded.push_back(ts);
+      if (!planExists(ts, plans)) {
+        printf("GENERAL SPLIT\n");
+        exit(1);
+        cuttPlan_t plan;
+        if (!plan.setup(rank, dim, permutation, sizeofType, prop, ts)) return false;
+        plans.push_back(plan);
       }
-      if (multiple) continue;
-      tsAdded.push_back(ts);
-      cuttPlan_t plan;
-      if (!plan.setup(rank, dim, permutation, sizeofType, prop, ts)) return false;
-      plans.push_back(plan);
     }
   }
   return true;
 }
+*/
 
 //
 // Create all possible plans
 //
@@ -592,13 +711,12 @@ bool createPlans(const int rank, const int* dim, const int* permutation, const s
   if (!createTiledLeadVolSamePlans(rank, dim, permutation, sizeofType, prop, plans)) return false;
   if (!createTiledSingleRankPlans(rank, dim, permutation, sizeofType, prop, plans)) return false;
   if (!createGeneralPlans(rank, dim, permutation, sizeofType, prop, plans)) return false;
-  if (!createGeneralSplitPlans(rank, dim, permutation, sizeofType, prop, plans)) return false;
+  // if (!createGeneralSplitPlans(rank, dim, permutation, sizeofType, prop, plans)) return false;
   return true;
 }
 
 bool operator>(const cuttPlan_t& lhs, const cuttPlan_t& rhs) {
 
-  // return (lhs.numMemAccess < rhs.numMemAccess);
   const TensorSplit& lts = lhs.tensorSplit;
   const TensorSplit& rts = rhs.tensorSplit;
diff --git a/src/cuttplan.h b/src/cuttplan.h
index 7fdf34e..9dbb383 100644
--- a/src/cuttplan.h
+++ b/src/cuttplan.h
@@ -61,13 +61,17 @@ class TensorSplit {
   int sizeMkBar;
   int volMkBar;
 
-  int volMmBar;
-
   // Remaining volume
   int sizeMbar;
   int volMbar;
 
-  // Number of splits, for GeneralSplitInRank and GeneralSplitOutRank methods
+  // For General and GeneralSplit methods:
+  // Amount of contiguous volume
+  int volMmkInCont;
+  int volMmkOutCont;
+
+  // For GeneralSplit method:
+  // Number of splits
   int numSplit;
 
   // Rank that is split
@@ -130,7 +134,7 @@ class cuttPlan_t {
 
   TensorSplit tensorSplit;
 
-  // Number of active thread blocks, for General method
+  // Number of active thread blocks
   int numActiveBlock;
 
   int cuDimMk;
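Reviewer note, not part of the patch: a minimal standalone sketch of the contiguous-volume computation that the patch adds to TensorSplit::update() and that the new operator== (and hence planExists()) keys on for the General and GeneralSplit methods. The rank, dimensions, and permutation below are made-up example values, and only the unsplit case (splitRank = -1) is shown; for GeneralSplit the patch additionally replaces the split rank's dimension with its per-split size.

// Standalone illustration of volMmkInCont / volMmkOutCont (unsplit case only).
// Example values are hypothetical and not taken from cuTT.
#include <cstdio>
#include <vector>

int main() {
  const int rank = 4;
  const int dim[rank]         = {8, 4, 16, 2};  // hypothetical input dimensions
  const int permutation[rank] = {2, 0, 3, 1};   // permutation[i]: input rank read at output position i
  const int sizeMm = 1;  // first sizeMm input ranks form Mm
  const int sizeMk = 2;  // first sizeMk output ranks form Mk

  // Mark every rank that belongs to Mmk (the union of Mm and Mk), as in TensorSplit::update().
  std::vector<bool> isMmk(rank, false);
  for (int i=0;i < rank;i++) {
    if (i < sizeMm) isMmk[i] = true;
    if (i < sizeMk) isMmk[permutation[i]] = true;
  }

  // Contiguous Mmk volume on the input side: multiply dimensions in input order
  // until the first rank that is not part of Mmk.
  int volMmkInCont = 1;
  for (int i=0;i < rank;i++) {
    if (!isMmk[i]) break;
    volMmkInCont *= dim[i];
  }

  // Contiguous Mmk volume on the output side: same walk, but in output (permuted) order.
  int volMmkOutCont = 1;
  for (int i=0;i < rank;i++) {
    int pi = permutation[i];
    if (!isMmk[pi]) break;
    volMmkOutCont *= dim[pi];
  }

  printf("volMmkInCont %d volMmkOutCont %d\n", volMmkInCont, volMmkOutCont);  // prints 8 and 128
  return 0;
}

Two General/GeneralSplit plans that agree on volMmk, volMbar, and these two contiguous volumes are treated as duplicates by the new operator==, which is what lets planExists() keep only one copy of each equivalent plan in the list.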