Skip to content

Commit

Permalink
Faster CPU algorithms. Easier vector library. Cache for frequent CUDA API calls.

Browse files Browse the repository at this point in the history
  • Loading branch information
ap-hynninen committed Dec 2, 2016
1 parent 2ac2096 commit acb7958
Show file tree
Hide file tree
Showing 13 changed files with 1,027 additions and 1,129 deletions.
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ CUDAROOT = $(subst /bin/,,$(dir $(shell which $(CUDAC))))

CFLAGS = -I${CUDAROOT}/include -std=c++11 $(DEFS) $(OPTLEV) -march=native

CUDA_CFLAGS = -I${CUDAROOT}/include $(OPTLEV) -Xptxas -dlcm=ca -lineinfo $(GENCODE_FLAGS) --resource-usage -Xcompiler "$(CUDA_CCFLAGS)" $(DEFS) -D_FORCE_INLINES
CUDA_CFLAGS = -I${CUDAROOT}/include -std=c++11 $(OPTLEV) -Xptxas -dlcm=ca -lineinfo $(GENCODE_FLAGS) --resource-usage -Xcompiler "$(CUDA_CCFLAGS)" $(DEFS) -D_FORCE_INLINES

ifeq ($(OS),osx)
CUDA_LFLAGS = -L$(CUDAROOT)/lib
Expand Down
98 changes: 98 additions & 0 deletions src/LRUCache.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
/******************************************************************************
MIT License
Copyright (c) 2016 Antti-Pekka Hynninen
Copyright (c) 2016 NVIDIA
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*******************************************************************************/

#include <utility>
#include <list>
#include <unordered_map>

using namespace std;

//
// Simple LRU cache implementation.
//
// Stores up to `capacity` key/value pairs. get() returns `null_value` when a
// key is absent. Both get() and a set() on an existing key promote the key to
// most-recently-used; when the cache is full, set() evicts the least-recently
// used entry. Not thread safe.
//
// NOTE(review): this header has no include guard / #pragma once — confirm it
// is included from only one place per translation unit.
//
template <typename key_type, typename value_type>
class LRUCache {
private:

  // Hash-table payload: the cached value plus the key's position in the
  // recency list, so the entry can be promoted/erased in O(1).
  struct ValueIterator {
    value_type value;
    typename std::list<key_type>::iterator it;
  };

  // Maximum number of entries. capacity == 0 means nothing is ever stored.
  const size_t capacity;

  // Value returned by get() when the key is not found
  const value_type null_value;

  // Keys in recency order: most-recently-used at the front, oldest at the back
  std::list<key_type> keys;

  // Cache (hash table): key -> {value, iterator into `keys`}
  std::unordered_map<key_type, ValueIterator> cache;

public:

  LRUCache(const size_t capacity, const value_type null_value) : capacity(capacity), null_value(null_value) {}

  // Returns the value stored for `key`, or null_value if the key is absent.
  // A hit promotes the key to most-recently-used.
  value_type get(key_type key) {
    auto it = cache.find(key);
    if (it == cache.end()) return null_value;
    touch(it);
    return it->second.value;
  }

  // Inserts or updates `key` with `value` and promotes it to
  // most-recently-used. Evicts the least-recently-used entry when full.
  void set(key_type key, value_type value) {
    auto it = cache.find(key);
    if (it != cache.end()) {
      // Key already present: update the value and promote it
      it->second.value = value;
      touch(it);
      return;
    }
    // Fix: the original evicted unconditionally when size == capacity, which
    // calls keys.back() on an empty list (undefined behavior) for capacity 0.
    if (capacity == 0) return;
    if (cache.size() >= capacity) {
      // Evict the least-recently-used entry (back of the recency list).
      // Erase by key directly — no need for a second find().
      cache.erase(keys.back());
      keys.pop_back();
    }
    keys.push_front(key);
    cache.insert({key, ValueIterator{value, keys.begin()}});
  }

private:

  // Move the entry's key to the front of the recency list in O(1).
  // std::list::splice relinks the node in place, so the stored iterator
  // remains valid and now points to the front — no erase/re-insert needed.
  void touch(typename std::unordered_map<key_type, ValueIterator>::iterator it) {
    keys.splice(keys.begin(), keys, it->second.it);
  }
};
57 changes: 33 additions & 24 deletions src/cutt.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,12 +25,12 @@ SOFTWARE.
#include <cuda.h>
#include <list>
#include <unordered_map>
#include <unordered_set>
#include "CudaUtils.h"
#include "cuttplan.h"
#include "cuttkernel.h"
#include "cuttTimer.h"
#include "cutt.h"
// #include <chrono>

// Hash table to store the plans
static std::unordered_map< cuttHandle, cuttPlan_t* > planStorage;
Expand All @@ -39,7 +39,22 @@ static std::unordered_map< cuttHandle, cuttPlan_t* > planStorage;
static cuttHandle curHandle = 0;

// Table of devices that have been initialized
static std::unordered_set<int> devicesReady;
static std::unordered_map<int, cudaDeviceProp> deviceProps;

// Returns the properties of the current CUDA device via `prop`, preparing the
// device on first use.
// deviceID [out]: set to the current device (cudaGetDevice)
// prop     [out]: properties of that device
// On the first call for a device the properties are queried and cached in
// deviceProps; cuttKernelSetSharedMemConfig() is called once at that point
// (presumably to set the shared-memory configuration — see its definition).
// Subsequent calls are served from the deviceProps cache.
void getDeviceProp(int& deviceID, cudaDeviceProp &prop) {
cudaCheck(cudaGetDevice(&deviceID));
auto it = deviceProps.find(deviceID);
if (it == deviceProps.end()) {
// First time this device is seen: query and cache its properties,
// and do the one-time shared-memory setup.
cudaCheck(cudaGetDeviceProperties(&prop, deviceID));
cuttKernelSetSharedMemConfig();
deviceProps.insert({deviceID, prop});
} else {
// Cache hit: copy the stored properties out.
prop = it->second;
}
}

cuttResult cuttPlanCheckInput(int rank, int* dim, int* permutation, size_t sizeofType) {
// Check sizeofType
Expand Down Expand Up @@ -84,11 +99,10 @@ cuttResult cuttPlan(cuttHandle* handle, int rank, int* dim, int* permutation, si
// Check that the current handle is available (it better be!)
if (planStorage.count(*handle) != 0) return CUTT_INTERNAL_ERROR;

// Get all possible ways tensor can be transposed
// Prepare device
int deviceID;
cudaCheck(cudaGetDevice(&deviceID));
cudaDeviceProp prop;
cudaCheck(cudaGetDeviceProperties(&prop, deviceID));
getDeviceProp(deviceID, prop);

// Reduce ranks
std::vector<int> redDim;
Expand All @@ -113,8 +127,16 @@ cuttResult cuttPlan(cuttHandle* handle, int rank, int* dim, int* permutation, si
gpuRangeStart("createPlans");
#endif

if (!createPlans(rank, dim, permutation, redDim.size(), redDim.data(), redPermutation.data(),
sizeofType, prop, plans)) return CUTT_INTERNAL_ERROR;
// std::chrono::high_resolution_clock::time_point plan_start;
// plan_start = std::chrono::high_resolution_clock::now();

if (!cuttPlan_t::createPlans(rank, dim, permutation, redDim.size(), redDim.data(), redPermutation.data(),
sizeofType, deviceID, prop, plans)) return CUTT_INTERNAL_ERROR;

// std::chrono::high_resolution_clock::time_point plan_end;
// plan_end = std::chrono::high_resolution_clock::now();
// double plan_duration = std::chrono::duration_cast< std::chrono::duration<double> >(plan_end - plan_start).count();
// printf("createPlans took %lf ms\n", plan_duration*1000.0);

#ifdef ENABLE_NVTOOLS
gpuRangeStop();
Expand Down Expand Up @@ -177,17 +199,10 @@ cuttResult cuttPlanMeasure(cuttHandle* handle, int rank, int* dim, int* permutat
// Check that the current handle is available (it better be!)
if (planStorage.count(*handle) != 0) return CUTT_INTERNAL_ERROR;

// Get all possible ways tensor can be transposed
// Prepare device
int deviceID;
cudaCheck(cudaGetDevice(&deviceID));
cudaDeviceProp prop;
cudaCheck(cudaGetDeviceProperties(&prop, deviceID));

// Set shared memory configuration if necessary
if (!devicesReady.count(deviceID)) {
cuttKernelSetSharedMemConfig();
devicesReady.insert(deviceID);
}
getDeviceProp(deviceID, prop);

// Reduce ranks
std::vector<int> redDim;
Expand All @@ -204,8 +219,8 @@ cuttResult cuttPlanMeasure(cuttHandle* handle, int rank, int* dim, int* permutat
// Create plans from non-reduced ranks
// if (!createPlans(rank, dim, permutation, sizeofType, prop, plans)) return CUTT_INTERNAL_ERROR;
#else
if (!createPlans(rank, dim, permutation, redDim.size(), redDim.data(), redPermutation.data(),
sizeofType, prop, plans)) return CUTT_INTERNAL_ERROR;
if (!cuttPlan_t::createPlans(rank, dim, permutation, redDim.size(), redDim.data(), redPermutation.data(),
sizeofType, deviceID, prop, plans)) return CUTT_INTERNAL_ERROR;
#endif

// // Count cycles
Expand Down Expand Up @@ -290,12 +305,6 @@ cuttResult cuttExecute(cuttHandle handle, void* idata, void* odata) {
cudaCheck(cudaGetDevice(&deviceID));
if (deviceID != plan.deviceID) return CUTT_INVALID_DEVICE;

// Set shared memory configuration if necessary
if (!devicesReady.count(deviceID)) {
cuttKernelSetSharedMemConfig();
devicesReady.insert(deviceID);
}

if (!cuttKernel(plan, idata, odata)) return CUTT_INTERNAL_ERROR;
return CUTT_SUCCESS;
}
Loading

0 comments on commit acb7958

Please sign in to comment.