Merge branch 'remove-trainable-layers-ray' into 'main'
Remove all trainable layers using legacy tensor

See merge request dl/hugectr/hugectr!1470
minseokl committed Sep 27, 2023
2 parents d16fac7 + 8d0a2b2 commit 398d8e0
Showing 51 changed files with 374 additions and 10,390 deletions.
106 changes: 3 additions & 103 deletions HugeCTR/include/layers/batch_norm_layer.hpp
@@ -31,15 +31,6 @@ class BatchNormLayer : public TrainableLayer<T, true> {
using Base = TrainableLayer<T, true>;
using WeightType = typename Base::WeightType;

/*
* stores the references to the input tensors of this layer.
*/
Tensors2<T> in_tensors_;
/*
* stores the references to the output tensors of this layer.
*/
Tensors2<T> out_tensors_;

public:
/**
* BatchNorm parameters
@@ -51,20 +42,14 @@ class BatchNormLayer : public TrainableLayer<T, true> {

/**
* Ctor of BatchNormLayer.
* @param weight_buff weight buffer for internal gamma/beta tensors
* @param wgrad_buff gradient buffer for internal gamma/beta tensors
* @param in_tensor the input tensor
* @param out_tensor the output tensor which has the same dim with in_tensor
* @param params BatchNorm parameters
* @param cudnn_handle cuDNN handle created externally
* @param device_id the id of GPU where this layer belongs
*/
BatchNormLayer(const std::shared_ptr<BufferBlock2<float>>& master_weight_buff,
const std::shared_ptr<BufferBlock2<WeightType>>& weight_buff,
const std::shared_ptr<BufferBlock2<WeightType>>& wgrad_buff,
const std::shared_ptr<GeneralBuffer2<CudaAllocator>>& blob_buff,
const Tensor2<T>& in_tensor, const Tensor2<T>& out_tensor, const Params& params,
const std::shared_ptr<GPUResource>& gpu_resource,
BatchNormLayer(const core23::Tensor& in_tensor, const core23::Tensor& out_tensor,
const Params& params, const std::shared_ptr<GPUResource>& gpu_resource,
std::vector<Initializer_t> initializer_types = std::vector<Initializer_t>());
~BatchNormLayer() override;

@@ -89,91 +74,6 @@ class BatchNormLayer : public TrainableLayer<T, true> {
*/
std::string get_no_trained_params_in_string() override;

std::vector<TensorBag2> get_tensors_for_non_trainable_params() override;

private:
/**
* A method of defining how gamma and beta are initialized.
* Gamma is initialized to 1s while Beta is 0ed.
* Override this function to change the initialization behavior.
*/
std::unique_ptr<DataSimulator> get_default_initializer(const int index) override;

const Params params_;
const cudnnBatchNormMode_t mode_;
cudnnTensorDescriptor_t in_out_desc_;
cudnnTensorDescriptor_t gamma_beta_desc_;

// these four pointers are just for convenience
// they are deleted by Layer d'tor through the other pointer aliases: weight_ and wgrad_
Tensor2<float> gamma_;
Tensor2<float> beta_;
Tensor2<float> gamma_grad_;
Tensor2<float> beta_grad_;

// these tensors are internal only managed by smart ptrs
Tensor2<float> result_running_mean_;
Tensor2<float> result_running_var_;
Tensor2<float> result_save_mean_;
Tensor2<float> result_save_inv_var_;

// host array to do device-to-host copy for mean and var
Tensor2<float> h_result_running_mean_;
Tensor2<float> h_result_running_var_;
};

/**
* BatchNorm layer based on cuDNN
*/
template <typename T>
class Core23TempBatchNormLayer : public Core23TempTrainableLayer<T, true> {
using Base = Core23TempTrainableLayer<T, true>;
using WeightType = typename Base::WeightType;

public:
/**
* BatchNorm parameters
*/
struct Params {
double factor; /**< moving average computation factor*/
double eps; /**< small value to avoid divide-by-zero error*/
};

/**
* Ctor of Core23TempBatchNormLayer.
* @param in_tensor the input tensor
* @param out_tensor the output tensor which has the same dim with in_tensor
* @param params BatchNorm parameters
* @param cudnn_handle cuDNN handle created externally
* @param device_id the id of GPU where this layer belongs
*/
Core23TempBatchNormLayer(
const core23::Tensor& in_tensor, const core23::Tensor& out_tensor, const Params& params,
const std::shared_ptr<GPUResource>& gpu_resource,
std::vector<Initializer_t> initializer_types = std::vector<Initializer_t>());
~Core23TempBatchNormLayer() override;

void initialize() override;

/**
* A method of implementing the forward pass of BatchNorm
* @param stream CUDA stream where the forward propagation is executed
*/
void fprop(bool is_train) override;

/**
* A method of implementing the forward pass of BatchNorm
* @param stream CUDA stream where the forward propagation is executed
*/
void bprop() override;

/**
* A method to get mean and variance which are needed for inference as string.
* Session is in charge of calling this method and storing the contents to a file.
* See Session::download_params_to_file() for more detailed information.
*/
std::string get_no_trained_params_in_string() override;

std::vector<core23::Tensor> get_non_trainable_params_as_tensors() override;

private:
@@ -202,7 +102,7 @@ class Core23TempBatchNormLayer : public Core23TempTrainableLayer<T, true> {
core23::Tensor result_save_mean_;
core23::Tensor result_save_inv_var_;

// host arCore23Temp to do device-to-host copy for mean and var
// host ar to do device-to-host copy for mean and var
core23::Tensor h_result_running_mean_;
core23::Tensor h_result_running_var_;
};
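
After this change, BatchNormLayer is constructed directly from core23::Tensor objects instead of BufferBlock2/GeneralBuffer2 buffers. The sketch below is a minimal, hypothetical usage of the new constructor: the tensor allocation, the GPUResource, the include path, and the factor/eps values are assumptions, and the initialize/fprop/bprop lifecycle follows the common Layer interface rather than anything specific to this commit.

#include <memory>
#include <vector>

#include "HugeCTR/include/layers/batch_norm_layer.hpp"  // path relative to the repo root; adjust for the build's include dirs

using namespace HugeCTR;

// Hypothetical helper: builds and runs one BatchNorm layer on tensors that the
// caller has already allocated on the GPU (shape [batch_size, num_features]).
void run_batch_norm(const core23::Tensor& in_tensor, const core23::Tensor& out_tensor,
                    const std::shared_ptr<GPUResource>& gpu_resource) {
  // factor: moving-average factor for the running mean/var; eps: divide-by-zero guard.
  // The values here are illustrative, not taken from this commit.
  BatchNormLayer<float>::Params params{/*factor=*/0.999, /*eps=*/1e-5};

  BatchNormLayer<float> bn(in_tensor, out_tensor, params, gpu_resource);
  bn.initialize();              // assumed to set up the cuDNN descriptors
  bn.fprop(/*is_train=*/true);  // forward pass; updates running statistics
  bn.bprop();                   // backward pass; fills gamma/beta gradients

  // Running mean/var needed for inference can be serialized via
  // bn.get_no_trained_params_in_string().
}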
78 changes: 2 additions & 76 deletions HugeCTR/include/layers/fully_connected_layer.hpp
@@ -41,16 +41,7 @@ class FullyConnectedLayer<float> : public TrainableLayer<float> {
cublasGemmAlgo_t balgo_W_{CUBLAS_GEMM_DEFAULT};
cublasGemmAlgo_t balgo_Xn_{CUBLAS_GEMM_DEFAULT};

/*
* stores the references to the input tensors of this layer.
*/
Tensors2<float> in_tensors_;
/*
* stores the references to the output tensors of this layer.
*/
Tensors2<float> out_tensors_;

Tensors2<float>& get_in_tensors(bool is_train) { return in_tensors_; }
std::vector<core23::Tensor>& get_in_tensors(bool is_train) { return this->input_tensors_; }

public:
/**
@@ -71,16 +62,12 @@ class FullyConnectedLayer<float> : public TrainableLayer<float> {
* Only two kinds of tensor formats are supported:
* (1) weight, input, output, wgrad are all in row-major.
* (2) weight, input, output, wgrad are all in column-major.
* @param weight_buff: stores the weight tensor
* @param wgrad_buff: stores the gradient values of the weight calculated in backward pass
* @param in_tensor: stores the input tensor
* @param out_tensor: stores the output tensor
* @param weight_format: specifies the format of the weight tensor, either HW (row major) or WH
* (col-major)
*/
FullyConnectedLayer(const std::shared_ptr<BufferBlock2<float>>& weight_buff,
const std::shared_ptr<BufferBlock2<float>>& wgrad_buff,
const Tensor2<float>& in_tensor, const Tensor2<float>& out_tensor,
FullyConnectedLayer(const core23::Tensor& in_tensor, const core23::Tensor& out_tensor,
const std::shared_ptr<GPUResource>& gpu_resource, bool use_mixed_precision,
bool enable_tf32_compute,
std::vector<Initializer_t> initializer_types = std::vector<Initializer_t>());
@@ -97,65 +84,4 @@ class FullyConnectedLayer<float> : public TrainableLayer<float> {
std::unique_ptr<DataSimulator> get_default_initializer(const int index) override;
};

template <typename T>
class Core23TempFullyConnectedLayer;

/**
* @brief
* This class implements the fully connected layer.
*/
template <>
class Core23TempFullyConnectedLayer<float> : public Core23TempTrainableLayer<float> {
private:
const bool use_mixed_precision_{false};
const bool enable_tf32_compute_{false};
// Optimized cublasGemmEx algorithm selection
cublasGemmAlgo_t falgo_{CUBLAS_GEMM_DEFAULT};
cublasGemmAlgo_t balgo_W_{CUBLAS_GEMM_DEFAULT};
cublasGemmAlgo_t balgo_Xn_{CUBLAS_GEMM_DEFAULT};

std::vector<core23::Tensor>& get_in_tensors(bool is_train) { return this->input_tensors_; }

public:
/**
* forward pass
*/
void fprop(bool is_train) final;
/**
* backward pass
*/
void bprop() final;
/*
* algorithm search for cublasGemmEx
*/
void search_algorithm() final;
/**
* This is the constructor of the Core23TempFullyConnectedLayer.
* It will check whether the format combination of all tensors is supported or not.
* Only two kinds of tensor formats are supported:
* (1) weight, input, output, wgrad are all in row-major.
* (2) weight, input, output, wgrad are all in column-major.
* @param in_tensor: stores the input tensor
* @param out_tensor: stores the output tensor
* @param weight_format: specifies the format of the weight tensor, either HW (row major) or WH
* (col-major)
*/
Core23TempFullyConnectedLayer(
const core23::Tensor& in_tensor, const core23::Tensor& out_tensor,
const std::shared_ptr<GPUResource>& gpu_resource, bool use_mixed_precision,
bool enable_tf32_compute,
std::vector<Initializer_t> initializer_types = std::vector<Initializer_t>());
Core23TempFullyConnectedLayer(const Core23TempFullyConnectedLayer& C) = delete;
Core23TempFullyConnectedLayer& operator=(const Core23TempFullyConnectedLayer&);

private:
/*
* initializers for this layer.
*/
std::unique_ptr<DataSimulator> get_uniform_initializer(const int index) override;
std::unique_ptr<DataSimulator> get_xavier_uniform_initializer(const int index) override;
std::unique_ptr<DataSimulator> get_xavier_norm_initializer(const int index) override;
std::unique_ptr<DataSimulator> get_default_initializer(const int index) override;
};

} // namespace HugeCTR
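
The FP32 FullyConnectedLayer is likewise reduced to a core23::Tensor-based constructor. A minimal sketch of the new call follows; tensor creation, the GPUResource, and the include path are assumptions about the surrounding training pipeline, and search_algorithm/fprop/bprop are used as declared by the layer interface above.

#include <memory>

#include "HugeCTR/include/layers/fully_connected_layer.hpp"  // repo-relative path; adjust for the build's include dirs

using namespace HugeCTR;

// Hypothetical helper: wires one FP32 fully connected layer between two
// pre-allocated device tensors ([batch, in_dim] -> [batch, out_dim]).
void run_fc_fp32(const core23::Tensor& in_tensor, const core23::Tensor& out_tensor,
                 const std::shared_ptr<GPUResource>& gpu_resource) {
  FullyConnectedLayer<float> fc(in_tensor, out_tensor, gpu_resource,
                                /*use_mixed_precision=*/false,
                                /*enable_tf32_compute=*/false);
  fc.search_algorithm();        // autotunes the cublasGemmEx algorithms (falgo_/balgo_*)
  fc.fprop(/*is_train=*/true);  // out = in * W + b
  fc.bprop();                   // accumulates wgrad and propagates the input gradient
}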
87 changes: 3 additions & 84 deletions HugeCTR/include/layers/fully_connected_layer_half.hpp
@@ -38,20 +38,10 @@ class FullyConnectedLayer<__half> : public TrainableLayer<__half> {
cublasGemmAlgo_t balgo_k_;
cublasGemmAlgo_t balgo_x_;

/*
* stores the references to the input tensors of this layer.
*/
Tensor2<__half> bottom_tensor_;

/*
* stores the references to the output tensors of this layer.
*/
Tensor2<__half> top_tensor_;

/*
* stores the references to the output tensors of GEMM.
*/
Tensor2<__half> identity_tensor_;
core23::Tensor identity_tensor_;

/*
* initializers for this layer.
@@ -61,7 +51,7 @@ class FullyConnectedLayer<__half> : public TrainableLayer<__half> {
std::unique_ptr<DataSimulator> get_xavier_norm_initializer(const int index) override;
std::unique_ptr<DataSimulator> get_default_initializer(const int index) override;

Tensor2<__half>& get_bottom_tensor(bool is_train) { return bottom_tensor_; }
core23::Tensor& get_bottom_tensor(bool is_train) { return this->input_tensors_[0]; }

public:
/**
@@ -87,87 +77,16 @@ class FullyConnectedLayer<__half> : public TrainableLayer<__half> {
* Only two kinds of tensor formats are supported:
* (1) weight, input, output, wgrad are all in row-major.
* (2) weight, input, output, wgrad are all in column-major.
* @param weight_buff: stores the weight tensor
* @param wgrad_buff: stores the gradient values of the weight calculated in backward pass
* @param bottom_tensor: stores the tensor from bottom layer
* @param top_tensor: stores the tensor to top layer
* @param tensor_format: specifies the format of the weight tensor, either HW (row major) or WH
* (col-major)
*/
FullyConnectedLayer(const std::shared_ptr<BufferBlock2<float>>& master_weights_buff,
const std::shared_ptr<BufferBlock2<__half>>& weights_buff,
const std::shared_ptr<BufferBlock2<__half>>& weights_grad_buff,
const std::shared_ptr<GeneralBuffer2<CudaAllocator>>& blobs_buff,
const Tensor2<__half>& bottom_tensor, const Tensor2<__half>& top_tensor,
FullyConnectedLayer(const core23::Tensor& bottom_tensor, const core23::Tensor& top_tensor,
const std::shared_ptr<GPUResource>& gpu_resource,
std::vector<Initializer_t> initializer_types = std::vector<Initializer_t>());
FullyConnectedLayer(const FullyConnectedLayer&) = delete;
FullyConnectedLayer& operator=(const FullyConnectedLayer&);
};

/**
* @brief
* This class implements the fully connected layer.
*/
template <>
class Core23TempFullyConnectedLayer<__half> : public Core23TempTrainableLayer<__half> {
// Optimized cublasGemmEx algorithm selection
cublasGemmAlgo_t falgo_b_;
cublasGemmAlgo_t falgo_k_;
cublasGemmAlgo_t balgo_b_;
cublasGemmAlgo_t balgo_k_;
cublasGemmAlgo_t balgo_x_;

/*
* stores the references to the output tensors of GEMM.
*/
core23::Tensor identity_tensor_;

/*
* initializers for this layer.
*/
std::unique_ptr<DataSimulator> get_uniform_initializer(const int index) override;
std::unique_ptr<DataSimulator> get_xavier_uniform_initializer(const int index) override;
std::unique_ptr<DataSimulator> get_xavier_norm_initializer(const int index) override;
std::unique_ptr<DataSimulator> get_default_initializer(const int index) override;

core23::Tensor& get_bottom_tensor(bool is_train) { return this->input_tensors_[0]; }

public:
/**
* forward pass
*/
void fprop(bool is_train) final;
/**
* backward pass
*/
void bprop() final;
/*
* initialize for cublasGemmEx
*/
void initialize() final;
/*
* algorithm search for cublasGemmEx
*/
void search_algorithm() final;

/**
* This is the constructor of the Core23TempFullyConnectedLayer.
* It will check whether the format combination of all tensors is supported or not.
* Only two kinds of tensor formats are supported:
* (1) weight, input, output, wgrad are all in row-major.
* (2) weight, input, output, wgrad are all in column-major.
* @param bottom_tensor: stores the tensor from bottom layer
* @param top_tensor: stores the tensor to top layer
* @param tensor_format: specifies the format of the weight tensor, either HW (row major) or WH
* (col-major)
*/
Core23TempFullyConnectedLayer(
const core23::Tensor& bottom_tensor, const core23::Tensor& top_tensor,
const std::shared_ptr<GPUResource>& gpu_resource,
std::vector<Initializer_t> initializer_types = std::vector<Initializer_t>());
Core23TempFullyConnectedLayer(const Core23TempFullyConnectedLayer&) = delete;
Core23TempFullyConnectedLayer& operator=(const Core23TempFullyConnectedLayer&);
};

} // namespace HugeCTR
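
The __half specialization follows the same pattern, with mixed precision implied by the template argument rather than a constructor flag. As before, this is a hypothetical sketch: the FP16 tensors and the GPUResource are assumed to be provided by the caller, and initialize/search_algorithm are assumed to perform the cuBLAS setup described by the members above.

#include <memory>

#include "HugeCTR/include/layers/fully_connected_layer_half.hpp"  // repo-relative path; adjust for the build's include dirs

using namespace HugeCTR;

// Hypothetical helper: one mixed-precision (__half) fully connected layer.
// bottom_tensor/top_tensor are assumed to be FP16 device tensors allocated upstream.
void run_fc_fp16(const core23::Tensor& bottom_tensor, const core23::Tensor& top_tensor,
                 const std::shared_ptr<GPUResource>& gpu_resource) {
  FullyConnectedLayer<__half> fc(bottom_tensor, top_tensor, gpu_resource);
  fc.initialize();         // assumed to prepare the identity tensor and cuBLAS state
  fc.search_algorithm();   // picks the falgo_*/balgo_* cublasGemmEx algorithms
  fc.fprop(/*is_train=*/true);
  fc.bprop();
}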