From 8d0a2b2b13de7644a31808bd265ee210c22d3013 Mon Sep 17 00:00:00 2001 From: Ray Wang Date: Wed, 13 Sep 2023 07:45:47 +0000 Subject: [PATCH] Remove all trainable layers using legacy tensor --- HugeCTR/include/layers/batch_norm_layer.hpp | 106 +- .../include/layers/fully_connected_layer.hpp | 78 +- .../layers/fully_connected_layer_half.hpp | 87 +- .../layers/fused_fully_connected_layer.hpp | 82 +- .../fused_relu_bias_fully_connected_layer.hpp | 215 +-- HugeCTR/include/layers/gru_layer.hpp | 89 +- HugeCTR/include/layers/layer_norm_layer.hpp | 80 +- HugeCTR/include/layers/mlp_layer.hpp | 95 +- HugeCTR/include/layers/multi_cross_layer.hpp | 204 +-- .../include/layers/weight_multiply_layer.hpp | 68 +- HugeCTR/include/network_helpers.hpp | 4 +- HugeCTR/include/pybind/model.hpp | 2 +- HugeCTR/include/trainable_layer.hpp | 193 +-- HugeCTR/src/layers/batch_norm_layer.cu | 217 +-- HugeCTR/src/layers/fully_connected_layer.cu | 369 +---- .../src/layers/fully_connected_layer_half.cu | 456 +----- .../src/layers/fused_fully_connected_layer.cu | 344 +--- .../fused_relu_bias_fully_connected_layer.cu | 796 +--------- HugeCTR/src/layers/gru_layer.cu | 391 +---- HugeCTR/src/layers/layer_norm_layer.cu | 156 +- HugeCTR/src/layers/mlp_layer.cu | 285 +--- HugeCTR/src/layers/multi_cross_layer.cu | 1392 ++--------------- HugeCTR/src/layers/weight_multiply_layer.cu | 144 +- .../src/pybind/add_dense_layer_helpers.cpp | 58 +- .../batch_norm_layer_test.cpp | 5 +- .../fully_connected_layer_half_test.cpp | 8 +- .../fully_connected_layer_test.cpp | 8 +- .../fused_fully_connected_layer_test.cpp | 4 +- ...d_relu_bias_fully_connected_layer_test.cpp | 2 +- .../core23_layer_test/gru_layer_test.cpp | 4 +- .../layer_norm_layer_test.cpp | 5 +- test/utest/core23_layer_test/mlp_test.cpp | 24 +- .../multi_cross_layer_test.cpp | 6 +- .../multi_head_attention_layer_test.cpp | 2 +- .../trainable_layer_test.cpp | 4 +- .../weight_multiply_layer_test.cpp | 4 +- .../batch_norm_layer_test_old.cpp | 290 ---- .../fully_connected_layer_half_test_old.cpp | 283 ---- .../fully_connected_layer_test_old.cpp | 263 ---- .../fused_fully_connected_layer_test_old.cpp | 204 --- ...lu_bias_fully_connected_layer_test_old.cpp | 217 --- .../group_dense_layer_test_old.cpp | 583 ------- .../legacy_layer_test/gru_layer_test_old.cpp | 570 ------- .../layer_norm_layer_test_old.cpp | 339 ---- test/utest/legacy_layer_test/mlp_test_old.cpp | 619 -------- .../multi_cross_layer_test_old.cpp | 839 ---------- .../trainable_layer_test_old.cpp | 135 -- .../weight_multiply_layer_test_old.cpp | 173 -- .../utest/loss/loss_with_regularizer_test.cpp | 8 +- test/utest/network/network_build_test.cpp | 2 +- .../loss_with_regularizer_test.cpp | 252 --- 51 files changed, 374 insertions(+), 10390 deletions(-) delete mode 100644 test/utest/legacy_layer_test/batch_norm_layer_test_old.cpp delete mode 100644 test/utest/legacy_layer_test/fully_connected_layer_half_test_old.cpp delete mode 100644 test/utest/legacy_layer_test/fully_connected_layer_test_old.cpp delete mode 100644 test/utest/legacy_layer_test/fused_fully_connected_layer_test_old.cpp delete mode 100644 test/utest/legacy_layer_test/fused_relu_bias_fully_connected_layer_test_old.cpp delete mode 100644 test/utest/legacy_layer_test/group_dense_layer_test_old.cpp delete mode 100644 test/utest/legacy_layer_test/gru_layer_test_old.cpp delete mode 100644 test/utest/legacy_layer_test/layer_norm_layer_test_old.cpp delete mode 100644 test/utest/legacy_layer_test/mlp_test_old.cpp delete mode 100644 
test/utest/legacy_layer_test/multi_cross_layer_test_old.cpp delete mode 100644 test/utest/legacy_layer_test/trainable_layer_test_old.cpp delete mode 100644 test/utest/legacy_layer_test/weight_multiply_layer_test_old.cpp delete mode 100644 test/utest/regularizers/loss_with_regularizer_test.cpp diff --git a/HugeCTR/include/layers/batch_norm_layer.hpp b/HugeCTR/include/layers/batch_norm_layer.hpp index 06c9cdfc40..66370ff76f 100644 --- a/HugeCTR/include/layers/batch_norm_layer.hpp +++ b/HugeCTR/include/layers/batch_norm_layer.hpp @@ -31,15 +31,6 @@ class BatchNormLayer : public TrainableLayer { using Base = TrainableLayer; using WeightType = typename Base::WeightType; - /* - * stores the references to the input tensors of this layer. - */ - Tensors2 in_tensors_; - /* - * stores the references to the output tensors of this layer. - */ - Tensors2 out_tensors_; - public: /** * BatchNorm parameters @@ -51,20 +42,14 @@ class BatchNormLayer : public TrainableLayer { /** * Ctor of BatchNormLayer. - * @param weight_buff weight buffer for internal gamma/beta tensors - * @param wgrad_buff gradient buffer for internal gamma/beta tensors * @param in_tensor the input tensor * @param out_tensor the output tensor which has the same dim with in_tensor * @param params BatchNorm parameters * @param cudnn_handle cuDNN handle created externally * @param device_id the id of GPU where this layer belongs */ - BatchNormLayer(const std::shared_ptr>& master_weight_buff, - const std::shared_ptr>& weight_buff, - const std::shared_ptr>& wgrad_buff, - const std::shared_ptr>& blob_buff, - const Tensor2& in_tensor, const Tensor2& out_tensor, const Params& params, - const std::shared_ptr& gpu_resource, + BatchNormLayer(const core23::Tensor& in_tensor, const core23::Tensor& out_tensor, + const Params& params, const std::shared_ptr& gpu_resource, std::vector initializer_types = std::vector()); ~BatchNormLayer() override; @@ -89,91 +74,6 @@ class BatchNormLayer : public TrainableLayer { */ std::string get_no_trained_params_in_string() override; - std::vector get_tensors_for_non_trainable_params() override; - - private: - /** - * A method of defining how gamma and beta are initialized. - * Gamma is initialized to 1s while Beta is 0ed. - * Override this function to change the initialization behavior. - */ - std::unique_ptr get_default_initializer(const int index) override; - - const Params params_; - const cudnnBatchNormMode_t mode_; - cudnnTensorDescriptor_t in_out_desc_; - cudnnTensorDescriptor_t gamma_beta_desc_; - - // these four pointers are just for convenience - // they are deleted by Layer d'tor through the other pointer aliases: weight_ and wgrad_ - Tensor2 gamma_; - Tensor2 beta_; - Tensor2 gamma_grad_; - Tensor2 beta_grad_; - - // these tensors are internal only managed by smart ptrs - Tensor2 result_running_mean_; - Tensor2 result_running_var_; - Tensor2 result_save_mean_; - Tensor2 result_save_inv_var_; - - // host arCore23Temp to do device-to-host copy for mean and var - Tensor2 h_result_running_mean_; - Tensor2 h_result_running_var_; -}; - -/** - * BatchNorm layer based on cuDNN - */ -template -class Core23TempBatchNormLayer : public Core23TempTrainableLayer { - using Base = Core23TempTrainableLayer; - using WeightType = typename Base::WeightType; - - public: - /** - * BatchNorm parameters - */ - struct Params { - double factor; /**< moving average computation factor*/ - double eps; /**< small value to avoid divide-by-zero error*/ - }; - - /** - * Ctor of Core23TempBatchNormLayer. 
- * @param in_tensor the input tensor - * @param out_tensor the output tensor which has the same dim with in_tensor - * @param params BatchNorm parameters - * @param cudnn_handle cuDNN handle created externally - * @param device_id the id of GPU where this layer belongs - */ - Core23TempBatchNormLayer( - const core23::Tensor& in_tensor, const core23::Tensor& out_tensor, const Params& params, - const std::shared_ptr& gpu_resource, - std::vector initializer_types = std::vector()); - ~Core23TempBatchNormLayer() override; - - void initialize() override; - - /** - * A method of implementing the forward pass of BatchNorm - * @param stream CUDA stream where the forward propagation is executed - */ - void fprop(bool is_train) override; - - /** - * A method of implementing the forward pass of BatchNorm - * @param stream CUDA stream where the forward propagation is executed - */ - void bprop() override; - - /** - * A method to get mean and variance which are needed for inference as string. - * Session is in charge of calling this method and store the contensts to file. - * See Session::download_params_to_file() for more detailed information. - */ - std::string get_no_trained_params_in_string() override; - std::vector get_non_trainable_params_as_tensors() override; private: @@ -202,7 +102,7 @@ class Core23TempBatchNormLayer : public Core23TempTrainableLayer { core23::Tensor result_save_mean_; core23::Tensor result_save_inv_var_; - // host arCore23Temp to do device-to-host copy for mean and var + // host ar to do device-to-host copy for mean and var core23::Tensor h_result_running_mean_; core23::Tensor h_result_running_var_; }; diff --git a/HugeCTR/include/layers/fully_connected_layer.hpp b/HugeCTR/include/layers/fully_connected_layer.hpp index 07f04e7e65..f5b22b16fc 100644 --- a/HugeCTR/include/layers/fully_connected_layer.hpp +++ b/HugeCTR/include/layers/fully_connected_layer.hpp @@ -41,16 +41,7 @@ class FullyConnectedLayer : public TrainableLayer { cublasGemmAlgo_t balgo_W_{CUBLAS_GEMM_DEFAULT}; cublasGemmAlgo_t balgo_Xn_{CUBLAS_GEMM_DEFAULT}; - /* - * stores the references to the input tensors of this layer. - */ - Tensors2 in_tensors_; - /* - * stores the references to the output tensors of this layer. - */ - Tensors2 out_tensors_; - - Tensors2& get_in_tensors(bool is_train) { return in_tensors_; } + std::vector& get_in_tensors(bool is_train) { return this->input_tensors_; } public: /** @@ -71,16 +62,12 @@ class FullyConnectedLayer : public TrainableLayer { * Only two kinds of tensor formats are supported: * (1) weight, input, output, wgrad are all in row-major. * (2) weight, input, output, wgrad are all in column-major. 
- * @param weight_buff: stores the weight tensor - * @param wgrad_buff: stores the gradient values of the weight calculated in backward pass * @param in_tensor: stores the input tensor * @param out_tensor: stores the output tensor * @param weight_format: specifies the format of the weight tensor, either HW (row major) or WH * (col-major) */ - FullyConnectedLayer(const std::shared_ptr>& weight_buff, - const std::shared_ptr>& wgrad_buff, - const Tensor2& in_tensor, const Tensor2& out_tensor, + FullyConnectedLayer(const core23::Tensor& in_tensor, const core23::Tensor& out_tensor, const std::shared_ptr& gpu_resource, bool use_mixed_precision, bool enable_tf32_compute, std::vector initializer_types = std::vector()); @@ -97,65 +84,4 @@ class FullyConnectedLayer : public TrainableLayer { std::unique_ptr get_default_initializer(const int index) override; }; -template -class Core23TempFullyConnectedLayer; - -/** - * @brief - * This class implements the fully connected layer. - */ -template <> -class Core23TempFullyConnectedLayer : public Core23TempTrainableLayer { - private: - const bool use_mixed_precision_{false}; - const bool enable_tf32_compute_{false}; - // Optimized cublasGemmEx algorithm selection - cublasGemmAlgo_t falgo_{CUBLAS_GEMM_DEFAULT}; - cublasGemmAlgo_t balgo_W_{CUBLAS_GEMM_DEFAULT}; - cublasGemmAlgo_t balgo_Xn_{CUBLAS_GEMM_DEFAULT}; - - std::vector& get_in_tensors(bool is_train) { return this->input_tensors_; } - - public: - /** - * forward pass - */ - void fprop(bool is_train) final; - /** - * backward pass - */ - void bprop() final; - /* - * algorithm search for cublasGemmEx - */ - void search_algorithm() final; - /** - * This is the constructor of the Core23TempFullyConnectedLayer. - * It will check whether the format combination of all tensors is supported or not. - * Only two kinds of tensor formats are supported: - * (1) weight, input, output, wgrad are all in row-major. - * (2) weight, input, output, wgrad are all in column-major. - * @param in_tensor: stores the input tensor - * @param out_tensor: stores the output tensor - * @param weight_format: specifies the format of the weight tensor, either HW (row major) or WH - * (col-major) - */ - Core23TempFullyConnectedLayer( - const core23::Tensor& in_tensor, const core23::Tensor& out_tensor, - const std::shared_ptr& gpu_resource, bool use_mixed_precision, - bool enable_tf32_compute, - std::vector initializer_types = std::vector()); - Core23TempFullyConnectedLayer(const Core23TempFullyConnectedLayer& C) = delete; - Core23TempFullyConnectedLayer& operator=(const Core23TempFullyConnectedLayer&); - - private: - /* - * initializers for this layer. - */ - std::unique_ptr get_uniform_initializer(const int index) override; - std::unique_ptr get_xavier_uniform_initializer(const int index) override; - std::unique_ptr get_xavier_norm_initializer(const int index) override; - std::unique_ptr get_default_initializer(const int index) override; -}; - } // namespace HugeCTR diff --git a/HugeCTR/include/layers/fully_connected_layer_half.hpp b/HugeCTR/include/layers/fully_connected_layer_half.hpp index de5a7b08e6..cb97ff6018 100644 --- a/HugeCTR/include/layers/fully_connected_layer_half.hpp +++ b/HugeCTR/include/layers/fully_connected_layer_half.hpp @@ -38,20 +38,10 @@ class FullyConnectedLayer<__half> : public TrainableLayer<__half> { cublasGemmAlgo_t balgo_k_; cublasGemmAlgo_t balgo_x_; - /* - * stores the references to the input tensors of this layer. 
- */ - Tensor2<__half> bottom_tensor_; - - /* - * stores the references to the output tensors of this layer. - */ - Tensor2<__half> top_tensor_; - /* * stores the references to the output tensors of GEMM. */ - Tensor2<__half> identity_tensor_; + core23::Tensor identity_tensor_; /* * initializers for this layer. @@ -61,7 +51,7 @@ class FullyConnectedLayer<__half> : public TrainableLayer<__half> { std::unique_ptr get_xavier_norm_initializer(const int index) override; std::unique_ptr get_default_initializer(const int index) override; - Tensor2<__half>& get_bottom_tensor(bool is_train) { return bottom_tensor_; } + core23::Tensor& get_bottom_tensor(bool is_train) { return this->input_tensors_[0]; } public: /** @@ -87,87 +77,16 @@ class FullyConnectedLayer<__half> : public TrainableLayer<__half> { * Only two kinds of tensor formats are supported: * (1) weight, input, output, wgrad are all in row-major. * (2) weight, input, output, wgrad are all in column-major. - * @param weight_buff: stores the weight tensor - * @param wgrad_buff: stores the gradient values of the weight calculated in backward pass * @param bottom_tensor: stores the tensor from bottom layer * @param top_tensor: stores the tensor to top layer * @param tensor_format: specifies the format of the weight tensor, either HW (row major) or WH * (col-major) */ - FullyConnectedLayer(const std::shared_ptr>& master_weights_buff, - const std::shared_ptr>& weights_buff, - const std::shared_ptr>& weights_grad_buff, - const std::shared_ptr>& blobs_buff, - const Tensor2<__half>& bottom_tensor, const Tensor2<__half>& top_tensor, + FullyConnectedLayer(const core23::Tensor& bottom_tensor, const core23::Tensor& top_tensor, const std::shared_ptr& gpu_resource, std::vector initializer_types = std::vector()); FullyConnectedLayer(const FullyConnectedLayer&) = delete; FullyConnectedLayer& operator=(const FullyConnectedLayer&); }; -/** - * @brief - * This class implements the fully connected layer. - */ -template <> -class Core23TempFullyConnectedLayer<__half> : public Core23TempTrainableLayer<__half> { - // Optimized cublasGemmEx algorithm selection - cublasGemmAlgo_t falgo_b_; - cublasGemmAlgo_t falgo_k_; - cublasGemmAlgo_t balgo_b_; - cublasGemmAlgo_t balgo_k_; - cublasGemmAlgo_t balgo_x_; - - /* - * stores the references to the output tensors of GEMM. - */ - core23::Tensor identity_tensor_; - - /* - * initializers for this layer. - */ - std::unique_ptr get_uniform_initializer(const int index) override; - std::unique_ptr get_xavier_uniform_initializer(const int index) override; - std::unique_ptr get_xavier_norm_initializer(const int index) override; - std::unique_ptr get_default_initializer(const int index) override; - - core23::Tensor& get_bottom_tensor(bool is_train) { return this->input_tensors_[0]; } - - public: - /** - * forward pass - */ - void fprop(bool is_train) final; - /** - * backward pass - */ - void bprop() final; - /* - * initialize for cublasGemmEx - */ - void initialize() final; - /* - * algorithm search for cublasGemmEx - */ - void search_algorithm() final; - - /** - * This is the constructor of the Core23TempFullyConnectedLayer. - * It will check whether the format combination of all tensors is supported or not. - * Only two kinds of tensor formats are supported: - * (1) weight, input, output, wgrad are all in row-major. - * (2) weight, input, output, wgrad are all in column-major. 
- * @param bottom_tensor: stores the tensor from bottom layer - * @param top_tensor: stores the tensor to top layer - * @param tensor_format: specifies the format of the weight tensor, either HW (row major) or WH - * (col-major) - */ - Core23TempFullyConnectedLayer( - const core23::Tensor& bottom_tensor, const core23::Tensor& top_tensor, - const std::shared_ptr& gpu_resource, - std::vector initializer_types = std::vector()); - Core23TempFullyConnectedLayer(const Core23TempFullyConnectedLayer&) = delete; - Core23TempFullyConnectedLayer& operator=(const Core23TempFullyConnectedLayer&); -}; - } // namespace HugeCTR diff --git a/HugeCTR/include/layers/fused_fully_connected_layer.hpp b/HugeCTR/include/layers/fused_fully_connected_layer.hpp index 468d363e20..46979f487f 100644 --- a/HugeCTR/include/layers/fused_fully_connected_layer.hpp +++ b/HugeCTR/include/layers/fused_fully_connected_layer.hpp @@ -24,86 +24,12 @@ #include namespace HugeCTR { -/** - * @brief - * This class implements the fully connected layer. - */ -class FusedFullyConnectedLayer : public TrainableLayer<__half> { - // Optimized cublasGemmEx algorithm selection - cublasGemmAlgo_t falgo_k_{CUBLAS_GEMM_DEFAULT}; - cublasGemmAlgo_t balgo_k_{CUBLAS_GEMM_DEFAULT}; - cublasGemmAlgo_t balgo_x_{CUBLAS_GEMM_DEFAULT}; - - /* - * stores the references to the bottom tensors of this layer. - */ - Tensor2<__half> bottom_tensor_; - - /* - * stores the references to the top tensors of this layer. - */ - Tensor2<__half> top_tensor_; - - /* - * stores the references to the intermediate top tensors of this layer. - */ - Tensor2<__half> middle_tensor_; - - /* - * stores the references to the intermediate bias grad tensors of this layer. - */ - Tensor2 bias_grad_tensor_; - - std::unique_ptr get_uniform_initializer(const int index) override; - std::unique_ptr get_xavier_uniform_initializer(const int index) override; - std::unique_ptr get_xavier_norm_initializer(const int index) override; - std::unique_ptr get_default_initializer(const int index) override; - - Tensor2<__half>& get_bottom_tensor(bool is_train) { return bottom_tensor_; } - - public: - /** - * forward pass - */ - void fprop(bool is_train) final; - /** - * backward pass - */ - void bprop() final; - /* - * algorithm search for cublasGemmEx - */ - void search_algorithm() final; - /** - * This is the constructor of the FullyConnectedLayer. - * It will check whether the format combination of all tensors is supported or not. - * Only two kinds of tensor formats are supported: - * (1) weight, input, output, wgrad are all in row-major. - * (2) weight, input, output, wgrad are all in column-major. 
- * @param weight_buff: stores the weight tensor - * @param wgrad_buff: stores the gradient values of the weight calculated in backward pass - * @param bottom_tensor: stores the tensor from bottom layer - * @param top_tensor: stores the tensor to top layer - * @param tensor_format: specifies the format of the weight tensor, either HW (row major) or WH - * (col-major) - */ - FusedFullyConnectedLayer( - const std::shared_ptr>& master_weights_buff, - const std::shared_ptr>& weights_buff, - const std::shared_ptr>& weights_grad_buff, - const std::shared_ptr>& blobs_buff, - const Tensor2<__half>& bottom_tensor, const Tensor2<__half>& top_tensor, - const std::shared_ptr& gpu_resource, - std::vector initializer_types = std::vector()); - FusedFullyConnectedLayer(const FusedFullyConnectedLayer&) = delete; - FusedFullyConnectedLayer& operator=(const FusedFullyConnectedLayer&); -}; /** * @brief * This class implements the fully connected layer. */ -class Core23TempFusedFullyConnectedLayer : public Core23TempTrainableLayer<__half> { +class FusedFullyConnectedLayer : public TrainableLayer<__half> { // Optimized cublasGemmEx algorithm selection cublasGemmAlgo_t falgo_k_{CUBLAS_GEMM_DEFAULT}; cublasGemmAlgo_t balgo_k_{CUBLAS_GEMM_DEFAULT}; @@ -150,12 +76,12 @@ class Core23TempFusedFullyConnectedLayer : public Core23TempTrainableLayer<__hal * @param tensor_format: specifies the format of the weight tensor, either HW (row major) or WH * (col-major) */ - Core23TempFusedFullyConnectedLayer( + FusedFullyConnectedLayer( const core23::Tensor& bottom_tensor, const core23::Tensor& top_tensor, const std::shared_ptr& gpu_resource, std::vector initializer_types = std::vector()); - Core23TempFusedFullyConnectedLayer(const Core23TempFusedFullyConnectedLayer&) = delete; - Core23TempFusedFullyConnectedLayer& operator=(const Core23TempFusedFullyConnectedLayer&); + FusedFullyConnectedLayer(const FusedFullyConnectedLayer&) = delete; + FusedFullyConnectedLayer& operator=(const FusedFullyConnectedLayer&); }; } // namespace HugeCTR diff --git a/HugeCTR/include/layers/fused_relu_bias_fully_connected_layer.hpp b/HugeCTR/include/layers/fused_relu_bias_fully_connected_layer.hpp index 8aea519957..6013cc8bf0 100644 --- a/HugeCTR/include/layers/fused_relu_bias_fully_connected_layer.hpp +++ b/HugeCTR/include/layers/fused_relu_bias_fully_connected_layer.hpp @@ -62,208 +62,6 @@ class FusedReluBiasFullyConnectedLayer : public TrainableLayer<__half> { // std::vector> master_weights_; It is inherited from Layer, and named as // weights_; - /* - * stores the weight tensors for compute of this layer. - */ - // std::vector> weights_; - Tensors2<__half> weights_half_; - - /* - * stores the weight gradient tensors of this layer. - */ - Tensors2<__half> weights_grad_; - - /* - * stores the references to the bottom tensors of this layer. - */ - Tensor2<__half> train_in_tensor_; - Tensor2<__half> mask_in_tensor_; - Tensor2<__half> dRelu_in_tensor_; - Tensor2<__half> db_in_tensor_; - Tensor2 mask_in_tensor_temp_; - - /* - * stores the references to the top tensors of this layer. - */ - Tensor2<__half> train_out_tensor_; - Tensor2<__half> mask_out_tensor_; - Tensor2<__half> dRelu_out_tensor_; - Tensor2<__half> db_out_tensor_; - - /* - * stores the references to the output tensors of GEMM. - */ - Tensor2<__half> identity_tensor_; - - /* - * stores the references to the intermediate bias grad tensors of this layer. 
- */ - Tensor2 bias_grad_tensor_; - - void* bprop_fusion_; - - /* - * stores the position of this layer in the network - */ - FcPosition_t pos_; - - /* - * stores the activation function of this layer - */ - Activation_t act_; - - /* - * skip the computation of dgrad or not - */ - bool skip_dgrad_; - - /* - * indicates whether overlap dgrad and wgrad - */ - bool async_mlp_wgrad_; - - /* - * determines the kind of fusion pattern. - * There are two fuse patterns available: - * (fuse_wb_ == true) DGRAD + DReLU, WGRAD + BGRAD - * (fuse_wb_ == false) DGRAD + DReLU + BGRAD, WGRAD - */ - bool fuse_wb_; - - /* - * indicates whether there is mask in tensor for Head layer - */ - bool head_mask_in_; - - bool event_overlap_created_; - - cublasHandle_t cublas_handle_wgrad_; - - /* - * record the event when starting to compute wgrad - */ - cudaEvent_t event_overlap_; - - /* - * record the event when finishing computing wgrad (host, async) - */ - // cudaEvent_t event_overlap_end_; - - std::unique_ptr get_uniform_initializer(const int index) override; - std::unique_ptr get_xavier_uniform_initializer(const int index) override; - std::unique_ptr get_xavier_norm_initializer(const int index) override; - std::unique_ptr get_default_initializer(const int index) override; - - Tensor2<__half>& get_bottom_tensor_fprop(bool is_train) { return train_in_tensor_; } - - public: - /** - * forward pass - */ - void fprop(bool is_train) final; - /** - * backward pass - */ - void bprop() final; - /* - * algorithm search for cublasGemmEx - */ - void search_algorithm() final; - void initialize() final; - void initialize_dgrad(); - void initialize_wgrad(); - - /* - * Interfaces for unit tests to debug - */ - Tensors2<__half>& get_weights_half_tensor() { return weights_half_; } - Tensors2<__half>& get_weights_grad_tensor() { return weights_grad_; } - - /* - * return the cuda event recording the finish point of wgrad - */ - // cudaEvent_t& get_event_overlap_end() { return event_overlap_end_; } - - /** - * This is the constructor of the FullyConnectedLayer. - * It will check whether the format combination of all tensors is supported or not. - * Only two kinds of tensor formats are supported: - * (1) weight, input, output, wgrad are all in row-major. - * (2) weight, input, output, wgrad are all in column-major. - * @param weight_buff: stores the weight tensor - * @param wgrad_buff: stores the gradient values of the weight calculated in backward pass - * @param train_bottom_tensor_fprop: stores the tensor from bottom layer for forward propagation - * @param train_bottom_tensor_fprop: stores the tensor from bottom layer for forward propagation - * @param top_tensor_fprop: stores the tensor to top layer when forward propagation - * @param top_tensor_bprop: stores the tensor to top layer when backward propagation - * @param pos: stores the position of this layer: HEAD, BODY, TAIL, ISOLATED. 
- */ - FusedReluBiasFullyConnectedLayer( - const std::shared_ptr>& master_weights_buff, - const std::shared_ptr>& weights_buff, - const std::shared_ptr>& weights_grad_buff, - const std::shared_ptr>& blobs_buff, - const Tensor2<__half>& train_in_tensor, const Tensor2<__half>& mask_in_tensor, - const Tensor2<__half>& dRelu_in_tensor, const Tensor2<__half>& db_in_tensor, - const Tensor2<__half>& train_out_tensor, const Tensor2<__half>& mask_out_tensor, - const Tensor2<__half>& dRelu_out_tensor, Tensor2<__half>& db_out_tensor, - const std::shared_ptr& gpu_resource, const FcPosition_t& pos, - const Activation_t& act, const bool& skip_dgrad, - std::vector initializer_types = std::vector(), - const bool async_mlp_wgrad = false, const bool head_mask_in = false, - const bool fuse_wb = false); - FusedReluBiasFullyConnectedLayer(const FusedReluBiasFullyConnectedLayer&) = delete; - FusedReluBiasFullyConnectedLayer& operator=(const FusedReluBiasFullyConnectedLayer&); - - ~FusedReluBiasFullyConnectedLayer() { - try { - if (event_overlap_created_) { - CudaDeviceContext context(get_device_id()); - HCTR_LIB_THROW(cudaEventDestroy(event_overlap_)); - } - } catch (const std::exception& error) { - HCTR_LOG(INFO, WORLD, "FusedReluBiasFullyConnectedLayer Dtor error:%s", error.what()); - } - }; -}; - -/** - * @brief - * This class implements the fully connected layer. - */ -class Core23TempFusedReluBiasFullyConnectedLayer : public Core23TempTrainableLayer<__half> { - // Optimized cublasGemmEx algorithm selection - cublasLtMatmulAlgo_t falgo_k_; - cublasLtMatmulAlgo_t balgo_dRelu_; - cublasLtMatmulAlgo_t balgo_wgrad_; - cublasGemmAlgo_t balgo_k_{CUBLAS_GEMM_DEFAULT}; - cublasGemmAlgo_t balgo_x_{CUBLAS_GEMM_DEFAULT}; - cublasGemmAlgo_t balgo_b_{CUBLAS_GEMM_DEFAULT}; - - cublasLtMatrixLayout_t cublas_kernel_desc_ = NULL; - cublasLtMatrixLayout_t cublas_top_desc_ = NULL; - cublasLtMatrixLayout_t cublas_bottom_desc_ = NULL; - cublasLtMatrixLayout_t cublas_dRelu_top_desc_ = NULL; - cublasLtMatrixLayout_t cublas_dRelu_bottom_desc_ = NULL; - - cublasLtMatmulDesc_t cublas_op_desc_ = NULL; - cublasLtMatmulDesc_t cublas_op_desc_bprop_ = NULL; - cublasLtMatmulDesc_t cublas_op_desc_wgrad_ = NULL; - - cublasLtMatmulPreference_t cublas_preference_ = NULL; - cublasLtMatmulPreference_t cublas_preference_dRelu_ = NULL; - cublasLtMatmulPreference_t cublas_preference_wgrad_ = NULL; - size_t cublaslt_workspace_size_ = 1024 * 1024 * 32; - void* cublaslt_workspace_; - void* cublaslt_workspace_dRelu_; - void* cublaslt_workspace_wgrad_; - - /* - * stores the weight tensors for compute of this layer. - */ - // std::vector> master_weights_; It is inherited from Layer, and named as - // weights_; - /* * stores the weight tensors for compute of this layer. */ @@ -388,7 +186,7 @@ class Core23TempFusedReluBiasFullyConnectedLayer : public Core23TempTrainableLay * @param top_tensor_bprop: stores the tensor to top layer when backward propagation * @param pos: stores the position of this layer: HEAD, BODY, TAIL, ISOLATED. 
*/ - Core23TempFusedReluBiasFullyConnectedLayer( + FusedReluBiasFullyConnectedLayer( const core23::Tensor& train_in_tensor, const core23::Tensor& mask_in_tensor, const core23::Tensor& dRelu_in_tensor, const core23::Tensor& db_in_tensor, const core23::Tensor& train_out_tensor, const core23::Tensor& mask_out_tensor, @@ -398,20 +196,17 @@ class Core23TempFusedReluBiasFullyConnectedLayer : public Core23TempTrainableLay std::vector initializer_types = std::vector(), const bool async_mlp_wgrad = false, const bool head_mask_in = false, const bool fuse_wb = false); - Core23TempFusedReluBiasFullyConnectedLayer(const Core23TempFusedReluBiasFullyConnectedLayer&) = - delete; - Core23TempFusedReluBiasFullyConnectedLayer& operator=( - const Core23TempFusedReluBiasFullyConnectedLayer&); + FusedReluBiasFullyConnectedLayer(const FusedReluBiasFullyConnectedLayer&) = delete; + FusedReluBiasFullyConnectedLayer& operator=(const FusedReluBiasFullyConnectedLayer&); - ~Core23TempFusedReluBiasFullyConnectedLayer() { + ~FusedReluBiasFullyConnectedLayer() { try { if (event_overlap_created_) { CudaDeviceContext context(get_device_id()); HCTR_LIB_THROW(cudaEventDestroy(event_overlap_)); } } catch (const std::exception& error) { - HCTR_LOG(INFO, WORLD, "Core23TempFusedReluBiasFullyConnectedLayer Dtor error:%s", - error.what()); + HCTR_LOG(INFO, WORLD, "FusedReluBiasFullyConnectedLayer Dtor error:%s", error.what()); } }; }; diff --git a/HugeCTR/include/layers/gru_layer.hpp b/HugeCTR/include/layers/gru_layer.hpp index 06037d5058..007c12311b 100644 --- a/HugeCTR/include/layers/gru_layer.hpp +++ b/HugeCTR/include/layers/gru_layer.hpp @@ -25,26 +25,14 @@ namespace HugeCTR { * GRU function (Interest Extractor Layer) as a derived class of Layer */ template -class GRULayer : public Layer { +class GRULayer : public TrainableLayer { cublasGemmAlgo_t falgo_{CUBLAS_GEMM_DEFAULT}; - /* - * stores the weight gradient tensors of this layer. - */ - Tensors2 wgrad_; - /* - * stores the references to the input tensors of this layer. - */ - Tensors2 in_tensors_; - /* - * stores the references to the output tensors of this layer. 
- */ - Tensors2 out_tensors_; size_t workSpaceSize; size_t reserveSpaceSize; size_t inputTensorSize, outputTensorSize, hiddenTensorSize; - Tensors2 &get_in_tensors(bool is_train) { return in_tensors_; } + std::vector &get_in_tensors(bool is_train) { return this->input_tensors_; } public: /** @@ -64,78 +52,11 @@ class GRULayer : public Layer { * @param out_tensor the output tensor which has the same dim with in_tensor * @param device_id the id of GPU where this layer belongs */ - GRULayer(const std::shared_ptr> &weight_buff, - const std::shared_ptr> &wgrad_buff, const Tensor2 &in_tensor, - const Tensor2 &out_tensor, size_t hiddenSize, size_t batch_size, size_t SeqLength, - size_t embedding_vec_size, const std::shared_ptr &gpu_resource, + GRULayer(const core23::Tensor &in_tensor, const core23::Tensor &out_tensor, int64_t hiddenSize, + int64_t batch_size, int64_t SeqLength, int64_t embedding_vec_size, + const std::shared_ptr &gpu_resource, std::vector initializer_types = std::vector()); - private: - int *seqLengthArray = NULL; - int *devSeqLengthArray = NULL; - void *weightSpace = NULL; - void *dweightSpace = NULL; - void *workSpace = NULL; - void *reserveSpace = NULL; - void *hx = NULL; - - cudnnHandle_t cudnnHandle; - cudnnRNNDescriptor_t rnnDesc; - cudnnRNNDataDescriptor_t in_Desc; - cudnnRNNDataDescriptor_t out_Desc; - cudnnTensorDescriptor_t cDesc; - cudnnTensorDescriptor_t hDesc; - cudnnDropoutDescriptor_t dropoutDesc; - cudnnDataType_t data_type; - - int dimHidden[3]; - int strideHidden[3]; - unsigned long long seed; - size_t stateSize; - void *states; - float dropout = 0; - size_t weightSpaceSize; - size_t seqLength_, miniBatch, embedding_vec_size_, m = 512; - int hiddenSize_; // = 512; //half of the seqLength - int numLinearLayers; -}; - -/** - * GRU function (Interest Extractor Layer) as a derived class of Layer - */ -template -class Core23TempGRULayer : public Core23TempTrainableLayer { - cublasGemmAlgo_t falgo_{CUBLAS_GEMM_DEFAULT}; - - size_t workSpaceSize; - size_t reserveSpaceSize; - size_t inputTensorSize, outputTensorSize, hiddenTensorSize; - - std::vector &get_in_tensors(bool is_train) { return this->input_tensors_; } - - public: - /** - * A method of implementing the forward pass of GRU - * @param stream CUDA stream where the forward propagation is executed - */ - void fprop(bool is_train) final; - /** - * A method of implementing the backward pass of GRU - * @param stream CUDA stream where the backward propagation is executed - */ - void bprop() final; - - /** - * Ctor of Core23TempGRULayer. - * @param in_tensor the input tensor - * @param out_tensor the output tensor which has the same dim with in_tensor - * @param device_id the id of GPU where this layer belongs - */ - Core23TempGRULayer(const core23::Tensor &in_tensor, const core23::Tensor &out_tensor, - int64_t hiddenSize, int64_t batch_size, int64_t SeqLength, - int64_t embedding_vec_size, const std::shared_ptr &gpu_resource, - std::vector initializer_types = std::vector()); - private: int *seqLengthArray = nullptr; int *devSeqLengthArray = nullptr; diff --git a/HugeCTR/include/layers/layer_norm_layer.hpp b/HugeCTR/include/layers/layer_norm_layer.hpp index 20955a1765..a9ca99408b 100644 --- a/HugeCTR/include/layers/layer_norm_layer.hpp +++ b/HugeCTR/include/layers/layer_norm_layer.hpp @@ -30,15 +30,6 @@ template class LayerNormLayer : public TrainableLayer { using Base = TrainableLayer; - /* - * stores the references to the input tensors of this layer. 
- */ - Tensors2 in_tensors_; - /* - * stores the references to the output tensors of this layer. - */ - Tensors2 out_tensors_; - public: /** * LayerNorm parameters @@ -48,21 +39,14 @@ class LayerNormLayer : public TrainableLayer { }; /** * Ctor of LayerNormLayer. - * @param master_weight_buff master_weight buffer for mixed precision training - * @param weight_buff weight buffer for internal gamma/beta tensors - * @param wgrad_buff gradient buffer for internal gamma/beta tensors * @param in_tensor the input tensor * @param out_tensor the output tensor which has the same dim with in_tensor * @param params LayerNorm parameters * @param cudnn_handle cuDNN handle created externally * @param device_id the id of GPU where this layer belongs */ - LayerNormLayer(const std::shared_ptr>& master_weight_buff, - const std::shared_ptr>& weight_buff, - const std::shared_ptr>& wgrad_buff, - const std::shared_ptr>& blob_buff, - const Tensor2& in_tensor, const Tensor2& out_tensor, const Params& params, - const std::shared_ptr& gpu_resource, + LayerNormLayer(const core23::Tensor& in_tensor, const core23::Tensor& out_tensor, + const Params& params, const std::shared_ptr& gpu_resource, std::vector initializer_types = std::vector()); /** @@ -77,66 +61,6 @@ class LayerNormLayer : public TrainableLayer { */ void bprop() override; - private: - /** - * A method of defining how gamma and beta are initialized. - * Gamma is initialized to 1s while Beta is 0ed. - * Override this function to change the initialization behavior. - */ - std::unique_ptr get_default_initializer(const int index) override; - const Params params_; - - // these four pointers are just for convenience - // they are deleted by Layer d'tor through the other pointer aliases: weight_ and wgrad_ - Tensor2 gamma_; - Tensor2 beta_; - Tensor2 gamma_grad_; - Tensor2 beta_grad_; - - // these tensors are internal only managed by smart ptrs - Tensor2 result_save_mean_; - Tensor2 result_save_var_; -}; - -/** - * LayerNorm layer - */ -template -class Core23TempLayerNormLayer : public Core23TempTrainableLayer { - using Base = Core23TempTrainableLayer; - - public: - /** - * LayerNorm parameters - */ - struct Params { - double eps; /**< small value to avoid divide-by-zero error*/ - }; - /** - * Ctor of Core23TempLayerNormLayer. - * @param in_tensor the input tensor - * @param out_tensor the output tensor which has the same dim with in_tensor - * @param params LayerNorm parameters - * @param cudnn_handle cuDNN handle created externally - * @param device_id the id of GPU where this layer belongs - */ - Core23TempLayerNormLayer( - const core23::Tensor& in_tensor, const core23::Tensor& out_tensor, const Params& params, - const std::shared_ptr& gpu_resource, - std::vector initializer_types = std::vector()); - - /** - * A method of implementing the forward pass of LayerNorm - * @param stream CUDA stream where the forward propagation is executed - */ - void fprop(bool is_train) override; - - /** - * A method of implementing the forward pass of LayerNorm - * @param stream CUDA stream where the forward propagation is executed - */ - void bprop() override; - private: /** * A method of defining how gamma and beta are initialized. 
diff --git a/HugeCTR/include/layers/mlp_layer.hpp b/HugeCTR/include/layers/mlp_layer.hpp index bce5bc20ec..a591714f32 100644 --- a/HugeCTR/include/layers/mlp_layer.hpp +++ b/HugeCTR/include/layers/mlp_layer.hpp @@ -28,15 +28,13 @@ namespace HugeCTR { template class MLPLayer : public TrainableLayer { - Tensors2 bottom_tensors_; - Tensors2 top_tensors_; + std::vector train_tensors_, mask_tensors_, dact_tensors_, db_tensors_; - Tensors2 train_tensors_, mask_tensors_, dact_tensors_, db_tensors_; + std::vector kernels_; + std::vector biases_; + std::vector kernels_grad_; - Tensors2 kernels_; - Tensors2 biases_; - Tensors2 kernels_grad_; - std::vector num_outputs_; + std::vector num_outputs_; std::vector acts_; std::vector output_mask_; @@ -59,13 +57,10 @@ class MLPLayer : public TrainableLayer { std::unique_ptr get_default_initializer(const int index) override; public: - MLPLayer(const std::shared_ptr>& master_weights_buff, - const std::shared_ptr>& weights_buff, - const std::shared_ptr>& weights_grad_buff, - const std::shared_ptr>& blobs_buff, - const Tensors2& bottom_tensors, const Tensors2& top_tensors, - const std::vector& num_outputs, const std::shared_ptr& gpu_resource, - const std::vector& acts, const std::vector& use_bias, + MLPLayer(const std::vector& bottom_tensors, + const std::vector& top_tensors, const std::vector& num_outputs, + const std::shared_ptr& gpu_resource, const std::vector& acts, + const std::vector& use_bias, std::vector initializer_types = std::vector(), bool skip_head_dgrad = false, bool async_wgrad = false, bool fuse_wb = false, bool enable_tf32_compute = false); @@ -81,76 +76,6 @@ class MLPLayer : public TrainableLayer { void initialize() final; - /* - * Interfaces for unit tests to debug - */ - Tensor2& get_kernel(int index) { return kernels_[index]; } - Tensor2& get_bias(int index) { return biases_[index]; } - Tensor2& get_kernel_grad(int index) { return kernels_grad_[index]; } - Tensor2& get_bias_grad(int index) { return db_tensors_[index]; } - Tensors2& get_inner_tensors() { return train_tensors_; } - Tensors2& get_input_tensors() { return bottom_tensors_; } - Tensors2& get_output_tensors() { return top_tensors_; } - - ~MLPLayer() { - CudaDeviceContext context(this->get_device_id()); - if (event_overlap_created_) { - cudaEventDestroy(event_overlap_); - } - }; -}; - -template -class Core23TempMLPLayer : public Core23TempTrainableLayer { - std::vector train_tensors_, mask_tensors_, dact_tensors_, db_tensors_; - - std::vector kernels_; - std::vector biases_; - std::vector kernels_grad_; - - std::vector num_outputs_; - std::vector acts_; - - std::vector output_mask_; - std::vector use_bias_; - - bool async_wgrad_; - bool fuse_wb_; - bool enable_tf32_compute_; - bool skip_head_dgrad_; - - bool event_overlap_created_; - cudaEvent_t event_overlap_; - std::vector> layer_desc_; - std::vector> layer_algo_; - FusedFCLayerFunctors layer_functors_; - - std::unique_ptr get_uniform_initializer(const int index) override; - std::unique_ptr get_xavier_uniform_initializer(const int index) override; - std::unique_ptr get_xavier_norm_initializer(const int index) override; - std::unique_ptr get_default_initializer(const int index) override; - - public: - Core23TempMLPLayer(const std::vector& bottom_tensors, - const std::vector& top_tensors, - const std::vector& num_outputs, - const std::shared_ptr& gpu_resource, - const std::vector& acts, const std::vector& use_bias, - std::vector initializer_types = std::vector(), - bool skip_head_dgrad = false, bool async_wgrad = false, bool 
fuse_wb = false, - bool enable_tf32_compute = false); - - Core23TempMLPLayer(const Core23TempMLPLayer& C) = delete; - Core23TempMLPLayer& operator=(const Core23TempMLPLayer&); - - void fprop(bool is_train) final; - - void bprop() final; - - void search_algorithm() final; - - void initialize() final; - /* * Interfaces for unit tests to debug */ @@ -162,7 +87,7 @@ class Core23TempMLPLayer : public Core23TempTrainableLayer { auto& get_input_tensors() { return this->input_tensors_; } auto& get_output_tensors() { return this->output_tensors_; } - ~Core23TempMLPLayer() { + ~MLPLayer() { CudaDeviceContext context(this->get_device_id()); if (event_overlap_created_) { cudaEventDestroy(event_overlap_); diff --git a/HugeCTR/include/layers/multi_cross_layer.hpp b/HugeCTR/include/layers/multi_cross_layer.hpp index ca03ef814a..8902b18a2c 100644 --- a/HugeCTR/include/layers/multi_cross_layer.hpp +++ b/HugeCTR/include/layers/multi_cross_layer.hpp @@ -28,162 +28,6 @@ struct MultiCrossForwardFunctor { MultiCrossForwardFunctor(const MultiCrossForwardFunctor&) = delete; MultiCrossForwardFunctor& operator=(const MultiCrossForwardFunctor&) = delete; - void operator()(cudaStream_t stream, cublasHandle_t cublas_handle, const Tensor2& input_tensor, - const Tensors2& kernel_tensors, const Tensors2& bias_tensors, - Tensors2& layer_output_tensors, Tensors2& layer_hidden_tensors, - int num_layers) const; -}; -template -struct MultiCrossForwardFunctorv2 { - GemmFunctor gemm_functor_; - MultiCrossForwardFunctorv2() = default; - MultiCrossForwardFunctorv2(const MultiCrossForwardFunctorv2&) = delete; - MultiCrossForwardFunctorv2& operator=(const MultiCrossForwardFunctorv2&) = delete; - void search_algorithm(T* bottom, T* top, T* kernel, size_t batch_size, size_t input_size, - size_t output_size, const CublasFusedFCLayerDesc& cublas_layer_desc, - cublasLtHandle_t cublaslt_handle, cudaStream_t stream); - void operator()(cudaStream_t stream, const Tensor2& input_tensor, - const Tensors2& kernel_tensors, const Tensors2& bias_tensors, - Tensors2& XU_tensors, Tensors2& layer_output_tensors, - Tensors2& layer_hidden_tensors, int num_layers, - const std::vector>& xu_descr_, - const std::vector>& xuvb_descr_, - const std::vector>& xu_fprop_algo_, - const std::vector>& xuvb_fprop_algo_, cublasLtHandle_t = nullptr); -}; - -template -struct MultiCrossBackwardFunctorv2 { - GemmFunctor gemm_functor_; - MultiCrossBackwardFunctorv2() = default; - MultiCrossBackwardFunctorv2(const MultiCrossBackwardFunctorv2&) = default; - MultiCrossBackwardFunctorv2& operator=(const MultiCrossBackwardFunctorv2&) = delete; - - void operator()(cudaStream_t dgrad_stream, cudaStream_t wgrad_stream, bool async_wgrad, - cudaEvent_t& event_overlap, const Tensor2& input_tensor, - const Tensors2& kernel_tensors, const Tensors2& act_tensors, - const Tensors2& layer_hidden_tensors, Tensors2& kernel_output_tensors, - Tensors2& grad_tensors, Tensors2& bias_output_tensors, - Tensors2& XU_tensors, Tensor2 accum_dx_tensor_, Tensors2 bprop_bottoms, - int num_layers, const std::vector>& xu_descr_, - const std::vector>& xuvb_descr_, - const std::vector>& du_descrs_bprop_, - const std::vector>& dhidden_descrs_bprop_, - const std::vector>& xu_bprop_algo_, - const std::vector>& xuvb_bprop_algo_, - const std::vector>& du_bprop_algos_, - const std::vector>& dhidden_bprop_algos_, - cublasLtHandle_t cublaslt_handle = nullptr); -}; - -template -struct MultiCrossBackwardFunctor { - MultiCrossBackwardFunctor() = default; - MultiCrossBackwardFunctor(const MultiCrossBackwardFunctor&) 
= default; - MultiCrossBackwardFunctor& operator=(const MultiCrossBackwardFunctor&) = delete; - - void operator()(cudaStream_t stream, const Tensor2& input_tensor, - const Tensors2& kernel_tensors, const Tensors2& layer_output_tensors, - const Tensors2& layer_hidden_tensors, const Tensor2& grad_tensor, - Tensor2& output_tensor, Tensors2& kernel_output_tensors, - Tensors2& bias_output_tensors, Tensor2& tmp_vec_tensor, - Tensor2 tmp_mat_tensors[], int num_layers) const; -}; - -template -class MultiCrossLayer : public TrainableLayer { - private: - const int num_layers_; - const size_t projection_dim_; - Tensors2 dgrads_; /**< vector of internal blobs' tensors, intermediate dgrad of each - interaction layer: T_4 */ - Tensors2 activation_tensors_; /**< vector of internal blobs' tensors, intermediate output of - each interaction layer: T_4 */ - Tensors2 hidden_tensors_; // DCNv1: x_i * w ; DCNv2: x * x_i * w + b; T_7 - Tensors2 XU_tensors_; // DCNv2: - - Tensor2 tmp_mat_tensors_[4]; //[h,w] - - Tensor2 accum_dx_tensor_; - Tensors2 bprop_bottom_; - Tensor2 tmp_vec_tensor_; //[h,1] - - /* - * stores the references to the input tensors of this layer. - */ - Tensors2 in_tensors_; - /* - * stores the references to the output tensors of this layer. - */ - Tensors2 out_tensors_; - - std::vector> xu_descrs_fprop_; - std::vector> xuvb_descrs_fprop_; - std::vector> xu_descrs_bprop_; - std::vector> xuvb_descrs_bprop_; - std::vector> du_descrs_bprop_; - std::vector> dhidden_descrs_bprop_; - - std::vector> xu_fprop_algos_; - std::vector> xuvb_fprop_algos_; - std::vector> xu_bprop_algos_; - std::vector> xuvb_bprop_algos_; - std::vector> du_bprop_algos_; - std::vector> dhidden_bprop_algos_; - - bool enable_tf32_compute_; - bool async_wgrad_ = false; - - MultiCrossForwardFunctorv2 dcnv2_forward_functor_; - MultiCrossBackwardFunctorv2 dcnv2_backward_functor_; - - cudaStream_t wgrad_stream_; - cudaEvent_t event_fork_; - - public: - /** - * forward pass - */ - void fprop(bool is_train) final; - Tensors2& get_hidden_tensors() { return hidden_tensors_; }; - Tensors2& get_weight_tensor() { return XU_tensors_; }; - /** - * backward pass - */ - void search_algorithm() override; - void bprop() final; - void initialize() override; - MultiCrossLayer(const std::shared_ptr>& master_weight_buff, - const std::shared_ptr>& weight_buff, - const std::shared_ptr>& wgrad_buff, - const std::shared_ptr>& blobs_buff, - const Tensor2& in_tensor, const Tensor2& out_tensor, - const std::shared_ptr& gpu_resource, int num_layers, - size_t projection_dim = 0, - std::vector initializer_types = std::vector(), - bool enable_tf32_compute = false, bool async_wgrad = false); - MultiCrossLayer(const std::shared_ptr>& master_weight_buff, - const std::shared_ptr>& weight_buff, - const std::shared_ptr>& wgrad_buff, - const std::shared_ptr>& blobs_buff, - const Tensors2& in_tensor, const Tensors2& out_tensor, - const std::shared_ptr& gpu_resource, int num_layers, - size_t projection_dim = 0, - std::vector initializer_types = std::vector(), - bool enable_tf32_compute = false, bool async_wgrad = false); - MultiCrossLayer(const MultiCrossLayer&) = delete; - MultiCrossLayer& operator=(const MultiCrossLayer&) = delete; - - private: - std::unique_ptr get_default_initializer(const int index) override; -}; - -template -struct Core23TempMultiCrossForwardFunctor { - Core23TempMultiCrossForwardFunctor() = default; - Core23TempMultiCrossForwardFunctor(const Core23TempMultiCrossForwardFunctor&) = delete; - Core23TempMultiCrossForwardFunctor& operator=(const 
Core23TempMultiCrossForwardFunctor&) = delete; - void operator()(cudaStream_t stream, cublasHandle_t cublas_handle, const core23::Tensor& input_tensor, const std::vector& kernel_tensors, @@ -192,12 +36,11 @@ struct Core23TempMultiCrossForwardFunctor { std::vector& layer_hidden_tensors, int num_layers) const; }; template -struct Core23TempMultiCrossForwardFunctorv2 { +struct MultiCrossForwardFunctorv2 { GemmFunctor gemm_functor_; - Core23TempMultiCrossForwardFunctorv2() = default; - Core23TempMultiCrossForwardFunctorv2(const Core23TempMultiCrossForwardFunctorv2&) = delete; - Core23TempMultiCrossForwardFunctorv2& operator=(const Core23TempMultiCrossForwardFunctorv2&) = - delete; + MultiCrossForwardFunctorv2() = default; + MultiCrossForwardFunctorv2(const MultiCrossForwardFunctorv2&) = delete; + MultiCrossForwardFunctorv2& operator=(const MultiCrossForwardFunctorv2&) = delete; void search_algorithm(T* bottom, T* top, T* kernel, int64_t batch_size, int64_t input_size, int64_t output_size, const CublasFusedFCLayerDesc& cublas_layer_desc, cublasLtHandle_t cublaslt_handle, cudaStream_t stream); @@ -214,13 +57,12 @@ struct Core23TempMultiCrossForwardFunctorv2 { }; template -struct Core23TempMultiCrossBackwardFunctorv2 { +struct MultiCrossBackwardFunctorv2 { GemmFunctor gemm_functor_; - Core23TempMultiCrossBackwardFunctorv2() = default; - Core23TempMultiCrossBackwardFunctorv2(const Core23TempMultiCrossBackwardFunctorv2&) = delete; - Core23TempMultiCrossBackwardFunctorv2& operator=(const Core23TempMultiCrossBackwardFunctorv2&) = - delete; + MultiCrossBackwardFunctorv2() = default; + MultiCrossBackwardFunctorv2(const MultiCrossBackwardFunctorv2&) = delete; + MultiCrossBackwardFunctorv2& operator=(const MultiCrossBackwardFunctorv2&) = delete; void operator()(cudaStream_t dgrad_stream, cudaStream_t wgrad_stream, bool async_wgrad, cudaEvent_t& event_overlap, const core23::Tensor& input_tensor, const std::vector& kernel_tensors, @@ -243,11 +85,10 @@ struct Core23TempMultiCrossBackwardFunctorv2 { }; template -struct Core23TempMultiCrossBackwardFunctor { - Core23TempMultiCrossBackwardFunctor() = default; - Core23TempMultiCrossBackwardFunctor(const Core23TempMultiCrossBackwardFunctor&) = delete; - Core23TempMultiCrossBackwardFunctor& operator=(const Core23TempMultiCrossBackwardFunctor&) = - delete; +struct MultiCrossBackwardFunctor { + MultiCrossBackwardFunctor() = default; + MultiCrossBackwardFunctor(const MultiCrossBackwardFunctor&) = delete; + MultiCrossBackwardFunctor& operator=(const MultiCrossBackwardFunctor&) = delete; void operator()(cudaStream_t stream, const core23::Tensor& input_tensor, const std::vector& kernel_tensors, @@ -260,7 +101,7 @@ struct Core23TempMultiCrossBackwardFunctor { }; template -class Core23TempMultiCrossLayer : public Core23TempTrainableLayer { +class MultiCrossLayer : public TrainableLayer { private: const int num_layers_; const int64_t projection_dim_; @@ -300,8 +141,8 @@ class Core23TempMultiCrossLayer : public Core23TempTrainableLayer { std::vector> du_bprop_algos_; std::vector> dhidden_bprop_algos_; - Core23TempMultiCrossForwardFunctorv2 dcnv2_forward_functor_; - Core23TempMultiCrossBackwardFunctorv2 dcnv2_backward_functor_; + MultiCrossForwardFunctorv2 dcnv2_forward_functor_; + MultiCrossBackwardFunctorv2 dcnv2_backward_functor_; bool enable_tf32_compute_; bool async_wgrad_ = false; cudaStream_t wgrad_stream_; @@ -320,13 +161,14 @@ class Core23TempMultiCrossLayer : public Core23TempTrainableLayer { void search_algorithm() override; void bprop() final; void initialize() 
override; - Core23TempMultiCrossLayer( - const std::vector& in_tensors, const std::vector& out_tensors, - const std::shared_ptr& gpu_resource, int num_layers, int64_t projection_dim, - std::vector initializer_types = std::vector(), - bool enable_tf32_compute = false, bool async_wgrad = false); - Core23TempMultiCrossLayer(const Core23TempMultiCrossLayer&) = delete; - Core23TempMultiCrossLayer& operator=(const Core23TempMultiCrossLayer&) = delete; + MultiCrossLayer(const std::vector& in_tensors, + const std::vector& out_tensors, + const std::shared_ptr& gpu_resource, int num_layers, + int64_t projection_dim, + std::vector initializer_types = std::vector(), + bool enable_tf32_compute = false, bool async_wgrad = false); + MultiCrossLayer(const MultiCrossLayer&) = delete; + MultiCrossLayer& operator=(const MultiCrossLayer&) = delete; private: std::unique_ptr get_default_initializer(const int index) override; diff --git a/HugeCTR/include/layers/weight_multiply_layer.hpp b/HugeCTR/include/layers/weight_multiply_layer.hpp index 5eca5c7677..9b668030c2 100644 --- a/HugeCTR/include/layers/weight_multiply_layer.hpp +++ b/HugeCTR/include/layers/weight_multiply_layer.hpp @@ -31,15 +31,6 @@ namespace HugeCTR { */ template class WeightMultiplyLayer : public TrainableLayer { - /* - * stores the weight tensors of this layer. - */ - Tensors2 in_tensors_; - /* - * stores the references to the output tensors of this layer. - */ - Tensors2 out_tensors_; - public: /** * Ctor of WeightMultiplyLayer. @@ -47,12 +38,8 @@ class WeightMultiplyLayer : public TrainableLayer { * @param out_tensor the resulting output tensor * @param device_id the id of GPU where this layer belongs */ - WeightMultiplyLayer(const std::shared_ptr>& master_weight_buff, - const std::shared_ptr>& weight_buff, - const std::shared_ptr>& wgrad_buff, - const std::shared_ptr>& blob_buff, - const Tensor2& in_tensor, Tensor2& out_tensor, - const std::vector& weight_dims, + WeightMultiplyLayer(const core23::Tensor& in_tensor, core23::Tensor& out_tensor, + const core23::Shape& weight_dims, const std::shared_ptr& gpu_resource, std::vector initializer_types = std::vector()); @@ -69,57 +56,6 @@ class WeightMultiplyLayer : public TrainableLayer { */ void bprop() override; - private: - // void reserve_master_weight_tensor(const std::shared_ptr>& - // master_weight_buff, - // const std::vector& weight_dims); - std::unique_ptr get_uniform_initializer(const int index) override; - std::unique_ptr get_xavier_uniform_initializer(const int index) override; - std::unique_ptr get_xavier_norm_initializer(const int index) override; - std::unique_ptr get_default_initializer(const int index) override; - - size_t batch_size_; - size_t slot_num_; - size_t embedding_vec_size_; - Tensor2 wgrad_tmp_trans_; -}; - -/** - * Layer which does element-wise product by input tensor X and weight W. - * The input tensor X has dimension: [batch_size, slot_num], while - * the input weight W has dimension: [slot_num, embedding_vec_size]. - * The Core23TempWeightMultiplyLayer will broadcast the value of W to "batch_size" dim - * and broadcast the value of X to embedding_vec_size dim automatically - * when doing element-wise product with X. So, the output tensor has - * the dimension: [batch_size, slot_num*embedding_vec_size]. - */ -template -class Core23TempWeightMultiplyLayer : public Core23TempTrainableLayer { - public: - /** - * Ctor of Core23TempWeightMultiplyLayer. 
- * @param in_tensor the input tensor - * @param out_tensor the resulting output tensor - * @param device_id the id of GPU where this layer belongs - */ - Core23TempWeightMultiplyLayer( - const core23::Tensor& in_tensor, core23::Tensor& out_tensor, const core23::Shape& weight_dims, - const std::shared_ptr& gpu_resource, - std::vector initializer_types = std::vector()); - - ~Core23TempWeightMultiplyLayer() override{}; - - /** - * Core23TempWeightMultiplyLayer's forward propagation to do element-wise production - * @param stream CUDA stream where the forward propagation is executed - */ - void fprop(bool is_train) override; - /** - * Core23TempWeightMultiplyLayer's backward propagation - * @param stream CUDA stream where the forward propagation is executed - */ - void bprop() override; - private: // void reserve_master_weight_tensor(const std::shared_ptr>& // master_weight_buff, diff --git a/HugeCTR/include/network_helpers.hpp b/HugeCTR/include/network_helpers.hpp index fc2483758a..1d198e979d 100644 --- a/HugeCTR/include/network_helpers.hpp +++ b/HugeCTR/include/network_helpers.hpp @@ -44,9 +44,9 @@ std::vector get_trainable_tensor_vector( } }; for (auto& layer : layers) { - auto trainable_layer = dynamic_cast*>(layer.get()); + auto trainable_layer = dynamic_cast*>(layer.get()); if (!op(trainable_layer)) { - auto trainable_layer = dynamic_cast*>(layer.get()); + auto trainable_layer = dynamic_cast*>(layer.get()); op(trainable_layer); } } diff --git a/HugeCTR/include/pybind/model.hpp b/HugeCTR/include/pybind/model.hpp index 7a000d4e14..71f478ea9b 100644 --- a/HugeCTR/include/pybind/model.hpp +++ b/HugeCTR/include/pybind/model.hpp @@ -43,7 +43,7 @@ namespace HugeCTR { -class Core23TempNetwork; +class Network; namespace { diff --git a/HugeCTR/include/trainable_layer.hpp b/HugeCTR/include/trainable_layer.hpp index 19cc165eb4..2ed1b701bf 100644 --- a/HugeCTR/include/trainable_layer.hpp +++ b/HugeCTR/include/trainable_layer.hpp @@ -22,185 +22,6 @@ #include namespace HugeCTR { -/** - * @brief - * Trainable layer is the common parent of all layers with weights - * @tparams DType the data type of inputs, outputs, and weights - * @tparams use_FP32_weight if specified, the weight data type is in FP32, not DType - */ -template ::value> -class TrainableLayer : public Layer { - // FP32 input/output but lower precision weight don't make much sense. - static_assert(!(std::is_same::value && use_FP32_weight == false)); - - protected: - // Why WeightType is protected? - // it is convenient for a child trainable to access the weight type, - // especially if it wants to use FP32 weights but inputs/outputs the lower precision data. - // A typical example is when DType is __half but use_FP32_weight is true. - // Then, the child class should define the following alias to make their code cleaner: - // (1) using Base = TrainableLayer; - // (2) using WeightType = typename Base::WeightType; - // If useFP32_weight is false, the aliases are not necessary. - using WeightType = typename std::conditional::type; - - private: - Tensors2 master_weights_; - Tensors2 weights_; - Tensors2 wgrads_; - const std::shared_ptr> master_weight_buff_; - const std::shared_ptr> weight_buff_; - const std::shared_ptr> wgrad_buff_; - // Layer initializers. - // if the layer need a specific weight initialization, override each function accordingly. 
- virtual std::unique_ptr get_zero_initializer(const int index) override { - return std::make_unique(0.0f); - } - virtual std::unique_ptr get_uniform_initializer(const int index) override { - return std::move(get_default_initializer(index)); - }; - virtual std::unique_ptr get_xavier_uniform_initializer(const int index) override { - return std::move(get_default_initializer(index)); - }; - virtual std::unique_ptr get_xavier_norm_initializer(const int index) override { - return std::move(get_default_initializer(index)); - }; - virtual std::unique_ptr get_default_initializer(const int index) override { - return std::move(get_zero_initializer(index)); - }; - - protected: - // @brief a modifier to reserve a weight tensor at idx with the specified dims. - // @details - // Usage: In a child class, this->set_weight(0, dims); - void set_weight(size_t idx, const std::vector& dimensions) { - HCTR_CHECK_HINT(weights_.size() == idx, "Wrong index for setting weight tensors"); - - Tensor2 tensor; - weight_buff_->reserve(dimensions, &tensor); - weights_.push_back(tensor); - - // master weights are used only when compute weights have lower precision - if constexpr (!use_FP32_weight) { - HCTR_CHECK_HINT(master_weights_.size() == idx, - "Wrong index for setting master weight tensors"); - - Tensor2 tensor; - master_weight_buff_->reserve(dimensions, &tensor); - master_weights_.push_back(tensor); - } - } - // @brief a modifier to reserve a weight tensor at idx with the specified dims. - // @details - // Usage: In a child class, this->set_wgrad(0, dims); - void set_wgrad(size_t idx, const std::vector& dimensions) { - HCTR_CHECK_HINT(wgrads_.size() == idx, "Wrong index for setting weight gradient tensors"); - - Tensor2 tensor; - wgrad_buff_->reserve(dimensions, &tensor); - wgrads_.push_back(tensor); - } - // @brief an accessor to get a weight tensor at idx - // @details - // Usage: In a child class, auto weight2 = this->get_weight(2); - auto& get_weight(size_t idx) { - HCTR_CHECK_HINT(idx < weights_.size(), "Wrong index for getting weight tensors"); - return weights_[idx]; - } - // @brief an accessor to get a wgrad tensor at idx - // @details - // Usage: In a child class, auto wgrad2 = this->get_wgrad(2); - auto& get_wgrad(size_t idx) { - HCTR_CHECK_HINT(idx < wgrads_.size(), "Wrong index for getting weight gradient tensors"); - return wgrads_[idx]; - } - - public: - // @brief a parameter initialization function - // @details - // init_params calls the specific initializers to initialize parameters. The types of initializers - // are specified by initializer_types_. - void init_params(const curandGenerator_t& generator) override; - - /** - * Ctor of TrainableLayer. - * @param master_weight_buff the buffer to reserve master weight tensors, used only if WeightType - * is not FP32. - * @param weight_buff the buffer to reserve weight tensors - * @param wgrad_buff the buffer to reserve weight gradient tensors - * @param gpu_resource the abstraction of GPU where this dense layer resides - * @param initializer_types the list of initializer types of all weight tensors - */ - TrainableLayer(const std::shared_ptr>& master_weight_buff, - const std::shared_ptr>& weight_buff, - const std::shared_ptr>& wgrad_buff, - const std::shared_ptr& gpu_resource, - std::vector initializer_types = std::vector()) - : Layer(gpu_resource, initializer_types), - // if WeightType is float, master weights are not used at all - master_weight_buff_(std::is_same::value ? 
nullptr : master_weight_buff), - weight_buff_(weight_buff), - wgrad_buff_(wgrad_buff) {} -}; - -template -void TrainableLayer::init_params(const curandGenerator_t& generator) { - std::shared_ptr> buff = - GeneralBuffer2::create(); - std::shared_ptr> block = buff->create_block(); - - Tensors2 weights = master_weights_; - if constexpr (std::is_same::value && use_FP32_weight) { - weights = weights_; - } - - Tensors2 weight_cpu_tensors; - for (const Tensor2& weight : weights) { - Tensor2 tensor; - block->reserve(weight.get_dimensions(), &tensor); - weight_cpu_tensors.push_back(tensor); - } - - buff->allocate(); - - std::vector> simulators; - // each weight has its own initializer - for (int index = 0; index < static_cast(weights.size()); ++index) { - switch (initializer_types_[index % initializer_types_.size()]) { - case Initializer_t::Uniform: { - simulators.push_back(get_uniform_initializer(index)); - break; - } - case Initializer_t::XavierNorm: { - simulators.push_back(get_xavier_norm_initializer(index)); - break; - } - case Initializer_t::XavierUniform: { - simulators.push_back(get_xavier_uniform_initializer(index)); - break; - } - case Initializer_t::Zero: { - simulators.push_back(get_zero_initializer(index)); - break; - } - case Initializer_t::Default: { - simulators.push_back(get_default_initializer(index)); - break; - } - default: { - HCTR_OWN_THROW(Error_t::OutOfBound, "Not supported initializer."); - break; - } - } - } - - for (size_t i = 0; i < weights.size(); ++i) { - simulators[i]->fill(weight_cpu_tensors[i], generator); - HCTR_LIB_THROW(cudaMemcpyAsync(weights[i].get_ptr(), weight_cpu_tensors[i].get_ptr(), - weights[i].get_size_in_bytes(), cudaMemcpyHostToDevice, - get_gpu().get_stream())); - } -} /** * @brief @@ -209,7 +30,7 @@ void TrainableLayer::init_params(const curandGenerator_t * @tparams use_FP32_weight if specified, the weight data type is in FP32, not DType */ template ::value> -class Core23TempTrainableLayer : public Layer { +class TrainableLayer : public Layer { // FP32 input/output but lower precision weight don't make much sense. 
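[Annotation, not part of the patch] The static_assert immediately below rejects the one combination the comment above calls out: FP32 inputs/outputs with lower-precision weights. Assuming the template default elided above is use_FP32_weight = std::is_same<DType, float>::value (as in the legacy class removed earlier in this hunk), the remaining combinations read roughly as follows; this is an illustrative sketch, not code from the header:

using FloatLayer = TrainableLayer<float>;         // float I/O and weights; no master weights kept
using HalfLayer  = TrainableLayer<__half>;        // __half I/O and weights; float master weights kept
using MixedLayer = TrainableLayer<__half, true>;  // __half I/O, but weights stored in float
// TrainableLayer<float, false> would trip the static_assert below.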
static_assert(!(std::is_same::value && use_FP32_weight == false)); @@ -310,11 +131,10 @@ class Core23TempTrainableLayer : public Layer { * @param gpu_resource the abstraction of GPU where this dense layer resides * @param initializer_types the list of initializer types of all weight tensors */ - Core23TempTrainableLayer( - const std::vector& input_tensors, - const std::vector& output_tensors, - const std::shared_ptr& gpu_resource, - std::vector initializer_types = std::vector()) + TrainableLayer(const std::vector& input_tensors, + const std::vector& output_tensors, + const std::shared_ptr& gpu_resource, + std::vector initializer_types = std::vector()) : Layer(input_tensors, output_tensors, gpu_resource, initializer_types), master_weights_params_(core23::TensorParams() .alignment(sizeof(float)) @@ -339,8 +159,7 @@ class Core23TempTrainableLayer : public Layer { }; template -void Core23TempTrainableLayer::init_params( - const curandGenerator_t& generator) { +void TrainableLayer::init_params(const curandGenerator_t& generator) { std::vector weights = master_weights_; if constexpr (std::is_same::value && use_FP32_weight) { weights = weights_; diff --git a/HugeCTR/src/layers/batch_norm_layer.cu b/HugeCTR/src/layers/batch_norm_layer.cu index ceb3ef107c..9c18b77182 100644 --- a/HugeCTR/src/layers/batch_norm_layer.cu +++ b/HugeCTR/src/layers/batch_norm_layer.cu @@ -29,206 +29,10 @@ using ToStringType = typename std::conditional::value, f } template -BatchNormLayer::BatchNormLayer(const std::shared_ptr>& master_weight_buff, - const std::shared_ptr>& weight_buff, - const std::shared_ptr>& wgrad_buff, - const std::shared_ptr>& blob_buff, - const Tensor2& in_tensor, const Tensor2& out_tensor, +BatchNormLayer::BatchNormLayer(const core23::Tensor& in_tensor, const core23::Tensor& out_tensor, const Params& params, const std::shared_ptr& gpu_resource, std::vector initializer_types) - : Base(master_weight_buff, weight_buff, wgrad_buff, gpu_resource, initializer_types), - params_(params), - mode_(CUDNN_BATCHNORM_PER_ACTIVATION) { - CudaDeviceContext context(this->get_device_id()); - const auto& in_tensor_dim = in_tensor.get_dimensions(); - const auto& out_tensor_dim = out_tensor.get_dimensions(); - - assert(get_size_from_dims(in_tensor_dim) == get_size_from_dims(out_tensor_dim)); - assert(in_tensor_dim.size() == 2 && out_tensor_dim.size() == 2); - assert(in_tensor_dim[0] == out_tensor_dim[0]); - assert(in_tensor_dim[1] == out_tensor_dim[1]); - - HCTR_LIB_THROW(cudnnCreateTensorDescriptor(&in_out_desc_)); - - size_t num_feature = in_tensor_dim[1]; - int batch_size = in_tensor_dim[0]; - - cudnnDataType_t data_type = std::is_same::value ? 
CUDNN_DATA_HALF : CUDNN_DATA_FLOAT; - int n_stride = num_feature; - int w_stride = 1; - - HCTR_LIB_THROW(cudnnSetTensor4dDescriptorEx(in_out_desc_, data_type, batch_size, 1, 1, - num_feature, n_stride, 1, 1, w_stride)); - - in_tensors_.push_back(in_tensor); - out_tensors_.push_back(out_tensor); - - HCTR_LIB_THROW(cudnnCreateTensorDescriptor(&gamma_beta_desc_)); - - HCTR_LIB_THROW(cudnnDeriveBNTensorDescriptor(gamma_beta_desc_, in_out_desc_, mode_)); - - std::vector gamma_dim = {num_feature, 1}; - - // gamma & beta - this->set_weight(0, gamma_dim); - this->set_weight(1, gamma_dim); - - gamma_ = this->get_weight(0); - beta_ = this->get_weight(1); - // gamma grad & beta grad - this->set_wgrad(0, gamma_dim); - this->set_wgrad(1, gamma_dim); - gamma_grad_ = this->get_wgrad(0); - beta_grad_ = this->get_wgrad(1); - - // result running mean & var - blob_buff->reserve(gamma_dim, &result_running_mean_); - blob_buff->reserve(gamma_dim, &result_running_var_); - - // save running mean & var (cache) - blob_buff->reserve(gamma_dim, &result_save_mean_); - blob_buff->reserve(gamma_dim, &result_save_inv_var_); -} - -template -BatchNormLayer::~BatchNormLayer() { - try { - HCTR_LIB_THROW(cudnnDestroyTensorDescriptor(in_out_desc_)); - HCTR_LIB_THROW(cudnnDestroyTensorDescriptor(gamma_beta_desc_)); - } catch (const std::runtime_error& rt_err) { - HCTR_LOG_S(ERROR, WORLD) << rt_err.what() << std::endl; - } -} - -template -void BatchNormLayer::initialize() { - // host array to get running mean & var - - size_t num_feature = in_tensors_[0].get_dimensions()[1]; - - std::shared_ptr> internal_host_buf = - GeneralBuffer2::create(); - - internal_host_buf->reserve({num_feature}, &h_result_running_mean_); - internal_host_buf->reserve({num_feature}, &h_result_running_var_); - - internal_host_buf->allocate(); -} - -template -void BatchNormLayer::fprop(bool is_train) { - CudaDeviceContext context(this->get_device_id()); - float one = 1.0f, zero = 0.0f; - - Tensor2& in_tensor = in_tensors_[0]; - Tensor2& out_tensor = out_tensors_[0]; - T* in = in_tensor.get_ptr(); - T* out = out_tensor.get_ptr(); - - float* gamma = gamma_.get_ptr(); - float* beta = beta_.get_ptr(); - - float* result_running_mean = result_running_mean_.get_ptr(); - float* result_running_var = result_running_var_.get_ptr(); - float* result_save_mean = result_save_mean_.get_ptr(); - float* result_save_inv_var = result_save_inv_var_.get_ptr(); - - if (is_train) { - HCTR_LIB_THROW(cudnnBatchNormalizationForwardTraining( - this->get_gpu().get_cudnn_handle(), mode_, &one, &zero, in_out_desc_, in, in_out_desc_, out, - gamma_beta_desc_, gamma, beta, params_.factor, result_running_mean, result_running_var, - params_.eps, result_save_mean, result_save_inv_var)); - } else { - HCTR_LIB_THROW(cudnnBatchNormalizationForwardInference( - this->get_gpu().get_cudnn_handle(), mode_, &one, &zero, in_out_desc_, in, in_out_desc_, out, - gamma_beta_desc_, gamma, beta, result_running_mean, result_running_var, params_.eps)); - } -} - -template -void BatchNormLayer::bprop() { - CudaDeviceContext context(this->get_device_id()); - - float one = 1.0f, zero = 0.0f; - - Tensor2& in_tensor = in_tensors_[0]; - Tensor2& out_tensor = out_tensors_[0]; - T* in = in_tensor.get_ptr(); - T* out = out_tensor.get_ptr(); - - float* gamma = gamma_.get_ptr(); - - float* gamma_grad = gamma_grad_.get_ptr(); - float* beta_grad = beta_grad_.get_ptr(); - - float* result_save_mean = result_save_mean_.get_ptr(); - float* result_save_inv_var = result_save_inv_var_.get_ptr(); - - 
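[Annotation, not part of the patch] Both the removed constructor above and the retained core23-based one later in this file describe the 2-D activation [batch_size, num_feature] to cuDNN as a 4-D NCHW tensor with C = H = 1; that is what makes CUDNN_BATCHNORM_PER_ACTIVATION normalize each feature independently and gives gamma/beta a length of num_feature. A standalone sketch of that descriptor setup with made-up sizes (error checking omitted):

#include <cudnn.h>

void describe_batchnorm_tensors(int batch_size, int num_feature) {
  cudnnTensorDescriptor_t io_desc, scale_bias_desc;
  cudnnCreateTensorDescriptor(&io_desc);
  cudnnCreateTensorDescriptor(&scale_bias_desc);
  // Row-major [batch_size, num_feature] viewed as NCHW with N=batch_size, C=1, H=1, W=num_feature.
  cudnnSetTensor4dDescriptorEx(io_desc, CUDNN_DATA_FLOAT, batch_size, /*c=*/1, /*h=*/1,
                               /*w=*/num_feature,
                               /*nStride=*/num_feature, /*cStride=*/1, /*hStride=*/1, /*wStride=*/1);
  // Per-activation mode derives a [1, 1, 1, num_feature] descriptor for gamma/beta/mean/var.
  cudnnDeriveBNTensorDescriptor(scale_bias_desc, io_desc, CUDNN_BATCHNORM_PER_ACTIVATION);
  cudnnDestroyTensorDescriptor(scale_bias_desc);
  cudnnDestroyTensorDescriptor(io_desc);
}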
HCTR_LIB_THROW(cudnnBatchNormalizationBackward( - this->get_gpu().get_cudnn_handle(), mode_, &one, &zero, &one, &zero, in_out_desc_, in, - in_out_desc_, out, in_out_desc_, in, gamma_beta_desc_, gamma, gamma_grad, beta_grad, - params_.eps, result_save_mean, result_save_inv_var)); -} - -template -std::string BatchNormLayer::get_no_trained_params_in_string() { - float* d_result_running_mean = result_running_mean_.get_ptr(); - float* d_result_running_var = result_running_var_.get_ptr(); - size_t n_byte = result_running_mean_.get_size_in_bytes(); - size_t n_elem = n_byte / sizeof(T); - - HCTR_LIB_THROW(cudaMemcpy(h_result_running_mean_.get_ptr(), d_result_running_mean, n_byte, - cudaMemcpyDeviceToHost)); - HCTR_LIB_THROW(cudaMemcpy(h_result_running_var_.get_ptr(), d_result_running_var, n_byte, - cudaMemcpyDeviceToHost)); - - std::string result = " \"type\": \"BatchNorm\",\n"; - result += " \"mean\": ["; - for (size_t i = 0; i < n_elem; i++) { - result += std::to_string(ToStringType(h_result_running_mean_.get_ptr()[i])); - if (i != (n_elem - 1)) result += ", "; - } - result += "],\n"; - - result += " \"var\": ["; - for (size_t i = 0; i < n_elem; i++) { - result += std::to_string(ToStringType(h_result_running_var_.get_ptr()[i])); - if (i != (n_elem - 1)) result += ", "; - } - result += "]"; - - return result; -} - -template -std::vector BatchNormLayer::get_tensors_for_non_trainable_params() { - std::vector tensors; - tensors.push_back(result_running_mean_.shrink()); - tensors.push_back(result_running_var_.shrink()); - return tensors; -} - -template -std::unique_ptr BatchNormLayer::get_default_initializer(const int index) { - std::unique_ptr simu; - if (0 == index) { - simu.reset(new ConstantDataSimulator(1.0f)); - } else if (1 == index) { - simu.reset(new ConstantDataSimulator(0.0f)); - } else { - HCTR_OWN_THROW(Error_t::OutOfBound, "index != {0, 1}."); - } - return simu; -} - -template class BatchNormLayer; -template class BatchNormLayer<__half>; - -template -Core23TempBatchNormLayer::Core23TempBatchNormLayer( - const core23::Tensor& in_tensor, const core23::Tensor& out_tensor, const Params& params, - const std::shared_ptr& gpu_resource, std::vector initializer_types) : Base({in_tensor}, {out_tensor}, gpu_resource, initializer_types), params_(params), mode_(CUDNN_BATCHNORM_PER_ACTIVATION) { @@ -303,7 +107,7 @@ Core23TempBatchNormLayer::Core23TempBatchNormLayer( } template -Core23TempBatchNormLayer::~Core23TempBatchNormLayer() { +BatchNormLayer::~BatchNormLayer() { try { HCTR_LIB_THROW(cudnnDestroyTensorDescriptor(in_out_desc_)); HCTR_LIB_THROW(cudnnDestroyTensorDescriptor(gamma_beta_desc_)); @@ -313,7 +117,7 @@ Core23TempBatchNormLayer::~Core23TempBatchNormLayer() { } template -void Core23TempBatchNormLayer::initialize() { +void BatchNormLayer::initialize() { // host array to get running mean & var int64_t num_feature = this->input_tensors_[0].shape().size(1); @@ -330,7 +134,7 @@ void Core23TempBatchNormLayer::initialize() { } template -void Core23TempBatchNormLayer::fprop(bool is_train) { +void BatchNormLayer::fprop(bool is_train) { CudaDeviceContext context(this->get_device_id()); float one = 1.0f, zero = 0.0f; @@ -360,7 +164,7 @@ void Core23TempBatchNormLayer::fprop(bool is_train) { } template -void Core23TempBatchNormLayer::bprop() { +void BatchNormLayer::bprop() { CudaDeviceContext context(this->get_device_id()); float one = 1.0f, zero = 0.0f; @@ -385,7 +189,7 @@ void Core23TempBatchNormLayer::bprop() { } template -std::string Core23TempBatchNormLayer::get_no_trained_params_in_string() { 
+std::string BatchNormLayer::get_no_trained_params_in_string() { float* d_result_running_mean = result_running_mean_.data(); float* d_result_running_var = result_running_var_.data(); int64_t n_byte = result_running_mean_.num_bytes(); @@ -416,13 +220,12 @@ std::string Core23TempBatchNormLayer::get_no_trained_params_in_string() { } template -std::vector Core23TempBatchNormLayer::get_non_trainable_params_as_tensors() { +std::vector BatchNormLayer::get_non_trainable_params_as_tensors() { return {result_running_mean_, result_running_var_}; } template -std::unique_ptr Core23TempBatchNormLayer::get_default_initializer( - const int index) { +std::unique_ptr BatchNormLayer::get_default_initializer(const int index) { std::unique_ptr simu; if (0 == index) { simu.reset(new ConstantDataSimulator(1.0f)); @@ -434,6 +237,6 @@ std::unique_ptr Core23TempBatchNormLayer::get_default_initiali return simu; } -template class Core23TempBatchNormLayer; -template class Core23TempBatchNormLayer<__half>; +template class BatchNormLayer; +template class BatchNormLayer<__half>; } // namespace HugeCTR diff --git a/HugeCTR/src/layers/fully_connected_layer.cu b/HugeCTR/src/layers/fully_connected_layer.cu index 62847d3779..a21ce3fe84 100644 --- a/HugeCTR/src/layers/fully_connected_layer.cu +++ b/HugeCTR/src/layers/fully_connected_layer.cu @@ -53,353 +53,12 @@ void add_bias(float* data, const float* bias, const int m, const int n, bool row } // namespace -FullyConnectedLayer::FullyConnectedLayer( - const std::shared_ptr>& weight_buff, - const std::shared_ptr>& wgrad_buff, const Tensor2& in_tensor, - const Tensor2& out_tensor, const std::shared_ptr& gpu_resource, - bool use_mixed_precision, bool enable_tf32_compute, - std::vector initializer_types) - : TrainableLayer(weight_buff, weight_buff, wgrad_buff, gpu_resource, initializer_types), - use_mixed_precision_(use_mixed_precision), - enable_tf32_compute_(enable_tf32_compute) { - try { - // check the in_tensor and out_tensor - const auto& in_tensor_dim = in_tensor.get_dimensions(); - const auto& out_tensor_dim = out_tensor.get_dimensions(); - // 1. input and output have the same dim - if (in_tensor_dim.size() != out_tensor_dim.size()) { - HCTR_OWN_THROW(Error_t::WrongInput, "input and output tensor don't have same dimensions"); - } - // 2. dim match? - size_t in_batch_size = 1; - size_t out_batch_size = 1; - size_t input_size = in_tensor_dim[in_tensor_dim.size() - 1]; - size_t output_size = out_tensor_dim[out_tensor_dim.size() - 1]; - - for (size_t idx = 0; idx < in_tensor_dim.size() - 1; idx++) { - in_batch_size = in_batch_size * in_tensor_dim[idx]; - out_batch_size = out_batch_size * out_tensor_dim[idx]; - } - - if (in_batch_size != out_batch_size) { - HCTR_OWN_THROW(Error_t::WrongInput, "size of input / output tensor doesn't match"); - } - - std::vector weight_dim = {input_size, output_size}; - std::vector bias_dim = {1, output_size}; - - this->set_weight(0, weight_dim); - this->set_weight(1, bias_dim); - this->set_wgrad(0, weight_dim); - this->set_wgrad(1, bias_dim); - - in_tensors_.push_back(in_tensor); - out_tensors_.push_back(out_tensor); - // Where should we create this cuBLAS handle? 
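[Annotation, not part of the patch] The constructor above flattens every leading dimension into one batch size (an input of, say, [batch, slots, input_size] is treated as [batch*slots, input_size]), and the fprop that follows realizes out = in * W with a single cublasGemmEx: since a row-major [rows, cols] buffer is the same memory as a column-major [cols, rows] matrix, the call asks cuBLAS for W^T(output_size x input_size) times in^T(input_size x batch) with m = output_size, n = batch, k = input_size. A self-contained sketch of that row-major trick in plain cuBLAS (sizes, buffer names, and the default algorithm are illustrative; the bias term, which the patch adds via a separate add_bias kernel, is omitted; error checks omitted):

#include <cublas_v2.h>
#include <cuda_runtime.h>

// Computes row-major out[batch x n] = in[batch x k] * w[k x n] in FP32.
void dense_forward(cublasHandle_t handle, const float* d_in, const float* d_w, float* d_out,
                   int batch, int k, int n) {
  const float alpha = 1.0f, beta = 0.0f;
  // Column-major view: out^T(n x batch) = w^T(n x k) * in^T(k x batch).
  cublasGemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_N,
               /*m=*/n, /*n=*/batch, /*k=*/k,
               &alpha, d_w, CUDA_R_32F, /*lda=*/n,
               d_in, CUDA_R_32F, /*ldb=*/k,
               &beta, d_out, CUDA_R_32F, /*ldc=*/n,
               CUBLAS_COMPUTE_32F, CUBLAS_GEMM_DEFAULT);
}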
- } catch (const std::runtime_error& rt_err) { - HCTR_LOG_S(ERROR, WORLD) << rt_err.what() << std::endl; - throw; - } -} - -void FullyConnectedLayer::fprop(bool is_train) { - CudaDeviceContext context(get_device_id()); - - Tensor2& in_tensor = get_in_tensors(is_train)[0]; - Tensor2& out_tensor = out_tensors_[0]; - - float* weight = this->get_weight(0).get_ptr(); - float* bias = this->get_weight(1).get_ptr(); - float* in = in_tensor.get_ptr(); - float* out = out_tensor.get_ptr(); - - const auto& in_tensor_dim = in_tensor.get_dimensions(); - const auto& out_tensor_dim = out_tensor.get_dimensions(); - - size_t in_batch_size = 1; - size_t input_size = in_tensor_dim[in_tensor_dim.size() - 1]; - size_t output_size = out_tensor_dim[out_tensor_dim.size() - 1]; - - for (size_t idx = 0; idx < in_tensor_dim.size() - 1; idx++) { - in_batch_size = in_batch_size * in_tensor_dim[idx]; - } - - float alpha = 1.0f, beta = 0.0f; - - const cublasComputeType_t compute_type = - enable_tf32_compute_ ? CUBLAS_COMPUTE_32F_FAST_TF32 : CUBLAS_COMPUTE_32F; - - HCTR_LIB_THROW(cublasGemmEx(get_gpu().get_cublas_handle(), CUBLAS_OP_N, CUBLAS_OP_N, output_size, - in_batch_size, input_size, &alpha, weight, CUDA_R_32F, output_size, - in, CUDA_R_32F, input_size, &beta, out, CUDA_R_32F, output_size, - compute_type, falgo_)); - add_bias(out, bias, in_batch_size, output_size, true, get_gpu().get_stream()); -} - -void FullyConnectedLayer::bprop() { - CudaDeviceContext context(get_device_id()); - - Tensor2& in_tensor = get_in_tensors(true)[0]; - Tensor2& out_tensor = out_tensors_[0]; - - float* wgrad = this->get_wgrad(0).get_ptr(); - float* bias_grad = this->get_wgrad(1).get_ptr(); - float* weight = this->get_weight(0).get_ptr(); - float* in = in_tensor.get_ptr(); - float* out = out_tensor.get_ptr(); - - const auto& in_tensor_dim = in_tensor.get_dimensions(); - const auto& out_tensor_dim = out_tensor.get_dimensions(); - - size_t in_batch_size = 1; - size_t input_size = in_tensor_dim[in_tensor_dim.size() - 1]; - size_t output_size = out_tensor_dim[out_tensor_dim.size() - 1]; - - for (size_t idx = 0; idx < in_tensor_dim.size() - 1; idx++) { - in_batch_size = in_batch_size * in_tensor_dim[idx]; - } - - float alpha = 1.0f, beta_w = 1.0f, beta_x = 0.0f; - - const cublasComputeType_t compute_type = - enable_tf32_compute_ ? 
CUBLAS_COMPUTE_32F_FAST_TF32 : CUBLAS_COMPUTE_32F; - - // gradient respect to W - HCTR_LIB_THROW(cublasGemmEx(get_gpu().get_cublas_handle(), CUBLAS_OP_N, CUBLAS_OP_T, output_size, - input_size, in_batch_size, &alpha, out, CUDA_R_32F, output_size, in, - CUDA_R_32F, input_size, &beta_w, wgrad, CUDA_R_32F, output_size, - compute_type, balgo_W_)); - // gradient respect to Xn - HCTR_LIB_THROW(cublasGemmEx(get_gpu().get_cublas_handle(), CUBLAS_OP_T, CUBLAS_OP_N, input_size, - in_batch_size, output_size, &alpha, weight, CUDA_R_32F, output_size, - out, CUDA_R_32F, output_size, &beta_x, in, CUDA_R_32F, input_size, - compute_type, balgo_Xn_)); - MLCommon::LinAlg::reduce(bias_grad, out, in_batch_size, output_size, float(0), false, true, - get_gpu().get_stream(), true); -} - -void FullyConnectedLayer::search_algorithm() { - // Set to the CUDA device where this layer assigned to - CudaDeviceContext context(get_device_id()); - - const int repeat_num = 100; - - // Device Tensors to be used - Tensor2& in_tensor = get_in_tensors(true)[0]; - Tensor2& out_tensor = out_tensors_[0]; - float* weight = this->get_weight(0).get_ptr(); - float* in = in_tensor.get_ptr(); - float* out = out_tensor.get_ptr(); - float* wgrad = this->get_wgrad(0).get_ptr(); - - // Tensor dim - const auto& in_tensor_dim = in_tensor.get_dimensions(); - const auto& out_tensor_dim = out_tensor.get_dimensions(); - - size_t in_batch_size = 1; - size_t out_batch_size = 1; - size_t input_size = in_tensor_dim[in_tensor_dim.size() - 1]; - size_t output_size = out_tensor_dim[out_tensor_dim.size() - 1]; - - for (size_t idx = 0; idx < in_tensor_dim.size() - 1; idx++) { - in_batch_size = in_batch_size * in_tensor_dim[idx]; - out_batch_size = out_batch_size * out_tensor_dim[idx]; - } - - // Record time for each algorithm - float shortestTime = 100000000.0; - float time; - cudaEvent_t start, stop; - HCTR_LIB_THROW(cudaEventCreate(&start)); - HCTR_LIB_THROW(cudaEventCreate(&stop)); - - // cublas ret status - cublasStatus_t status; - - // Start, end for search - int startAlgo, endAlgo; - if (use_mixed_precision_) { - startAlgo = CUBLAS_GEMM_DEFAULT_TENSOR_OP; - endAlgo = CUBLAS_GEMM_ALGO15_TENSOR_OP; - } else { - startAlgo = CUBLAS_GEMM_DEFAULT; - endAlgo = CUBLAS_GEMM_ALGO23; - } - - const cublasComputeType_t compute_type = - enable_tf32_compute_ ? 
CUBLAS_COMPUTE_32F_FAST_TF32 : CUBLAS_COMPUTE_32F; - - // Search all the algorithm for fprop - for (int testAlgo = startAlgo; testAlgo <= endAlgo; testAlgo++) { - float alpha = 1.0f, beta = 0.0f; - - // Record start event - HCTR_LIB_THROW(cudaEventRecord(start, get_gpu().get_stream())); - for (int i = 0; i < repeat_num; ++i) { - status = cublasGemmEx(get_gpu().get_cublas_handle(), CUBLAS_OP_N, CUBLAS_OP_N, output_size, - in_batch_size, input_size, &alpha, weight, CUDA_R_32F, output_size, in, - CUDA_R_32F, input_size, &beta, out, CUDA_R_32F, output_size, - compute_type, static_cast(testAlgo)); - } - HCTR_LIB_THROW(cudaEventRecord(stop, get_gpu().get_stream())); - HCTR_LIB_THROW(cudaEventSynchronize(stop)); - HCTR_LIB_THROW(cudaEventElapsedTime(&time, start, stop)); - // Avg Time(ms) for this algorithm for fprop GEMM - time = time / repeat_num; - // Skip if the algorithm is supported for fprop configuration - if (status != CUBLAS_STATUS_SUCCESS) { - // HCTR_LOG(INFO, WORLD, "The algorithms %d is not supported for fprop, skipped.\n", - // testAlgo); - continue; - } - // Record the optimal time and algorithm - if (time < shortestTime) { - shortestTime = time; - falgo_ = static_cast(testAlgo); - } - } - - // Reset shortestTime - shortestTime = 100000000.0; - - // Search all the algorithm for bprop_W - for (int testAlgo = startAlgo; testAlgo <= endAlgo; testAlgo++) { - float alpha = 1.0f, beta_w = 1.0f; - - // Record start event - HCTR_LIB_THROW(cudaEventRecord(start, get_gpu().get_stream())); - for (int i = 0; i < repeat_num; ++i) { - status = cublasGemmEx(get_gpu().get_cublas_handle(), CUBLAS_OP_N, CUBLAS_OP_T, output_size, - input_size, in_batch_size, &alpha, out, CUDA_R_32F, output_size, in, - CUDA_R_32F, input_size, &beta_w, wgrad, CUDA_R_32F, output_size, - compute_type, static_cast(testAlgo)); - } - HCTR_LIB_THROW(cudaEventRecord(stop, get_gpu().get_stream())); - HCTR_LIB_THROW(cudaEventSynchronize(stop)); - HCTR_LIB_THROW(cudaEventElapsedTime(&time, start, stop)); - // Avg Time(ms) for this algorithm for fprop GEMM - time = time / repeat_num; - // Skip if the algorithm is supported for fprop configuration - if (status != CUBLAS_STATUS_SUCCESS) { - // HCTR_LOG(INFO, WORLD, "The algorithms %d is not supported for bprop_W, skipped.\n", - // testAlgo); - continue; - } - // Record the optimal time and algorithm - if (time < shortestTime) { - shortestTime = time; - balgo_W_ = static_cast(testAlgo); - } - } - - // Reset shortestTime - shortestTime = 100000000.0; - - // Search all the algorithm for bprop_Xn - for (int testAlgo = startAlgo; testAlgo <= endAlgo; testAlgo++) { - float alpha = 1.0f, beta_x = 0.0f; - - // Record start event - HCTR_LIB_THROW(cudaEventRecord(start, get_gpu().get_stream())); - for (int i = 0; i < repeat_num; ++i) { - status = cublasGemmEx(get_gpu().get_cublas_handle(), CUBLAS_OP_T, CUBLAS_OP_N, input_size, - in_batch_size, output_size, &alpha, weight, CUDA_R_32F, output_size, - out, CUDA_R_32F, output_size, &beta_x, in, CUDA_R_32F, input_size, - compute_type, static_cast(testAlgo)); - } - HCTR_LIB_THROW(cudaEventRecord(stop, get_gpu().get_stream())); - HCTR_LIB_THROW(cudaEventSynchronize(stop)); - HCTR_LIB_THROW(cudaEventElapsedTime(&time, start, stop)); - // Avg Time(ms) for this algorithm for fprop GEMM - time = time / repeat_num; - // Skip if the algorithm is supported for fprop configuration - if (status != CUBLAS_STATUS_SUCCESS) { - // HCTR_LOG(INFO, WORLD, "The algorithms %d is not supported for bprop_Xn, skipped.\n", - // testAlgo); - continue; - } - // Record 
the optimal time and algorithm - if (time < shortestTime) { - shortestTime = time; - balgo_Xn_ = static_cast(testAlgo); - } - } - - // Print selection information - // HCTR_LOG(INFO, WORLD, "The algorithm selection for fprop, bprop_W and bprop_Xn are: %d, %d and - // %d.\n", - // (int)falgo_, (int)balgo_W_, (int)balgo_Xn_); - - // Output msg - // HCTR_LOG(INFO, ROOT, "The fully-connected layer has finished choosing the algorithm for cublas - // Gemm.\n"); Clean-up - HCTR_LIB_THROW(cudaEventDestroy(start)); - HCTR_LIB_THROW(cudaEventDestroy(stop)); -} - -std::unique_ptr FullyConnectedLayer::get_uniform_initializer( - const int index) { - const Tensor2& in_tensor = get_in_tensors(true)[0]; - const Tensor2& out_tensor = out_tensors_[0]; - float bottom_dim = in_tensor.get_dimensions()[in_tensor.get_dimensions().size() - 1]; - float top_dim = out_tensor.get_dimensions()[out_tensor.get_dimensions().size() - 1]; - - float limit = 1.0f / ((0 == index ? bottom_dim : 0) + top_dim); - return std::make_unique(-1 * limit, limit); -} - -std::unique_ptr FullyConnectedLayer::get_xavier_uniform_initializer( - const int index) { - const Tensor2& in_tensor = get_in_tensors(true)[0]; - const Tensor2& out_tensor = out_tensors_[0]; - float bottom_dim = in_tensor.get_dimensions()[in_tensor.get_dimensions().size() - 1]; - float top_dim = out_tensor.get_dimensions()[out_tensor.get_dimensions().size() - 1]; - - return std::make_unique(1.f, data_simu::Mode_t::Fan_avg, - data_simu::Distribution_t::Uniform, - 0 == index ? bottom_dim : 0, top_dim); -} - -std::unique_ptr FullyConnectedLayer::get_xavier_norm_initializer( - const int index) { - const Tensor2& in_tensor = get_in_tensors(true)[0]; - const Tensor2& out_tensor = out_tensors_[0]; - float bottom_dim = in_tensor.get_dimensions()[in_tensor.get_dimensions().size() - 1]; - float top_dim = out_tensor.get_dimensions()[out_tensor.get_dimensions().size() - 1]; - - return std::make_unique(1.f, data_simu::Mode_t::Fan_avg, - data_simu::Distribution_t::Norm, - 0 == index ? 
bottom_dim : 0, top_dim); -} - -std::unique_ptr FullyConnectedLayer::get_default_initializer( - const int index) { - const Tensor2& in_tensor = get_in_tensors(true)[0]; - const Tensor2& out_tensor = out_tensors_[0]; - float bottom_dim = in_tensor.get_dimensions()[in_tensor.get_dimensions().size() - 1]; - float top_dim = out_tensor.get_dimensions()[out_tensor.get_dimensions().size() - 1]; - - std::unique_ptr simu(nullptr); - if (0 == index) { - simu.reset(new VarianceScalingSimulator(1.f, data_simu::Mode_t::Fan_avg, - data_simu::Distribution_t::Norm, bottom_dim, top_dim)); - } else if (1 == index) { - float stddev = sqrt(1.f / top_dim); - simu.reset(new GaussianDataSimulator(0, stddev, -2 * stddev, 2 * stddev)); - } else { - HCTR_OWN_THROW(Error_t::OutOfBound, "index != {0, 1}."); - } - - return simu; -} - -template class FullyConnectedLayer; - -Core23TempFullyConnectedLayer::Core23TempFullyConnectedLayer( - const core23::Tensor& in_tensor, const core23::Tensor& out_tensor, - const std::shared_ptr& gpu_resource, bool use_mixed_precision, - bool enable_tf32_compute, std::vector initializer_types) - : Core23TempTrainableLayer({in_tensor}, {out_tensor}, gpu_resource, initializer_types), +FullyConnectedLayer::FullyConnectedLayer(const core23::Tensor& in_tensor, + const core23::Tensor& out_tensor, + const std::shared_ptr& gpu_resource, + bool use_mixed_precision, bool enable_tf32_compute, + std::vector initializer_types) + : TrainableLayer({in_tensor}, {out_tensor}, gpu_resource, initializer_types), use_mixed_precision_(use_mixed_precision), enable_tf32_compute_(enable_tf32_compute) { try { @@ -439,7 +98,7 @@ Core23TempFullyConnectedLayer::Core23TempFullyConnectedLayer( } } -void Core23TempFullyConnectedLayer::fprop(bool is_train) { +void FullyConnectedLayer::fprop(bool is_train) { CudaDeviceContext context(get_device_id()); core23::Tensor& in_tensor = get_in_tensors(is_train)[0]; @@ -473,7 +132,7 @@ void Core23TempFullyConnectedLayer::fprop(bool is_train) { add_bias(out, bias, in_batch_size, output_size, true, get_gpu().get_stream()); } -void Core23TempFullyConnectedLayer::bprop() { +void FullyConnectedLayer::bprop() { CudaDeviceContext context(get_device_id()); core23::Tensor& in_tensor = get_in_tensors(true)[0]; @@ -515,7 +174,7 @@ void Core23TempFullyConnectedLayer::bprop() { get_gpu().get_stream(), true); } -void Core23TempFullyConnectedLayer::search_algorithm() { +void FullyConnectedLayer::search_algorithm() { // Set to the CUDA device where this layer assigned to CudaDeviceContext context(get_device_id()); @@ -674,7 +333,7 @@ void Core23TempFullyConnectedLayer::search_algorithm() { HCTR_LIB_THROW(cudaEventDestroy(stop)); } -std::unique_ptr Core23TempFullyConnectedLayer::get_uniform_initializer( +std::unique_ptr FullyConnectedLayer::get_uniform_initializer( const int index) { const core23::Tensor& in_tensor = get_in_tensors(true)[0]; const core23::Tensor& out_tensor = this->output_tensors_[0]; @@ -685,7 +344,7 @@ std::unique_ptr Core23TempFullyConnectedLayer::get_uniform return std::make_unique(-1 * limit, limit); } -std::unique_ptr Core23TempFullyConnectedLayer::get_xavier_uniform_initializer( +std::unique_ptr FullyConnectedLayer::get_xavier_uniform_initializer( const int index) { const core23::Tensor& in_tensor = get_in_tensors(true)[0]; const core23::Tensor& out_tensor = this->output_tensors_[0]; @@ -697,7 +356,7 @@ std::unique_ptr Core23TempFullyConnectedLayer::get_xavier_ 0 == index ? 
bottom_dim : 0, top_dim); } -std::unique_ptr Core23TempFullyConnectedLayer::get_xavier_norm_initializer( +std::unique_ptr FullyConnectedLayer::get_xavier_norm_initializer( const int index) { const core23::Tensor& in_tensor = get_in_tensors(true)[0]; const core23::Tensor& out_tensor = this->output_tensors_[0]; @@ -709,7 +368,7 @@ std::unique_ptr Core23TempFullyConnectedLayer::get_xavier_ 0 == index ? bottom_dim : 0, top_dim); } -std::unique_ptr Core23TempFullyConnectedLayer::get_default_initializer( +std::unique_ptr FullyConnectedLayer::get_default_initializer( const int index) { const core23::Tensor& in_tensor = get_in_tensors(true)[0]; const core23::Tensor& out_tensor = this->output_tensors_[0]; @@ -730,6 +389,6 @@ std::unique_ptr Core23TempFullyConnectedLayer::get_default return simu; } -template class Core23TempFullyConnectedLayer; +template class FullyConnectedLayer; } // namespace HugeCTR diff --git a/HugeCTR/src/layers/fully_connected_layer_half.cu b/HugeCTR/src/layers/fully_connected_layer_half.cu index 462fd319a3..28bfd821a2 100644 --- a/HugeCTR/src/layers/fully_connected_layer_half.cu +++ b/HugeCTR/src/layers/fully_connected_layer_half.cu @@ -20,437 +20,11 @@ namespace HugeCTR { -FullyConnectedLayer<__half>::FullyConnectedLayer( - const std::shared_ptr>& master_weights_buff, - const std::shared_ptr>& weights_buff, - const std::shared_ptr>& weights_grad_buff, - const std::shared_ptr>& blobs_buff, - const Tensor2<__half>& bottom_tensor, const Tensor2<__half>& top_tensor, - const std::shared_ptr& gpu_resource, std::vector initializer_types) - : TrainableLayer<__half>(master_weights_buff, weights_buff, weights_grad_buff, gpu_resource, - initializer_types), - falgo_b_(CUBLAS_GEMM_DEFAULT_TENSOR_OP), - falgo_k_(CUBLAS_GEMM_DEFAULT_TENSOR_OP), - balgo_b_(CUBLAS_GEMM_DEFAULT_TENSOR_OP), - balgo_k_(CUBLAS_GEMM_DEFAULT_TENSOR_OP), - balgo_x_(CUBLAS_GEMM_DEFAULT_TENSOR_OP) { - const auto& bottom_tensor_dim = bottom_tensor.get_dimensions(); - const auto& top_tensor_dim = top_tensor.get_dimensions(); - - if (bottom_tensor_dim.size() != top_tensor_dim.size()) { - HCTR_OWN_THROW(Error_t::WrongInput, "input or output tensor don't have same dimensions"); - } - size_t in_batch_size = 1; - size_t out_batch_size = 1; - size_t input_size = bottom_tensor_dim[bottom_tensor_dim.size() - 1]; - size_t output_size = top_tensor_dim[top_tensor_dim.size() - 1]; - - for (size_t idx = 0; idx < bottom_tensor_dim.size() - 1; idx++) { - in_batch_size = in_batch_size * bottom_tensor_dim[idx]; - out_batch_size = out_batch_size * top_tensor_dim[idx]; - } - - if (in_batch_size != out_batch_size) { - HCTR_OWN_THROW(Error_t::WrongInput, "size of input / output tensor doesn't match"); - } - - std::vector kernel_dim = {input_size, output_size}; - std::vector bias_dim = {1, output_size}; - std::vector identity_dim = {1, in_batch_size}; - - this->set_weight(0, kernel_dim); - this->set_weight(1, bias_dim); - this->set_wgrad(0, kernel_dim); - this->set_wgrad(1, bias_dim); - - blobs_buff->reserve(identity_dim, &identity_tensor_); - - bottom_tensor_ = bottom_tensor; - top_tensor_ = top_tensor; -} - -void FullyConnectedLayer<__half>::fprop(bool is_train) { - CudaDeviceContext context(get_device_id()); - - const __half* kernel = this->get_weight(0).get_ptr(); - const __half* bias = this->get_weight(1).get_ptr(); - const __half* bottom = get_bottom_tensor(is_train).get_ptr(); - const __half* identity = identity_tensor_.get_ptr(); - __half* top = top_tensor_.get_ptr(); - - const auto& bottom_tensor_dim = 
get_bottom_tensor(is_train).get_dimensions(); - const auto& top_tensor_dim = top_tensor_.get_dimensions(); - - size_t in_batch_size = 1; - size_t input_size = bottom_tensor_dim[bottom_tensor_dim.size() - 1]; - size_t output_size = top_tensor_dim[top_tensor_dim.size() - 1]; - - for (size_t idx = 0; idx < bottom_tensor_dim.size() - 1; idx++) { - in_batch_size = in_batch_size * bottom_tensor_dim[idx]; - } - - const float alpha = 1.0f; - const float beta_b = 0.0f; - const float beta_k = 1.0f; - - HCTR_LIB_THROW(cublasGemmEx(get_gpu().get_cublas_handle(), CUBLAS_OP_N, CUBLAS_OP_N, output_size, - in_batch_size, 1, &alpha, bias, CUDA_R_16F, output_size, identity, - CUDA_R_16F, 1, &beta_b, top, CUDA_R_16F, output_size, CUDA_R_32F, - falgo_b_)); - - HCTR_LIB_THROW(cublasGemmEx(get_gpu().get_cublas_handle(), CUBLAS_OP_N, CUBLAS_OP_N, output_size, - in_batch_size, input_size, &alpha, kernel, CUDA_R_16F, output_size, - bottom, CUDA_R_16F, input_size, &beta_k, top, CUDA_R_16F, output_size, - CUDA_R_32F, falgo_k_)); -} - -void FullyConnectedLayer<__half>::bprop() { - CudaDeviceContext context(get_device_id()); - - const __half* kernel = this->get_weight(0).get_ptr(); - const __half* top = top_tensor_.get_ptr(); - const __half* identity = identity_tensor_.get_ptr(); - __half* kernel_grad = this->get_wgrad(0).get_ptr(); - __half* bias_grad = this->get_wgrad(1).get_ptr(); - __half* bottom = get_bottom_tensor(true).get_ptr(); - - const auto& bottom_tensor_dim = get_bottom_tensor(true).get_dimensions(); - const auto& top_tensor_dim = top_tensor_.get_dimensions(); - - size_t in_batch_size = 1; - size_t input_size = bottom_tensor_dim[bottom_tensor_dim.size() - 1]; - size_t output_size = top_tensor_dim[top_tensor_dim.size() - 1]; - - for (size_t idx = 0; idx < bottom_tensor_dim.size() - 1; idx++) { - in_batch_size = in_batch_size * bottom_tensor_dim[idx]; - } - - const float alpha = 1.0f; - const float beta_b = 0.0f; - const float beta_k = 1.0f; - const float beta_x = 0.0f; - - HCTR_LIB_THROW(cublasGemmEx(get_gpu().get_cublas_handle(), CUBLAS_OP_N, CUBLAS_OP_N, output_size, - 1, in_batch_size, &alpha, top, CUDA_R_16F, output_size, identity, - CUDA_R_16F, in_batch_size, &beta_b, bias_grad, CUDA_R_16F, - output_size, CUDA_R_32F, balgo_b_)); - - HCTR_LIB_THROW(cublasGemmEx(get_gpu().get_cublas_handle(), CUBLAS_OP_N, CUBLAS_OP_T, output_size, - input_size, in_batch_size, &alpha, top, CUDA_R_16F, output_size, - bottom, CUDA_R_16F, input_size, &beta_k, kernel_grad, CUDA_R_16F, - output_size, CUDA_R_32F, balgo_k_)); - - HCTR_LIB_THROW(cublasGemmEx(get_gpu().get_cublas_handle(), CUBLAS_OP_T, CUBLAS_OP_N, input_size, - in_batch_size, output_size, &alpha, kernel, CUDA_R_16F, output_size, - top, CUDA_R_16F, output_size, &beta_x, bottom, CUDA_R_16F, input_size, - CUDA_R_32F, balgo_x_)); -} - -void FullyConnectedLayer<__half>::initialize() { - CudaDeviceContext context(get_device_id()); - - __half* identity = identity_tensor_.get_ptr(); - const auto& bottom_tensor_dim = get_bottom_tensor(true).get_dimensions(); - size_t m = 1; - for (size_t idx = 0; idx < bottom_tensor_dim.size() - 1; idx++) { - m = m * bottom_tensor_dim[idx]; - } - // Initialize identity vector - initialize_array<<<(m - 1) / 1024 + 1, 1024, 0, get_gpu().get_stream()>>>(identity, m, - __float2half(1.0f)); -} - -void FullyConnectedLayer<__half>::search_algorithm() { - // Set to the CUDA device where this layer assigned to - CudaDeviceContext context(get_device_id()); - - const size_t repeat_num = 100; - - // Device Tensors to be used - __half* bottom = 
get_bottom_tensor(true).get_ptr(); - __half* top = top_tensor_.get_ptr(); - __half* identity = identity_tensor_.get_ptr(); - __half* kernel = this->get_weight(0).get_ptr(); - __half* bias = this->get_weight(1).get_ptr(); - __half* kernel_grad = this->get_wgrad(0).get_ptr(); - __half* bias_grad = this->get_wgrad(1).get_ptr(); - - // Tensor dim - const auto& bottom_tensor_dim = get_bottom_tensor(true).get_dimensions(); - const auto& top_tensor_dim = top_tensor_.get_dimensions(); - - size_t in_batch_size = 1; - size_t input_size = bottom_tensor_dim[bottom_tensor_dim.size() - 1]; - size_t output_size = top_tensor_dim[top_tensor_dim.size() - 1]; - - for (size_t idx = 0; idx < bottom_tensor_dim.size() - 1; idx++) { - in_batch_size = in_batch_size * bottom_tensor_dim[idx]; - } - - // Record time for each algorithm - float shortestTime = std::numeric_limits::max(); - float time; - cudaEvent_t start, stop; - HCTR_LIB_THROW(cudaEventCreate(&start)); - HCTR_LIB_THROW(cudaEventCreate(&stop)); - - // Start, end for search - const cublasGemmAlgo_t startAlgo = CUBLAS_GEMM_DEFAULT_TENSOR_OP; - const cublasGemmAlgo_t endAlgo = CUBLAS_GEMM_ALGO15_TENSOR_OP; - - // Search all the algorithm for falgo_b_ - for (int testAlgo = startAlgo; testAlgo <= endAlgo; testAlgo++) { - cublasStatus_t status = CUBLAS_STATUS_SUCCESS; - - const float alpha = 1.0f; - const float beta = 0.0f; - - // Record start event - HCTR_LIB_THROW(cudaEventRecord(start, get_gpu().get_stream())); - for (size_t i = 0; i < repeat_num && status == CUBLAS_STATUS_SUCCESS; ++i) { - status = cublasGemmEx(get_gpu().get_cublas_handle(), CUBLAS_OP_N, CUBLAS_OP_N, output_size, - in_batch_size, 1, &alpha, bias, CUDA_R_16F, output_size, identity, - CUDA_R_16F, 1, &beta, top, CUDA_R_16F, output_size, CUDA_R_32F, - static_cast(testAlgo)); - } - HCTR_LIB_THROW(cudaEventRecord(stop, get_gpu().get_stream())); - HCTR_LIB_THROW(cudaEventSynchronize(stop)); - HCTR_LIB_THROW(cudaEventElapsedTime(&time, start, stop)); - // Avg Time(ms) for this algorithm for fprop GEMM - time = time / repeat_num; - // Skip if the algorithm is supported for fprop configuration - if (status != CUBLAS_STATUS_SUCCESS) { - // HCTR_LOG(INFO, WORLD, "The algorithms %d is not supported for fprop_b, skipped.\n", - // testAlgo); - continue; - } - // Record the optimal time and algorithm - if (time < shortestTime) { - shortestTime = time; - falgo_b_ = static_cast(testAlgo); - } - } - - // Reset shortestTime - shortestTime = std::numeric_limits::max(); - - // Search all the algorithm for falgo_k_ - for (int testAlgo = startAlgo; testAlgo <= endAlgo; testAlgo++) { - cublasStatus_t status = CUBLAS_STATUS_SUCCESS; - - const float alpha = 1.0f; - const float beta = 1.0f; - - // Record start event - HCTR_LIB_THROW(cudaEventRecord(start, get_gpu().get_stream())); - for (size_t i = 0; i < repeat_num && status == CUBLAS_STATUS_SUCCESS; ++i) { - status = cublasGemmEx(get_gpu().get_cublas_handle(), CUBLAS_OP_N, CUBLAS_OP_N, output_size, - in_batch_size, input_size, &alpha, kernel, CUDA_R_16F, output_size, - bottom, CUDA_R_16F, input_size, &beta, top, CUDA_R_16F, output_size, - CUDA_R_32F, static_cast(testAlgo)); - } - HCTR_LIB_THROW(cudaEventRecord(stop, get_gpu().get_stream())); - HCTR_LIB_THROW(cudaEventSynchronize(stop)); - HCTR_LIB_THROW(cudaEventElapsedTime(&time, start, stop)); - // Avg Time(ms) for this algorithm for fprop GEMM - time = time / repeat_num; - // Skip if the algorithm is supported for fprop configuration - if (status != CUBLAS_STATUS_SUCCESS) { - // HCTR_LOG(INFO, WORLD, "The 
algorithms %d is not supported for fprop, skipped.\n", - // testAlgo); - continue; - } - // Record the optimal time and algorithm - if (time < shortestTime) { - shortestTime = time; - falgo_k_ = static_cast(testAlgo); - } - } - - // Reset shortestTime - shortestTime = std::numeric_limits::max(); - - // Search all the algorithm for balgo_b_ - for (int testAlgo = startAlgo; testAlgo <= endAlgo; testAlgo++) { - cublasStatus_t status = CUBLAS_STATUS_SUCCESS; - - const float alpha = 1.0f; - const float beta = 0.0f; - - // Record start event - HCTR_LIB_THROW(cudaEventRecord(start, get_gpu().get_stream())); - for (size_t i = 0; i < repeat_num && status == CUBLAS_STATUS_SUCCESS; ++i) { - status = cublasGemmEx(get_gpu().get_cublas_handle(), CUBLAS_OP_N, CUBLAS_OP_N, output_size, 1, - in_batch_size, &alpha, top, CUDA_R_16F, output_size, identity, - CUDA_R_16F, in_batch_size, &beta, bias_grad, CUDA_R_16F, output_size, - CUDA_R_32F, static_cast(testAlgo)); - } - HCTR_LIB_THROW(cudaEventRecord(stop, get_gpu().get_stream())); - HCTR_LIB_THROW(cudaEventSynchronize(stop)); - HCTR_LIB_THROW(cudaEventElapsedTime(&time, start, stop)); - // Avg Time(ms) for this algorithm for fprop GEMM - time = time / repeat_num; - // Skip if the algorithm is supported for fprop configuration - if (status != CUBLAS_STATUS_SUCCESS) { - // HCTR_LOG(INFO, WORLD, "The algorithms %d is not supported for bprop_W, skipped.\n", - // testAlgo); - continue; - } - // Record the optimal time and algorithm - if (time < shortestTime) { - shortestTime = time; - balgo_b_ = static_cast(testAlgo); - } - } - - // Reset shortestTime - shortestTime = std::numeric_limits::max(); - - // Search all the algorithm for balgo_k_ - for (int testAlgo = startAlgo; testAlgo <= endAlgo; testAlgo++) { - cublasStatus_t status = CUBLAS_STATUS_SUCCESS; - - const float alpha = 1.0f; - const float beta = 1.0f; - - // Record start event - HCTR_LIB_THROW(cudaEventRecord(start, get_gpu().get_stream())); - for (size_t i = 0; i < repeat_num && status == CUBLAS_STATUS_SUCCESS; ++i) { - status = cublasGemmEx(get_gpu().get_cublas_handle(), CUBLAS_OP_N, CUBLAS_OP_T, output_size, - input_size, in_batch_size, &alpha, top, CUDA_R_16F, output_size, bottom, - CUDA_R_16F, input_size, &beta, kernel_grad, CUDA_R_16F, output_size, - CUDA_R_32F, static_cast(testAlgo)); - } - HCTR_LIB_THROW(cudaEventRecord(stop, get_gpu().get_stream())); - HCTR_LIB_THROW(cudaEventSynchronize(stop)); - HCTR_LIB_THROW(cudaEventElapsedTime(&time, start, stop)); - // Avg Time(ms) for this algorithm for fprop GEMM - time = time / repeat_num; - // Skip if the algorithm is supported for fprop configuration - if (status != CUBLAS_STATUS_SUCCESS) { - // HCTR_LOG(INFO, WORLD, "The algorithms %d is not supported for bprop_W, skipped.\n", - // testAlgo); - continue; - } - // Record the optimal time and algorithm - if (time < shortestTime) { - shortestTime = time; - balgo_k_ = static_cast(testAlgo); - } - } - - // Reset shortestTime - shortestTime = std::numeric_limits::max(); - - // Search all the algorithm for balgo_x_ - for (int testAlgo = startAlgo; testAlgo <= endAlgo; testAlgo++) { - cublasStatus_t status = CUBLAS_STATUS_SUCCESS; - - const float alpha = 1.0f; - const float beta = 0.0f; - - // Record start event - HCTR_LIB_THROW(cudaEventRecord(start, get_gpu().get_stream())); - for (size_t i = 0; i < repeat_num && status == CUBLAS_STATUS_SUCCESS; ++i) { - status = cublasGemmEx(get_gpu().get_cublas_handle(), CUBLAS_OP_T, CUBLAS_OP_N, input_size, - in_batch_size, output_size, &alpha, kernel, CUDA_R_16F, 
output_size, - top, CUDA_R_16F, output_size, &beta, bottom, CUDA_R_16F, input_size, - CUDA_R_32F, static_cast(testAlgo)); - } - - HCTR_LIB_THROW(cudaEventRecord(stop, get_gpu().get_stream())); - HCTR_LIB_THROW(cudaEventSynchronize(stop)); - HCTR_LIB_THROW(cudaEventElapsedTime(&time, start, stop)); - // Avg Time(ms) for this algorithm for fprop GEMM - time = time / repeat_num; - // Skip if the algorithm is supported for fprop configuration - if (status != CUBLAS_STATUS_SUCCESS) { - // HCTR_LOG(INFO, WORLD, "The algorithms %d is not supported for bprop_Xn, skipped.\n", - // testAlgo); - continue; - } - // Record the optimal time and algorithm - if (time < shortestTime) { - shortestTime = time; - balgo_x_ = static_cast(testAlgo); - } - } - - // Print selection information - // HCTR_LOG(INFO, WORLD, - // "The algorithm selection for falgo_b_, falgo_k_, balgo_b_, balgo_k_, balgo_x_ are: %d, %d, - // " - // "%d, %d and %d.\n", - // (int)falgo_b_ - CUBLAS_GEMM_DEFAULT_TENSOR_OP, (int)falgo_k_ - - // CUBLAS_GEMM_DEFAULT_TENSOR_OP, (int)balgo_b_ - CUBLAS_GEMM_DEFAULT_TENSOR_OP, (int)balgo_k_ - // - CUBLAS_GEMM_DEFAULT_TENSOR_OP, (int)balgo_x_ - CUBLAS_GEMM_DEFAULT_TENSOR_OP); - - // Output msg - // HCTR_LOG(INFO, ROOT, "The fully-connected layer has finished choosing the algorithm for cublas - // Gemm.\n"); Clean-up - HCTR_LIB_THROW(cudaEventDestroy(start)); - HCTR_LIB_THROW(cudaEventDestroy(stop)); -} // namespace HugeCTR - -std::unique_ptr FullyConnectedLayer<__half>::get_uniform_initializer( - const int index) { - size_t bottom_dim = - get_bottom_tensor(true).get_dimensions()[get_bottom_tensor(true).get_dimensions().size() - 1]; - size_t top_dim = top_tensor_.get_dimensions()[top_tensor_.get_dimensions().size() - 1]; - - float limit = 1.0f / ((0 == index ? bottom_dim : 0) + top_dim); - return std::make_unique(-1 * limit, limit); -} - -std::unique_ptr FullyConnectedLayer<__half>::get_xavier_uniform_initializer( - const int index) { - size_t bottom_dim = - get_bottom_tensor(true).get_dimensions()[get_bottom_tensor(true).get_dimensions().size() - 1]; - size_t top_dim = top_tensor_.get_dimensions()[top_tensor_.get_dimensions().size() - 1]; - - return std::make_unique(1.f, data_simu::Mode_t::Fan_avg, - data_simu::Distribution_t::Uniform, - 0 == index ? bottom_dim : 0, top_dim); -} - -std::unique_ptr FullyConnectedLayer<__half>::get_xavier_norm_initializer( - const int index) { - size_t bottom_dim = - get_bottom_tensor(true).get_dimensions()[get_bottom_tensor(true).get_dimensions().size() - 1]; - size_t top_dim = top_tensor_.get_dimensions()[top_tensor_.get_dimensions().size() - 1]; - - return std::make_unique(1.f, data_simu::Mode_t::Fan_avg, - data_simu::Distribution_t::Norm, - 0 == index ? 
bottom_dim : 0, top_dim); -} - -std::unique_ptr FullyConnectedLayer<__half>::get_default_initializer( - const int index) { - size_t bottom_dim = - get_bottom_tensor(true).get_dimensions()[get_bottom_tensor(true).get_dimensions().size() - 1]; - size_t top_dim = top_tensor_.get_dimensions()[top_tensor_.get_dimensions().size() - 1]; - - std::unique_ptr simu(nullptr); - if (0 == index) { - simu.reset(new VarianceScalingSimulator(1.f, data_simu::Mode_t::Fan_avg, - data_simu::Distribution_t::Norm, bottom_dim, top_dim)); - } else if (1 == index) { - float stddev = sqrt(1.f / top_dim); - simu.reset(new GaussianDataSimulator(0, stddev, -2 * stddev, 2 * stddev)); - } else { - HCTR_OWN_THROW(Error_t::OutOfBound, "index != {0, 1}."); - } - - return simu; -} - -template class FullyConnectedLayer<__half>; - -Core23TempFullyConnectedLayer<__half>::Core23TempFullyConnectedLayer( - const core23::Tensor& bottom_tensor, const core23::Tensor& top_tensor, - const std::shared_ptr& gpu_resource, std::vector initializer_types) - : Core23TempTrainableLayer<__half>({bottom_tensor}, {top_tensor}, gpu_resource, - initializer_types), +FullyConnectedLayer<__half>::FullyConnectedLayer(const core23::Tensor& bottom_tensor, + const core23::Tensor& top_tensor, + const std::shared_ptr& gpu_resource, + std::vector initializer_types) + : TrainableLayer<__half>({bottom_tensor}, {top_tensor}, gpu_resource, initializer_types), falgo_b_(CUBLAS_GEMM_DEFAULT_TENSOR_OP), falgo_k_(CUBLAS_GEMM_DEFAULT_TENSOR_OP), balgo_b_(CUBLAS_GEMM_DEFAULT_TENSOR_OP), @@ -496,7 +70,7 @@ Core23TempFullyConnectedLayer<__half>::Core23TempFullyConnectedLayer( .buffer_params(blobs_buffer_params)); } -void Core23TempFullyConnectedLayer<__half>::fprop(bool is_train) { +void FullyConnectedLayer<__half>::fprop(bool is_train) { CudaDeviceContext context(get_device_id()); const __half* kernel = this->get_weight(0).data<__half>(); @@ -532,7 +106,7 @@ void Core23TempFullyConnectedLayer<__half>::fprop(bool is_train) { CUDA_R_32F, falgo_k_)); } -void Core23TempFullyConnectedLayer<__half>::bprop() { +void FullyConnectedLayer<__half>::bprop() { CudaDeviceContext context(get_device_id()); const __half* kernel = this->get_weight(0).data<__half>(); @@ -575,7 +149,7 @@ void Core23TempFullyConnectedLayer<__half>::bprop() { CUDA_R_32F, balgo_x_)); } -void Core23TempFullyConnectedLayer<__half>::initialize() { +void FullyConnectedLayer<__half>::initialize() { CudaDeviceContext context(get_device_id()); __half* identity = identity_tensor_.data<__half>(); @@ -589,7 +163,7 @@ void Core23TempFullyConnectedLayer<__half>::initialize() { __float2half(1.0f)); } -void Core23TempFullyConnectedLayer<__half>::search_algorithm() { +void FullyConnectedLayer<__half>::search_algorithm() { // Set to the CUDA device where this layer assigned to CudaDeviceContext context(get_device_id()); @@ -821,7 +395,7 @@ void Core23TempFullyConnectedLayer<__half>::search_algorithm() { HCTR_LIB_THROW(cudaEventDestroy(stop)); } // namespace HugeCTR -std::unique_ptr Core23TempFullyConnectedLayer<__half>::get_uniform_initializer( +std::unique_ptr FullyConnectedLayer<__half>::get_uniform_initializer( const int index) { int64_t bottom_dim = get_bottom_tensor(true).shape().size(get_bottom_tensor(true).shape().dims() - 1); @@ -832,8 +406,8 @@ std::unique_ptr Core23TempFullyConnectedLayer<__half>::get_unifor return std::make_unique(-1 * limit, limit); } -std::unique_ptr -Core23TempFullyConnectedLayer<__half>::get_xavier_uniform_initializer(const int index) { +std::unique_ptr 
FullyConnectedLayer<__half>::get_xavier_uniform_initializer( + const int index) { int64_t bottom_dim = get_bottom_tensor(true).shape().size(get_bottom_tensor(true).shape().dims() - 1); auto top_tensor = this->output_tensors_[0]; @@ -844,7 +418,7 @@ Core23TempFullyConnectedLayer<__half>::get_xavier_uniform_initializer(const int 0 == index ? bottom_dim : 0, top_dim); } -std::unique_ptr Core23TempFullyConnectedLayer<__half>::get_xavier_norm_initializer( +std::unique_ptr FullyConnectedLayer<__half>::get_xavier_norm_initializer( const int index) { int64_t bottom_dim = get_bottom_tensor(true).shape().size(get_bottom_tensor(true).shape().dims() - 1); @@ -856,7 +430,7 @@ std::unique_ptr Core23TempFullyConnectedLayer<__half>::get_xavier 0 == index ? bottom_dim : 0, top_dim); } -std::unique_ptr Core23TempFullyConnectedLayer<__half>::get_default_initializer( +std::unique_ptr FullyConnectedLayer<__half>::get_default_initializer( const int index) { int64_t bottom_dim = get_bottom_tensor(true).shape().size(get_bottom_tensor(true).shape().dims() - 1); @@ -877,6 +451,6 @@ std::unique_ptr Core23TempFullyConnectedLayer<__half>::get_defaul return simu; } -template class Core23TempFullyConnectedLayer<__half>; +template class FullyConnectedLayer<__half>; } // namespace HugeCTR diff --git a/HugeCTR/src/layers/fused_fully_connected_layer.cu b/HugeCTR/src/layers/fused_fully_connected_layer.cu index 5a36750cb2..a841a2fb20 100644 --- a/HugeCTR/src/layers/fused_fully_connected_layer.cu +++ b/HugeCTR/src/layers/fused_fully_connected_layer.cu @@ -90,329 +90,11 @@ __global__ void reverse_add_bias_and_re_kernel(float* bias, __half* middle, cons } // namespace -FusedFullyConnectedLayer::FusedFullyConnectedLayer( - const std::shared_ptr>& master_weights_buff, - const std::shared_ptr>& weights_buff, - const std::shared_ptr>& weights_grad_buff, - const std::shared_ptr>& blobs_buff, - const Tensor2<__half>& bottom_tensor, const Tensor2<__half>& top_tensor, - const std::shared_ptr& gpu_resource, std::vector initializer_types) - : TrainableLayer<__half>(master_weights_buff, weights_buff, weights_grad_buff, gpu_resource, - initializer_types), - falgo_k_(CUBLAS_GEMM_DEFAULT_TENSOR_OP), - balgo_k_(CUBLAS_GEMM_DEFAULT_TENSOR_OP), - balgo_x_(CUBLAS_GEMM_DEFAULT_TENSOR_OP) { - const auto& bottom_tensor_dim = bottom_tensor.get_dimensions(); - const auto& top_tensor_dim = top_tensor.get_dimensions(); - - if (bottom_tensor_dim.size() != 2 || top_tensor_dim.size() != 2) { - HCTR_OWN_THROW(Error_t::WrongInput, "input or output tensor doesn't has two dimensions"); - } - - size_t batch_size = bottom_tensor_dim[0]; - size_t output_size = top_tensor_dim[1]; - size_t input_size = bottom_tensor_dim[1]; - - if (batch_size % 32 != 0 || output_size % 64 != 0) { - HCTR_OWN_THROW( - Error_t::WrongInput, - "The first dimension of bottom tensor must be a multiple of 32, the second dimension " - "of top tensor must be a multiple of 64."); - } - - std::vector kernel_dim = {input_size, output_size}; - std::vector bias_dim = {1, output_size}; - - this->set_weight(0, kernel_dim); - this->set_weight(1, bias_dim); - this->set_wgrad(0, kernel_dim); - this->set_wgrad(1, bias_dim); - - bottom_tensor_ = bottom_tensor; - top_tensor_ = top_tensor; - blobs_buff->reserve(top_tensor_.get_dimensions(), &middle_tensor_); - blobs_buff->reserve(bias_dim, &bias_grad_tensor_); -} - -void FusedFullyConnectedLayer::fprop(bool is_train) { - CudaDeviceContext context(get_device_id()); - - const __half* kernel = this->get_weight(0).get_ptr(); - const __half* bias = 
this->get_weight(1).get_ptr(); - const __half* bottom = get_bottom_tensor(is_train).get_ptr(); - __half* middle = middle_tensor_.get_ptr(); - __half* top = top_tensor_.get_ptr(); - - const auto& bottom_tensor_dim = get_bottom_tensor(is_train).get_dimensions(); - const auto& top_tensor_dim = top_tensor_.get_dimensions(); - - size_t batch_size = bottom_tensor_dim[0]; - size_t output_size = top_tensor_dim[1]; - size_t input_size = bottom_tensor_dim[1]; - - const float alpha = 1.0f; - const float beta = 0.0f; - - HCTR_LIB_THROW(cublasGemmEx(get_gpu().get_cublas_handle(), CUBLAS_OP_N, CUBLAS_OP_N, output_size, - batch_size, input_size, &alpha, kernel, CUDA_R_16F, output_size, - bottom, CUDA_R_16F, input_size, &beta, middle, CUDA_R_16F, - output_size, CUDA_R_32F, falgo_k_)); - - const size_t max_threads = 1024; - const size_t blocks = batch_size; - const size_t threads = min(output_size / 2, max_threads); - - add_bias_and_re_kernel<<>>( - top, middle, bias, output_size / 2, output_size / 2); -} - -void FusedFullyConnectedLayer::bprop() { - CudaDeviceContext context(get_device_id()); - - const __half* kernel = this->get_weight(0).get_ptr(); - const __half* top = top_tensor_.get_ptr(); - __half* kernel_grad = this->get_wgrad(0).get_ptr(); - __half* bias_grad = this->get_wgrad(1).get_ptr(); - __half* bottom = get_bottom_tensor(true).get_ptr(); - __half* middle = middle_tensor_.get_ptr(); - float* bias_grad_float = bias_grad_tensor_.get_ptr(); - - const auto& bottom_tensor_dim = get_bottom_tensor(true).get_dimensions(); - const auto& top_tensor_dim = top_tensor_.get_dimensions(); - - int batch_size = bottom_tensor_dim[0]; - int output_size = top_tensor_dim[1]; - int input_size = bottom_tensor_dim[1]; - - const float alpha = 1.0f; - const float beta_k = 1.0f; - const float beta_x = 0.0f; - - initialize_array<<<(output_size - 1) / 1024 + 1, 1024, 0, get_gpu().get_stream()>>>( - bias_grad_float, output_size, 0.0f); - - dim3 blocks(output_size / 64, batch_size / 32); - reverse_add_bias_and_re_kernel<32> - <<>>(bias_grad_float, middle, top, output_size / 2); - - convert_array<<<(output_size - 1) / 1024 + 1, 1024, 0, get_gpu().get_stream()>>>( - bias_grad, bias_grad_float, output_size); - - HCTR_LIB_THROW(cublasGemmEx(get_gpu().get_cublas_handle(), CUBLAS_OP_N, CUBLAS_OP_T, output_size, - input_size, batch_size, &alpha, middle, CUDA_R_16F, output_size, - bottom, CUDA_R_16F, input_size, &beta_k, kernel_grad, CUDA_R_16F, - output_size, CUDA_R_32F, balgo_k_)); - - HCTR_LIB_THROW(cublasGemmEx(get_gpu().get_cublas_handle(), CUBLAS_OP_T, CUBLAS_OP_N, input_size, - batch_size, output_size, &alpha, kernel, CUDA_R_16F, output_size, - middle, CUDA_R_16F, output_size, &beta_x, bottom, CUDA_R_16F, - input_size, CUDA_R_32F, balgo_x_)); -} - -void FusedFullyConnectedLayer::search_algorithm() { - // Set to the CUDA device where this layer assigned to - CudaDeviceContext context(get_device_id()); - - const size_t repeat_num = 100; - - // Device Tensors to be used - __half* bottom = get_bottom_tensor(true).get_ptr(); - __half* top = top_tensor_.get_ptr(); - __half* kernel = this->get_weight(0).get_ptr(); - __half* bias = this->get_weight(1).get_ptr(); - __half* kernel_grad = this->get_wgrad(0).get_ptr(); - __half* bias_grad = this->get_wgrad(1).get_ptr(); - - // Tensor dim - const auto& bottom_tensor_dim = get_bottom_tensor(true).get_dimensions(); - const auto& top_tensor_dim = top_tensor_.get_dimensions(); - - size_t batch_size = bottom_tensor_dim[0]; - size_t output_size = top_tensor_dim[1]; - size_t input_size = 
bottom_tensor_dim[1]; - - // Record time for each algorithm - float shortestTime = std::numeric_limits::max(); - float time; - cudaEvent_t start, stop; - HCTR_LIB_THROW(cudaEventCreate(&start)); - HCTR_LIB_THROW(cudaEventCreate(&stop)); - - // Start, end for search - const cublasGemmAlgo_t startAlgo = CUBLAS_GEMM_DEFAULT_TENSOR_OP; - const cublasGemmAlgo_t endAlgo = CUBLAS_GEMM_ALGO15_TENSOR_OP; - - // Search all the algorithm for falgo_k_ - for (int testAlgo = startAlgo; testAlgo <= endAlgo; testAlgo++) { - cublasStatus_t status = CUBLAS_STATUS_SUCCESS; - - const float alpha = 1.0f; - const float beta = 1.0f; - - // Record start event - HCTR_LIB_THROW(cudaEventRecord(start, get_gpu().get_stream())); - for (size_t i = 0; i < repeat_num && status == CUBLAS_STATUS_SUCCESS; ++i) { - status = cublasGemmEx(get_gpu().get_cublas_handle(), CUBLAS_OP_N, CUBLAS_OP_N, output_size, - batch_size, input_size, &alpha, kernel, CUDA_R_16F, output_size, bottom, - CUDA_R_16F, input_size, &beta, top, CUDA_R_16F, output_size, CUDA_R_32F, - static_cast(testAlgo)); - } - HCTR_LIB_THROW(cudaEventRecord(stop, get_gpu().get_stream())); - HCTR_LIB_THROW(cudaEventSynchronize(stop)); - HCTR_LIB_THROW(cudaEventElapsedTime(&time, start, stop)); - // Avg Time(ms) for this algorithm for fprop GEMM - time = time / repeat_num; - // Skip if the algorithm is supported for fprop configuration - if (status != CUBLAS_STATUS_SUCCESS) { - // HCTR_LOG(INFO, WORLD, "The algorithms %d is not supported for fprop, skipped.\n", - // testAlgo); - continue; - } - // Record the optimal time and algorithm - if (time < shortestTime) { - shortestTime = time; - falgo_k_ = static_cast(testAlgo); - } - } - - // Reset shortestTime - shortestTime = std::numeric_limits::max(); - - // Search all the algorithm for balgo_k_ - for (int testAlgo = startAlgo; testAlgo <= endAlgo; testAlgo++) { - cublasStatus_t status = CUBLAS_STATUS_SUCCESS; - - const float alpha = 1.0f; - const float beta = 1.0f; - - // Record start event - HCTR_LIB_THROW(cudaEventRecord(start, get_gpu().get_stream())); - for (size_t i = 0; i < repeat_num && status == CUBLAS_STATUS_SUCCESS; ++i) { - status = cublasGemmEx(get_gpu().get_cublas_handle(), CUBLAS_OP_N, CUBLAS_OP_T, output_size, - input_size, batch_size, &alpha, top, CUDA_R_16F, output_size, bottom, - CUDA_R_16F, input_size, &beta, kernel_grad, CUDA_R_16F, output_size, - CUDA_R_32F, static_cast(testAlgo)); - } - HCTR_LIB_THROW(cudaEventRecord(stop, get_gpu().get_stream())); - HCTR_LIB_THROW(cudaEventSynchronize(stop)); - HCTR_LIB_THROW(cudaEventElapsedTime(&time, start, stop)); - // Avg Time(ms) for this algorithm for fprop GEMM - time = time / repeat_num; - // Skip if the algorithm is supported for fprop configuration - if (status != CUBLAS_STATUS_SUCCESS) { - // HCTR_LOG(INFO, WORLD, "The algorithms %d is not supported for bprop_W, skipped.\n", - // testAlgo); - continue; - } - // Record the optimal time and algorithm - if (time < shortestTime) { - shortestTime = time; - balgo_k_ = static_cast(testAlgo); - } - } - - // Reset shortestTime - shortestTime = std::numeric_limits::max(); - - // Search all the algorithm for balgo_x_ - for (int testAlgo = startAlgo; testAlgo <= endAlgo; testAlgo++) { - cublasStatus_t status = CUBLAS_STATUS_SUCCESS; - - const float alpha = 1.0f; - const float beta = 0.0f; - - // Record start event - HCTR_LIB_THROW(cudaEventRecord(start, get_gpu().get_stream())); - for (size_t i = 0; i < repeat_num && status == CUBLAS_STATUS_SUCCESS; ++i) { - status = cublasGemmEx(get_gpu().get_cublas_handle(), 
CUBLAS_OP_T, CUBLAS_OP_N, input_size, - batch_size, output_size, &alpha, kernel, CUDA_R_16F, output_size, top, - CUDA_R_16F, output_size, &beta, bottom, CUDA_R_16F, input_size, - CUDA_R_32F, static_cast(testAlgo)); - } - - HCTR_LIB_THROW(cudaEventRecord(stop, get_gpu().get_stream())); - HCTR_LIB_THROW(cudaEventSynchronize(stop)); - HCTR_LIB_THROW(cudaEventElapsedTime(&time, start, stop)); - // Avg Time(ms) for this algorithm for fprop GEMM - time = time / repeat_num; - // Skip if the algorithm is supported for fprop configuration - if (status != CUBLAS_STATUS_SUCCESS) { - // HCTR_LOG(INFO, WORLD, "The algorithms %d is not supported for bprop_Xn, skipped.\n", - // testAlgo); - continue; - } - // Record the optimal time and algorithm - if (time < shortestTime) { - shortestTime = time; - balgo_x_ = static_cast(testAlgo); - } - } - - // Print selection information - // HCTR_LOG(INFO, WORLD, "The algorithm selection for falgo_k_, balgo_k_, balgo_x_ are: %d, %d and - // %d.\n", - // (int)falgo_k_ - CUBLAS_GEMM_DEFAULT_TENSOR_OP, - // (int)balgo_k_ - CUBLAS_GEMM_DEFAULT_TENSOR_OP, - // (int)balgo_x_ - CUBLAS_GEMM_DEFAULT_TENSOR_OP); - - // Output msg - // HCTR_LOG(INFO, ROOT, "The fully-connected layer has finished choosing the algorithm for cublas - // Gemm.\n"); Clean-up - HCTR_LIB_THROW(cudaEventDestroy(start)); - HCTR_LIB_THROW(cudaEventDestroy(stop)); -} // namespace HugeCTR - -std::unique_ptr FusedFullyConnectedLayer::get_uniform_initializer(const int index) { - size_t bottom_dim = get_bottom_tensor(true).get_dimensions()[1]; - size_t top_dim = top_tensor_.get_dimensions()[1]; - - float limit = 1.0f / ((0 == index ? bottom_dim : 0) + top_dim); - return std::make_unique(-1 * limit, limit); -} - -std::unique_ptr FusedFullyConnectedLayer::get_xavier_uniform_initializer( - const int index) { - size_t bottom_dim = get_bottom_tensor(true).get_dimensions()[1]; - size_t top_dim = top_tensor_.get_dimensions()[1]; - - return std::make_unique(1.f, data_simu::Mode_t::Fan_avg, - data_simu::Distribution_t::Uniform, - 0 == index ? bottom_dim : 0, top_dim); -} - -std::unique_ptr FusedFullyConnectedLayer::get_xavier_norm_initializer( - const int index) { - size_t bottom_dim = get_bottom_tensor(true).get_dimensions()[1]; - size_t top_dim = top_tensor_.get_dimensions()[1]; - - return std::make_unique(1.f, data_simu::Mode_t::Fan_avg, - data_simu::Distribution_t::Norm, - 0 == index ? 
bottom_dim : 0, top_dim); -} - -std::unique_ptr FusedFullyConnectedLayer::get_default_initializer(const int index) { - size_t bottom_dim = get_bottom_tensor(true).get_dimensions()[1]; - size_t top_dim = top_tensor_.get_dimensions()[1]; - - std::unique_ptr simu(nullptr); - if (0 == index) { - simu.reset(new VarianceScalingSimulator(1.f, data_simu::Mode_t::Fan_avg, - data_simu::Distribution_t::Norm, bottom_dim, top_dim)); - } else if (1 == index) { - float stddev = sqrt(1.f / top_dim); - simu.reset(new GaussianDataSimulator(0, stddev, -2 * stddev, 2 * stddev)); - } else { - HCTR_OWN_THROW(Error_t::OutOfBound, "index != {0, 1}."); - } - - return simu; -} - -Core23TempFusedFullyConnectedLayer::Core23TempFusedFullyConnectedLayer( - const core23::Tensor& bottom_tensor, const core23::Tensor& top_tensor, - const std::shared_ptr& gpu_resource, std::vector initializer_types) - : Core23TempTrainableLayer<__half>({bottom_tensor}, {top_tensor}, gpu_resource, - initializer_types), +FusedFullyConnectedLayer::FusedFullyConnectedLayer(const core23::Tensor& bottom_tensor, + const core23::Tensor& top_tensor, + const std::shared_ptr& gpu_resource, + std::vector initializer_types) + : TrainableLayer<__half>({bottom_tensor}, {top_tensor}, gpu_resource, initializer_types), falgo_k_(CUBLAS_GEMM_DEFAULT_TENSOR_OP), balgo_k_(CUBLAS_GEMM_DEFAULT_TENSOR_OP), balgo_x_(CUBLAS_GEMM_DEFAULT_TENSOR_OP) { @@ -459,7 +141,7 @@ Core23TempFusedFullyConnectedLayer::Core23TempFusedFullyConnectedLayer( .buffer_params(blobs_buffer_params)); } -void Core23TempFusedFullyConnectedLayer::fprop(bool is_train) { +void FusedFullyConnectedLayer::fprop(bool is_train) { CudaDeviceContext context(get_device_id()); const __half* kernel = this->get_weight(0).data<__half>(); @@ -491,7 +173,7 @@ void Core23TempFusedFullyConnectedLayer::fprop(bool is_train) { top, middle, bias, output_size / 2, output_size / 2); } -void Core23TempFusedFullyConnectedLayer::bprop() { +void FusedFullyConnectedLayer::bprop() { CudaDeviceContext context(get_device_id()); const __half* kernel = this->get_weight(0).data<__half>(); @@ -534,7 +216,7 @@ void Core23TempFusedFullyConnectedLayer::bprop() { input_size, CUDA_R_32F, balgo_x_)); } -void Core23TempFusedFullyConnectedLayer::search_algorithm() { +void FusedFullyConnectedLayer::search_algorithm() { // Set to the CUDA device where this layer assigned to CudaDeviceContext context(get_device_id()); @@ -687,8 +369,7 @@ void Core23TempFusedFullyConnectedLayer::search_algorithm() { HCTR_LIB_THROW(cudaEventDestroy(stop)); } // namespace HugeCTR -std::unique_ptr Core23TempFusedFullyConnectedLayer::get_uniform_initializer( - const int index) { +std::unique_ptr FusedFullyConnectedLayer::get_uniform_initializer(const int index) { int64_t bottom_dim = get_bottom_tensor(true).shape().size(1); int64_t top_dim = this->output_tensors_[0].shape().size(1); @@ -696,7 +377,7 @@ std::unique_ptr Core23TempFusedFullyConnectedLayer::get_uniform_i return std::make_unique(-1 * limit, limit); } -std::unique_ptr Core23TempFusedFullyConnectedLayer::get_xavier_uniform_initializer( +std::unique_ptr FusedFullyConnectedLayer::get_xavier_uniform_initializer( const int index) { int64_t bottom_dim = get_bottom_tensor(true).shape().size(1); int64_t top_dim = this->output_tensors_[0].shape().size(1); @@ -706,7 +387,7 @@ std::unique_ptr Core23TempFusedFullyConnectedLayer::get_xavier_un 0 == index ? 
bottom_dim : 0, top_dim); } -std::unique_ptr Core23TempFusedFullyConnectedLayer::get_xavier_norm_initializer( +std::unique_ptr FusedFullyConnectedLayer::get_xavier_norm_initializer( const int index) { int64_t bottom_dim = get_bottom_tensor(true).shape().size(1); int64_t top_dim = this->output_tensors_[0].shape().size(1); @@ -716,8 +397,7 @@ std::unique_ptr Core23TempFusedFullyConnectedLayer::get_xavier_no 0 == index ? bottom_dim : 0, top_dim); } -std::unique_ptr Core23TempFusedFullyConnectedLayer::get_default_initializer( - const int index) { +std::unique_ptr FusedFullyConnectedLayer::get_default_initializer(const int index) { int64_t bottom_dim = get_bottom_tensor(true).shape().size(1); int64_t top_dim = this->output_tensors_[0].shape().size(1); diff --git a/HugeCTR/src/layers/fused_relu_bias_fully_connected_layer.cu b/HugeCTR/src/layers/fused_relu_bias_fully_connected_layer.cu index 451217a259..c92c5e0cf0 100644 --- a/HugeCTR/src/layers/fused_relu_bias_fully_connected_layer.cu +++ b/HugeCTR/src/layers/fused_relu_bias_fully_connected_layer.cu @@ -52,774 +52,6 @@ __global__ void reverse_relu_kernel_not_aligned(__half* dRelu, __half* mask, con } // namespace FusedReluBiasFullyConnectedLayer::FusedReluBiasFullyConnectedLayer( - const std::shared_ptr>& master_weights_buff, - const std::shared_ptr>& weights_buff, - const std::shared_ptr>& weights_grad_buff, - const std::shared_ptr>& blobs_buff, - const Tensor2<__half>& train_in_tensor, const Tensor2<__half>& mask_in_tensor, - const Tensor2<__half>& dRelu_in_tensor, const Tensor2<__half>& db_in_tensor, - const Tensor2<__half>& train_out_tensor, const Tensor2<__half>& mask_out_tensor, - const Tensor2<__half>& dRelu_out_tensor, Tensor2<__half>& db_out_tensor, - const std::shared_ptr& gpu_resource, const FcPosition_t& pos, - const Activation_t& act, const bool& skip_dgrad, std::vector initializer_types, - const bool async_mlp_wgrad, const bool head_mask_in, const bool fuse_wb) - : TrainableLayer<__half>(master_weights_buff, weights_buff, weights_grad_buff, gpu_resource, - initializer_types), - balgo_k_(CUBLAS_GEMM_DEFAULT_TENSOR_OP), - balgo_x_(CUBLAS_GEMM_DEFAULT_TENSOR_OP), - balgo_b_(CUBLAS_GEMM_DEFAULT_TENSOR_OP), - pos_(pos), - act_(act), - skip_dgrad_(skip_dgrad), - async_mlp_wgrad_(async_mlp_wgrad), - head_mask_in_(head_mask_in), - fuse_wb_(fuse_wb), - event_overlap_created_(false) { - const auto& bottom_tensor_dim = train_in_tensor.get_dimensions(); - const auto& top_tensor_dim = train_out_tensor.get_dimensions(); - - if (bottom_tensor_dim.size() != 2 || top_tensor_dim.size() != 2) { - HCTR_OWN_THROW(Error_t::WrongInput, "input or output tensor doesn't has two dimensions"); - } - - size_t batch_size = bottom_tensor_dim[0]; - size_t output_size = top_tensor_dim[1]; - size_t input_size = bottom_tensor_dim[1]; - - std::vector kernel_dim = {input_size, output_size}; - std::vector bias_dim = {1, output_size}; - std::vector identity_dim = {1, batch_size}; - - this->set_weight(0, kernel_dim); - weights_half_.push_back(this->get_weight(0)); - this->set_weight(1, bias_dim); - weights_half_.push_back(this->get_weight(1)); - this->set_wgrad(0, kernel_dim); - weights_grad_.push_back(this->get_wgrad(0)); - this->set_wgrad(1, bias_dim); - db_out_tensor = this->get_wgrad(1); - weights_grad_.push_back(this->get_wgrad(1)); - - blobs_buff->reserve(identity_dim, &identity_tensor_); - - train_in_tensor_ = train_in_tensor; - // if (pos_ == FcPosition_t::Head || pos_ == FcPosition_t::Isolated) { - // // mask_in_tensor_ = train_in_tensor; - // } else { - 
mask_in_tensor_ = mask_in_tensor; - dRelu_in_tensor_ = dRelu_in_tensor; - db_in_tensor_ = db_in_tensor; - // } - train_out_tensor_ = train_out_tensor; - mask_out_tensor_ = mask_out_tensor; - dRelu_out_tensor_ = dRelu_out_tensor; - db_out_tensor_ = db_out_tensor; - blobs_buff->reserve(kernel_dim, &bias_grad_tensor_); - - std::vector mask_dim = {batch_size, output_size}; - blobs_buff->reserve(mask_dim, &mask_in_tensor_temp_); - - if (async_mlp_wgrad_) - cublas_handle_wgrad_ = gpu_resource->get_cublas_handle_wgrad(); - else - cublas_handle_wgrad_ = gpu_resource->get_cublas_handle(); -} - -void FusedReluBiasFullyConnectedLayer::initialize() { - CudaDeviceContext context(get_device_id()); - HCTR_LIB_THROW(cudaEventCreate(&event_overlap_)); - event_overlap_created_ = true; - - // TODO: We need different bottom desc based on is_train or not - const auto& bottom_tensor_dim = get_bottom_tensor_fprop(true).get_dimensions(); - const auto& top_tensor_dim = train_out_tensor_.get_dimensions(); - __half* identity = identity_tensor_.get_ptr(); - - int batch_size = bottom_tensor_dim[0]; - int output_size = top_tensor_dim[1]; - int input_size = bottom_tensor_dim[1]; - - initialize_array<<<(batch_size - 1) / 1024 + 1, 1024, 0, get_gpu().get_stream()>>>( - identity, batch_size, __float2half(1.0f)); - - HCTR_LIB_THROW(cublasLtMatmulDescCreate(&cublas_op_desc_, CUBLAS_COMPUTE_32F, CUDA_R_32F)); - - cublasOperation_t trans = CUBLAS_OP_N; - HCTR_LIB_THROW(cublasLtMatmulDescSetAttribute(cublas_op_desc_, CUBLASLT_MATMUL_DESC_TRANSA, - &trans, sizeof(trans))); - HCTR_LIB_THROW(cublasLtMatmulDescSetAttribute(cublas_op_desc_, CUBLASLT_MATMUL_DESC_TRANSB, - &trans, sizeof(trans))); - cublasLtEpilogue_t epi = CUBLASLT_EPILOGUE_RELU_AUX_BIAS; - if (act_ == Activation_t::None) epi = CUBLASLT_EPILOGUE_BIAS; - HCTR_LIB_THROW(cublasLtMatmulDescSetAttribute(cublas_op_desc_, CUBLASLT_MATMUL_DESC_EPILOGUE, - &epi, sizeof(epi))); - const __half* bias = weights_half_[1].get_ptr(); - HCTR_LIB_THROW(cublasLtMatmulDescSetAttribute(cublas_op_desc_, CUBLASLT_MATMUL_DESC_BIAS_POINTER, - &bias, sizeof(bias))); - if (act_ != Activation_t::None) { - __half* reluMask = mask_out_tensor_.get_ptr(); - cublasLtMatmulDescSetAttribute(cublas_op_desc_, CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_POINTER, - &reluMask, sizeof(reluMask)); - long reluMaskLd = output_size; - cublasLtMatmulDescSetAttribute(cublas_op_desc_, CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_LD, - &reluMaskLd, sizeof(reluMaskLd)); - } - - HCTR_LIB_THROW(cublasLtMatrixLayoutCreate(&cublas_kernel_desc_, CUDA_R_16F, output_size, - input_size, output_size)); - HCTR_LIB_THROW(cublasLtMatrixLayoutCreate(&cublas_bottom_desc_, CUDA_R_16F, input_size, - batch_size, input_size)); - HCTR_LIB_THROW(cublasLtMatrixLayoutCreate(&cublas_top_desc_, CUDA_R_16F, output_size, batch_size, - output_size)); - - HCTR_LIB_THROW(cublasLtMatmulPreferenceCreate(&cublas_preference_)); - - cublaslt_workspace_size_ = 1024 * 1024 * 8; // Set it to 8MB for now - HCTR_LIB_THROW(cudaMalloc(&cublaslt_workspace_, cublaslt_workspace_size_)); - HCTR_LIB_THROW(cublasLtMatmulPreferenceSetAttribute( - cublas_preference_, CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, &cublaslt_workspace_size_, - sizeof(cublaslt_workspace_size_))); - - uint32_t pointer_mode = CUBLASLT_POINTER_MODE_HOST; - HCTR_LIB_THROW(cublasLtMatmulDescSetAttribute(cublas_op_desc_, CUBLASLT_MATMUL_DESC_POINTER_MODE, - &pointer_mode, sizeof(pointer_mode))); - -#if CUBLAS_VERSION < 120000 - pointer_mode = CUBLASLT_POINTER_MODE_MASK_HOST; - 
HCTR_LIB_THROW(cublasLtMatmulPreferenceSetAttribute(cublas_preference_, - CUBLASLT_MATMUL_PREF_POINTER_MODE_MASK, - &pointer_mode, sizeof(pointer_mode))); - HCTR_LIB_THROW(cublasLtMatmulPreferenceSetAttribute( - cublas_preference_, CUBLASLT_MATMUL_PREF_EPILOGUE_MASK, &epi, sizeof(epi))); -#endif - - // By default set algo to best estimated heurstic - cublasLtMatmulHeuristicResult_t heuristic_result; - int returned_res = 0; - HCTR_LIB_THROW(cublasLtMatmulAlgoGetHeuristic( - get_gpu().get_cublaslt_handle(), cublas_op_desc_, cublas_kernel_desc_, cublas_bottom_desc_, - cublas_top_desc_, cublas_top_desc_, cublas_preference_, 1, &heuristic_result, &returned_res)); - - memcpy(&falgo_k_, &heuristic_result.algo, sizeof(falgo_k_)); - - if (returned_res == 0) { - HCTR_LIB_THROW(CUBLAS_STATUS_NOT_SUPPORTED); - } - - initialize_dgrad(); - initialize_wgrad(); -} - -void FusedReluBiasFullyConnectedLayer::initialize_dgrad() { - // TODO: We need different bottom desc based on is_train or not - const auto& bottom_tensor_dim = get_bottom_tensor_fprop(true).get_dimensions(); - const auto& top_tensor_dim = train_out_tensor_.get_dimensions(); - - size_t batch_size = bottom_tensor_dim[0]; - size_t output_size = top_tensor_dim[1]; - size_t input_size = bottom_tensor_dim[1]; - - HCTR_LIB_THROW(cublasLtMatmulDescCreate(&cublas_op_desc_bprop_, CUBLAS_COMPUTE_32F, CUDA_R_32F)); - - cublasOperation_t transA = CUBLAS_OP_T; - cublasOperation_t transB = CUBLAS_OP_N; - HCTR_LIB_THROW(cublasLtMatmulDescSetAttribute(cublas_op_desc_bprop_, CUBLASLT_MATMUL_DESC_TRANSA, - &transA, sizeof(transA))); - HCTR_LIB_THROW(cublasLtMatmulDescSetAttribute(cublas_op_desc_bprop_, CUBLASLT_MATMUL_DESC_TRANSB, - &transB, sizeof(transB))); - cublasLtEpilogue_t epi; - - if (pos_ == FcPosition_t::Head || pos_ == FcPosition_t::Isolated) { - epi = CUBLASLT_EPILOGUE_DEFAULT; - HCTR_LIB_THROW(cublasLtMatmulDescSetAttribute( - cublas_op_desc_bprop_, CUBLASLT_MATMUL_DESC_EPILOGUE, &epi, sizeof(epi))); - } else if (pos_ == FcPosition_t::Body || pos_ == FcPosition_t::Tail) { - epi = fuse_wb_ ? 
CUBLASLT_EPILOGUE_DRELU : CUBLASLT_EPILOGUE_DRELU_BGRAD; - cublasLtMatmulDescSetAttribute(cublas_op_desc_bprop_, CUBLASLT_MATMUL_DESC_EPILOGUE, &epi, - sizeof(epi)); - if (!fuse_wb_) { - __half* bgrad = db_in_tensor_.get_ptr(); - cublasLtMatmulDescSetAttribute(cublas_op_desc_bprop_, CUBLASLT_MATMUL_DESC_BIAS_POINTER, - &bgrad, sizeof(bgrad)); - } - __half* reluMask = mask_in_tensor_.get_ptr(); - cublasLtMatmulDescSetAttribute(cublas_op_desc_bprop_, CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_POINTER, - &reluMask, sizeof(reluMask)); - long reluMaskLd = input_size; - cublasLtMatmulDescSetAttribute(cublas_op_desc_bprop_, CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_LD, - &reluMaskLd, sizeof(reluMaskLd)); - } - - HCTR_LIB_THROW(cublasLtMatrixLayoutCreate(&cublas_dRelu_top_desc_, CUDA_R_16F, output_size, - batch_size, output_size)); - HCTR_LIB_THROW(cublasLtMatrixLayoutCreate(&cublas_dRelu_bottom_desc_, CUDA_R_16F, input_size, - batch_size, input_size)); - - HCTR_LIB_THROW(cublasLtMatmulPreferenceCreate(&cublas_preference_dRelu_)); - - cublaslt_workspace_size_ = 1024 * 1024 * 8; // Set it to 8MB for now - HCTR_LIB_THROW(cudaMalloc(&cublaslt_workspace_dRelu_, cublaslt_workspace_size_)); - HCTR_LIB_THROW(cublasLtMatmulPreferenceSetAttribute( - cublas_preference_dRelu_, CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, &cublaslt_workspace_size_, - sizeof(cublaslt_workspace_size_))); - - uint32_t pointer_mode = CUBLASLT_POINTER_MODE_HOST; - HCTR_LIB_THROW(cublasLtMatmulDescSetAttribute(cublas_op_desc_bprop_, - CUBLASLT_MATMUL_DESC_POINTER_MODE, &pointer_mode, - sizeof(pointer_mode))); - -#if CUBLAS_VERSION < 120000 - pointer_mode = CUBLASLT_POINTER_MODE_MASK_HOST; - HCTR_LIB_THROW(cublasLtMatmulPreferenceSetAttribute(cublas_preference_dRelu_, - CUBLASLT_MATMUL_PREF_POINTER_MODE_MASK, - &pointer_mode, sizeof(pointer_mode))); - HCTR_LIB_THROW(cublasLtMatmulPreferenceSetAttribute( - cublas_preference_dRelu_, CUBLASLT_MATMUL_PREF_EPILOGUE_MASK, &epi, sizeof(epi))); -#endif - - // By default set algo to best estimated heurstic - cublasLtMatmulHeuristicResult_t heuristic_result; - int returned_res = 0; - HCTR_LIB_THROW(cublasLtMatmulAlgoGetHeuristic( - get_gpu().get_cublaslt_handle(), cublas_op_desc_bprop_, cublas_kernel_desc_, - cublas_dRelu_top_desc_, cublas_dRelu_bottom_desc_, cublas_dRelu_bottom_desc_, - cublas_preference_dRelu_, 1, &heuristic_result, &returned_res)); - - memcpy(&balgo_dRelu_, &heuristic_result.algo, sizeof(balgo_dRelu_)); - - if (returned_res == 0) { - HCTR_LIB_THROW(CUBLAS_STATUS_NOT_SUPPORTED); - } -} - -void FusedReluBiasFullyConnectedLayer::initialize_wgrad() { - // TODO: We need different bottom desc based on is_train or not - const auto& bottom_tensor_dim = get_bottom_tensor_fprop(true).get_dimensions(); - const auto& top_tensor_dim = train_out_tensor_.get_dimensions(); - size_t batch_size = bottom_tensor_dim[0]; - size_t output_size = top_tensor_dim[1]; - size_t input_size = bottom_tensor_dim[1]; - - HCTR_LIB_THROW(cublasLtMatmulDescCreate(&cublas_op_desc_wgrad_, CUBLAS_COMPUTE_32F, CUDA_R_32F)); - - cublasOperation_t transA = CUBLAS_OP_N; - cublasOperation_t transB = CUBLAS_OP_T; - HCTR_LIB_THROW(cublasLtMatmulDescSetAttribute(cublas_op_desc_wgrad_, CUBLASLT_MATMUL_DESC_TRANSA, - &transA, sizeof(transA))); - HCTR_LIB_THROW(cublasLtMatmulDescSetAttribute(cublas_op_desc_wgrad_, CUBLASLT_MATMUL_DESC_TRANSB, - &transB, sizeof(transB))); - cublasLtEpilogue_t epi; - if (fuse_wb_ || pos_ == FcPosition_t::Tail || pos_ == FcPosition_t::Isolated) { - epi = CUBLASLT_EPILOGUE_BGRADA; - __half* bgrad = 
db_out_tensor_.get_ptr(); - cublasLtMatmulDescSetAttribute(cublas_op_desc_wgrad_, CUBLASLT_MATMUL_DESC_BIAS_POINTER, &bgrad, - sizeof(bgrad)); - } else { - epi = CUBLASLT_EPILOGUE_DEFAULT; - } - - HCTR_LIB_THROW(cublasLtMatmulDescSetAttribute(cublas_op_desc_wgrad_, - CUBLASLT_MATMUL_DESC_EPILOGUE, &epi, sizeof(epi))); - - HCTR_LIB_THROW(cublasLtMatmulPreferenceCreate(&cublas_preference_wgrad_)); - - cublaslt_workspace_size_ = 1024 * 1024 * 8; // Set it to 8MB for now - HCTR_LIB_THROW(cudaMalloc(&cublaslt_workspace_wgrad_, cublaslt_workspace_size_)); - HCTR_LIB_THROW(cublasLtMatmulPreferenceSetAttribute( - cublas_preference_wgrad_, CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, &cublaslt_workspace_size_, - sizeof(cublaslt_workspace_size_))); - - uint32_t pointer_mode = CUBLASLT_POINTER_MODE_HOST; - HCTR_LIB_THROW(cublasLtMatmulDescSetAttribute(cublas_op_desc_wgrad_, - CUBLASLT_MATMUL_DESC_POINTER_MODE, &pointer_mode, - sizeof(pointer_mode))); - -#if CUBLAS_VERSION < 120000 - pointer_mode = CUBLASLT_POINTER_MODE_MASK_HOST; - HCTR_LIB_THROW(cublasLtMatmulPreferenceSetAttribute(cublas_preference_wgrad_, - CUBLASLT_MATMUL_PREF_POINTER_MODE_MASK, - &pointer_mode, sizeof(pointer_mode))); - HCTR_LIB_THROW(cublasLtMatmulPreferenceSetAttribute( - cublas_preference_wgrad_, CUBLASLT_MATMUL_PREF_EPILOGUE_MASK, &epi, sizeof(epi))); -#endif - - // By default set algo to best estimated heurstic - cublasLtMatmulHeuristicResult_t heuristic_result; - int returned_res = 0; - HCTR_LIB_THROW(cublasLtMatmulAlgoGetHeuristic( - get_gpu().get_cublaslt_handle(), cublas_op_desc_wgrad_, cublas_dRelu_top_desc_, - cublas_dRelu_bottom_desc_, cublas_kernel_desc_, cublas_kernel_desc_, cublas_preference_wgrad_, - 1, &heuristic_result, &returned_res)); - memcpy(&balgo_wgrad_, &heuristic_result.algo, sizeof(balgo_wgrad_)); - // returned_res is 0 indicates that there is no feasible algorithm. 
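Both the legacy wgrad path deleted here and the core23 path that remains after this patch (see the renamed initialize_wgrad() hunk further down) pick their GEMM the same way: describe the matmul, cap the workspace through a cublasLtMatmulPreference_t, request a single candidate from cublasLtMatmulAlgoGetHeuristic, and treat a returned count of zero as "no feasible algorithm". A minimal standalone sketch of that pattern, assuming FP16 operands with FP32 accumulation; pick_wgrad_algo and its parameters are illustrative names rather than HugeCTR API, and status checking is omitted:

#include <cstdint>
#include <cstring>
#include <cublasLt.h>

// Ask cuBLASLt for one heuristic candidate for D = A * B under a fixed workspace budget.
// A returned count of zero means no algorithm fits the constraints.
static cublasLtMatmulAlgo_t pick_wgrad_algo(cublasLtHandle_t lt, int64_t m, int64_t n, int64_t k,
                                            size_t workspace_bytes) {
  cublasLtMatmulDesc_t op;
  cublasLtMatmulDescCreate(&op, CUBLAS_COMPUTE_32F, CUDA_R_32F);

  cublasLtMatrixLayout_t a_desc, b_desc, d_desc;
  cublasLtMatrixLayoutCreate(&a_desc, CUDA_R_16F, m, k, m);
  cublasLtMatrixLayoutCreate(&b_desc, CUDA_R_16F, k, n, k);
  cublasLtMatrixLayoutCreate(&d_desc, CUDA_R_16F, m, n, m);

  cublasLtMatmulPreference_t pref;
  cublasLtMatmulPreferenceCreate(&pref);
  cublasLtMatmulPreferenceSetAttribute(pref, CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES,
                                       &workspace_bytes, sizeof(workspace_bytes));

  cublasLtMatmulHeuristicResult_t result{};
  int returned = 0;
  cublasLtMatmulAlgoGetHeuristic(lt, op, a_desc, b_desc, d_desc, d_desc, pref,
                                 /*requestedAlgoCount=*/1, &result, &returned);

  cublasLtMatmulAlgo_t algo{};
  if (returned > 0) std::memcpy(&algo, &result.algo, sizeof(algo));
  return algo;
}

The wgrad descriptor in this file additionally folds the bias gradient into the same GEMM through the CUBLASLT_EPILOGUE_BGRADA epilogue set just above, so no separate bias-reduction kernel is needed when that epilogue is active.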
- if (returned_res == 0) { - HCTR_LIB_THROW(CUBLAS_STATUS_NOT_SUPPORTED); - } -} - -void FusedReluBiasFullyConnectedLayer::fprop(bool is_train) { - CudaDeviceContext context(get_device_id()); - - const __half* kernel = weights_half_[0].get_ptr(); - const __half* bias = weights_half_[1].get_ptr(); - const __half* bottom = get_bottom_tensor_fprop(is_train).get_ptr(); - __half* top_fprop = train_out_tensor_.get_ptr(); - __half* mask_out = mask_out_tensor_.get_ptr(); - - const auto& bottom_tensor_dim = get_bottom_tensor_fprop(is_train).get_dimensions(); - const auto& top_tensor_dim = train_out_tensor_.get_dimensions(); - - size_t batch_size = bottom_tensor_dim[0]; - size_t output_size = top_tensor_dim[1]; - size_t input_size = bottom_tensor_dim[1]; - - const float alpha = 1.0f; - const float beta = 0.0f; - - HCTR_LIB_THROW(cublasLtMatmul( - get_gpu().get_cublaslt_handle(), cublas_op_desc_, &alpha, kernel, cublas_kernel_desc_, bottom, - cublas_bottom_desc_, &beta, top_fprop, cublas_top_desc_, top_fprop, cublas_top_desc_, - &falgo_k_, cublaslt_workspace_, cublaslt_workspace_size_, get_gpu().get_stream())); - - if ((pos_ == FcPosition_t::Tail || pos_ == FcPosition_t::Isolated) && - act_ != Activation_t::None) { - size_t len = train_out_tensor_.get_num_elements(); - HCTR_LIB_THROW(cudaMemcpyAsync(mask_out, top_fprop, len * sizeof(__half), - cudaMemcpyDeviceToDevice, get_gpu().get_stream())); - } -} - -void FusedReluBiasFullyConnectedLayer::bprop() { - CudaDeviceContext context(get_device_id()); - - const __half* kernel = weights_half_[0].get_ptr(); - const __half* train_out = train_out_tensor_.get_ptr(); - __half* mask_out = mask_out_tensor_.get_ptr(); - __half* kernel_grad = weights_grad_[0].get_ptr(); - __half* bias_grad = weights_grad_[1].get_ptr(); - __half* bottom = get_bottom_tensor_fprop(true).get_ptr(); - //__half* bottom_bprop = get_bottom_tensor_bprop(true).get_ptr(); - float* bias_grad_float = bias_grad_tensor_.get_ptr(); - __half* dRelu_top = dRelu_out_tensor_.get_ptr(); - const __half* identity = identity_tensor_.get_ptr(); - - const auto& bottom_tensor_dim = get_bottom_tensor_fprop(true).get_dimensions(); - const auto& top_tensor_dim = train_out_tensor_.get_dimensions(); - - size_t batch_size = bottom_tensor_dim[0]; - size_t output_size = top_tensor_dim[1]; - size_t input_size = bottom_tensor_dim[1]; - - const float alpha = 1.0f; - const float beta_k = 1.0f; - const float beta_x = 0.0f; - const float beta_b = 0.0f; - - // dRelu - if (pos_ == FcPosition_t::Tail || pos_ == FcPosition_t::Isolated) { - if (act_ != Activation_t::None) { - if ((batch_size * output_size) % 4 == 0) { - reverse_relu_kernel<<<(batch_size * output_size / 4 - 1) / 1024 + 1, 1024, 0, - get_gpu().get_stream()>>>(dRelu_top, mask_out, train_out, - batch_size * output_size); - } else - reverse_relu_kernel_not_aligned<<<(batch_size * output_size - 1) / 1024 + 1, 1024, 0, - get_gpu().get_stream()>>>(dRelu_top, mask_out, train_out, - batch_size * output_size); - } else - dRelu_top = train_out_tensor_.get_ptr(); - } - - // wait for dRelu - if (async_mlp_wgrad_) { - HCTR_LIB_THROW(cudaEventRecord(event_overlap_, get_gpu().get_stream())); - HCTR_LIB_THROW(cudaStreamWaitEvent(get_gpu().get_comp_overlap_stream(), event_overlap_)); - } - - // bgrad+wgrad - HCTR_LIB_THROW(cublasLtMatmul( - get_gpu().get_cublaslt_handle(), cublas_op_desc_wgrad_, &alpha, dRelu_top, - cublas_dRelu_top_desc_, bottom, cublas_dRelu_bottom_desc_, &beta_k, kernel_grad, - cublas_kernel_desc_, kernel_grad, cublas_kernel_desc_, &balgo_wgrad_, - 
cublaslt_workspace_wgrad_, cublaslt_workspace_size_, - async_mlp_wgrad_ ? get_gpu().get_comp_overlap_stream() : get_gpu().get_stream())); - - // dgrad - if (!skip_dgrad_) { - __half* bottom_bprop; - if (head_mask_in_) { - bottom_bprop = mask_in_tensor_.get_ptr(); - } else { - bottom_bprop = train_in_tensor_.get_ptr(); - } - - if (pos_ == FcPosition_t::Body || pos_ == FcPosition_t::Tail) { - bottom_bprop = dRelu_in_tensor_.get_ptr(); - } - HCTR_LIB_THROW(cublasLtMatmul( - get_gpu().get_cublaslt_handle(), cublas_op_desc_bprop_, &alpha, kernel, cublas_kernel_desc_, - dRelu_top, cublas_dRelu_top_desc_, &beta_x, bottom_bprop, cublas_dRelu_bottom_desc_, - bottom_bprop, cublas_dRelu_bottom_desc_, &balgo_dRelu_, cublaslt_workspace_dRelu_, - cublaslt_workspace_size_, get_gpu().get_stream())); - } - - if (async_mlp_wgrad_ && pos_ == FcPosition_t::Head) { - HCTR_LIB_THROW(cudaEventRecord(event_overlap_, this->get_gpu().get_comp_overlap_stream())); - HCTR_LIB_THROW(cudaStreamWaitEvent(this->get_gpu().get_stream(), event_overlap_)); - } -} - -void FusedReluBiasFullyConnectedLayer::search_algorithm() { - // Set to the CUDA device where this layer assigned to - CudaDeviceContext context(get_device_id()); - const size_t repeat_num = 100; - const int max_algo_count = 16; - - // Device Tensors to be used - __half* bottom = get_bottom_tensor_fprop(true).get_ptr(); - __half* top = train_out_tensor_.get_ptr(); - __half* kernel = weights_half_[0].get_ptr(); - __half* bias = weights_half_[1].get_ptr(); - __half* kernel_grad = weights_grad_[0].get_ptr(); - __half* bias_grad = weights_grad_[1].get_ptr(); - __half* identity = identity_tensor_.get_ptr(); - - // Tensor dim - const auto& bottom_tensor_dim = get_bottom_tensor_fprop(true).get_dimensions(); - const auto& top_tensor_dim = train_out_tensor_.get_dimensions(); - - int batch_size = bottom_tensor_dim[0]; - int output_size = top_tensor_dim[1]; - int input_size = bottom_tensor_dim[1]; - - // Record time for each algorithm - float shortestTime = std::numeric_limits::max(); - float time; - cudaEvent_t start, stop; - HCTR_LIB_THROW(cudaEventCreate(&start)); - HCTR_LIB_THROW(cudaEventCreate(&stop)); - - cublasLtMatmulHeuristicResult_t heuristic_result[max_algo_count] = {0}; - int algo_count = 0; - HCTR_LIB_THROW(cublasLtMatmulAlgoGetHeuristic( - get_gpu().get_cublaslt_handle(), cublas_op_desc_, cublas_kernel_desc_, cublas_bottom_desc_, - cublas_top_desc_, cublas_top_desc_, cublas_preference_, max_algo_count, heuristic_result, - &algo_count)); - - if (algo_count == 0) { - HCTR_LIB_THROW(CUBLAS_STATUS_NOT_SUPPORTED); - } - - for (int algoIdx = 0; algoIdx < algo_count; algoIdx++) { - cublasStatus_t status = CUBLAS_STATUS_SUCCESS; - - const float alpha = 1.0f; - const float beta = 0.0f; - HCTR_LIB_THROW(cudaEventRecord(start, get_gpu().get_stream())); - for (size_t i = 0; i < repeat_num && status == CUBLAS_STATUS_SUCCESS; ++i) { - status = - cublasLtMatmul(get_gpu().get_cublaslt_handle(), cublas_op_desc_, &alpha, kernel, - cublas_kernel_desc_, bottom, cublas_bottom_desc_, &beta, top, - cublas_top_desc_, top, cublas_top_desc_, &heuristic_result[algoIdx].algo, - cublaslt_workspace_, cublaslt_workspace_size_, get_gpu().get_stream()); - } - HCTR_LIB_THROW(cudaEventRecord(stop, get_gpu().get_stream())); - HCTR_LIB_THROW(cudaEventSynchronize(stop)); - HCTR_LIB_THROW(cudaEventElapsedTime(&time, start, stop)); - - // Avg Time(ms) for this algorithm for fprop GEMM - time = time / repeat_num; - // Skip if the algorithm is supported for fprop configuration - if (status != 
CUBLAS_STATUS_SUCCESS) { - // HCTR_LOG(INFO, WORLD, "The algorithms %d is not supported for fprop, skipped.\n", - // testAlgo); - continue; - } - - // if(get_device_id()==0) HCTR_LOG(INFO, WORLD, "Algo: %d, wavesCount: %f, time: %f\n", - // (int)heuristic_result[algoIdx].algo, - // heuristic_result[algoIdx].wavesCount, - // time); - // Record the optimal time and algorithm - if (time < shortestTime) { - shortestTime = time; - memcpy(&falgo_k_, &heuristic_result[algoIdx].algo, sizeof(falgo_k_)); - // if(get_device_id()==0) HCTR_LOG(INFO, WORLD, "Picked algorithm: %d", - // heuristic_result[algoIdx].algo); - } - } - - // dRelu in backward pass - // Reset shortestTime - shortestTime = std::numeric_limits::max(); - cublasLtMatmulHeuristicResult_t heuristic_result_dRelu[max_algo_count] = {0}; - int algo_count_dRelu = 0; - HCTR_LIB_THROW(cublasLtMatmulAlgoGetHeuristic( - get_gpu().get_cublaslt_handle(), cublas_op_desc_bprop_, cublas_kernel_desc_, - cublas_dRelu_top_desc_, cublas_dRelu_bottom_desc_, cublas_dRelu_bottom_desc_, - cublas_preference_dRelu_, max_algo_count, heuristic_result_dRelu, &algo_count_dRelu)); - - if (algo_count_dRelu == 0) { - HCTR_LIB_THROW(CUBLAS_STATUS_NOT_SUPPORTED); - } - - for (int algoIdx = 0; algoIdx < algo_count_dRelu; algoIdx++) { - cublasStatus_t status = CUBLAS_STATUS_SUCCESS; - - const float alpha = 1.0f; - const float beta = 0.0f; - HCTR_LIB_THROW(cudaEventRecord(start, get_gpu().get_stream())); - for (size_t i = 0; i < repeat_num && status == CUBLAS_STATUS_SUCCESS; ++i) { - status = cublasLtMatmul(get_gpu().get_cublaslt_handle(), cublas_op_desc_bprop_, &alpha, - kernel, cublas_kernel_desc_, top, cublas_dRelu_top_desc_, &beta, - bottom, cublas_dRelu_bottom_desc_, bottom, cublas_dRelu_bottom_desc_, - &heuristic_result_dRelu[algoIdx].algo, cublaslt_workspace_dRelu_, - cublaslt_workspace_size_, get_gpu().get_stream()); - } - HCTR_LIB_THROW(cudaEventRecord(stop, get_gpu().get_stream())); - HCTR_LIB_THROW(cudaEventSynchronize(stop)); - HCTR_LIB_THROW(cudaEventElapsedTime(&time, start, stop)); - - // Avg Time(ms) for this algorithm for fprop GEMM - time = time / repeat_num; - // Skip if the algorithm is supported for fprop configuration - if (status != CUBLAS_STATUS_SUCCESS) { - // HCTR_LOG(INFO, WORLD, "The algorithms %d is not supported for fprop, skipped.\n", - // testAlgo); - continue; - } - // Record the optimal time and algorithm - if (time < shortestTime) { - shortestTime = time; - memcpy(&balgo_dRelu_, &heuristic_result_dRelu[algoIdx].algo, sizeof(balgo_dRelu_)); - } - } - - // wgrad in backward pass - // Reset shortestTime - shortestTime = std::numeric_limits::max(); - cublasLtMatmulHeuristicResult_t heuristic_result_wgrad[max_algo_count] = {0}; - int algo_count_wgrad = 0; - HCTR_LIB_THROW(cublasLtMatmulAlgoGetHeuristic( - get_gpu().get_cublaslt_handle(), cublas_op_desc_wgrad_, cublas_dRelu_top_desc_, - cublas_dRelu_bottom_desc_, cublas_kernel_desc_, cublas_kernel_desc_, cublas_preference_wgrad_, - max_algo_count, heuristic_result_wgrad, &algo_count_wgrad)); - - if (algo_count_wgrad == 0) { - HCTR_LIB_THROW(CUBLAS_STATUS_NOT_SUPPORTED); - } - - for (int algoIdx = 0; algoIdx < algo_count_wgrad; algoIdx++) { - cublasStatus_t status = CUBLAS_STATUS_SUCCESS; - - const float alpha = 1.0f; - const float beta = 1.0f; - HCTR_LIB_THROW(cudaEventRecord(start, get_gpu().get_stream())); - for (size_t i = 0; i < repeat_num && status == CUBLAS_STATUS_SUCCESS; ++i) { - status = cublasLtMatmul(get_gpu().get_cublaslt_handle(), cublas_op_desc_wgrad_, &alpha, top, - 
cublas_dRelu_top_desc_, bottom, cublas_dRelu_bottom_desc_, &beta, - kernel, cublas_kernel_desc_, kernel, cublas_kernel_desc_, - &heuristic_result_wgrad[algoIdx].algo, cublaslt_workspace_wgrad_, - cublaslt_workspace_size_, get_gpu().get_stream()); - } - HCTR_LIB_THROW(cudaEventRecord(stop, get_gpu().get_stream())); - HCTR_LIB_THROW(cudaEventSynchronize(stop)); - HCTR_LIB_THROW(cudaEventElapsedTime(&time, start, stop)); - - // Avg Time(ms) for this algorithm for fprop GEMM - time = time / repeat_num; - // HCTR_LOG(INFO, WORLD, "algoIdx: %d, time: %f, shortest time: %f\n", algoIdx, time, - // shortestTime); Skip if the algorithm is supported for fprop configuration - if (status != CUBLAS_STATUS_SUCCESS) { - // HCTR_LOG(INFO, WORLD, "The algorithms %d is not supported for fprop, skipped.\n", - // testAlgo); - continue; - } - // Record the optimal time and algorithm - if (time < shortestTime) { - shortestTime = time; - // HCTR_LOG(INFO, WORLD, "wgrad cublasMatmul algoIdx: %d, time: %f\n", algoIdx, shortestTime); - memcpy(&balgo_wgrad_, &heuristic_result_wgrad[algoIdx].algo, sizeof(balgo_wgrad_)); - } - } - - // Reset shortestTime - shortestTime = std::numeric_limits::max(); - - // Start, end for search - const cublasGemmAlgo_t startAlgo = CUBLAS_GEMM_DEFAULT_TENSOR_OP; - const cublasGemmAlgo_t endAlgo = CUBLAS_GEMM_ALGO15_TENSOR_OP; - - // Search all the algorithm for balgo_k_ - for (int testAlgo = startAlgo; testAlgo <= endAlgo; testAlgo++) { - cublasStatus_t status = CUBLAS_STATUS_SUCCESS; - - const float alpha = 1.0f; - const float beta = 1.0f; - - // Record start event - HCTR_LIB_THROW(cudaEventRecord(start, get_gpu().get_stream())); - for (size_t i = 0; i < repeat_num && status == CUBLAS_STATUS_SUCCESS; ++i) { - status = cublasGemmEx(get_gpu().get_cublas_handle(), CUBLAS_OP_N, CUBLAS_OP_T, output_size, - input_size, batch_size, &alpha, top, CUDA_R_16F, output_size, bottom, - CUDA_R_16F, input_size, &beta, kernel_grad, CUDA_R_16F, output_size, - CUDA_R_32F, static_cast(testAlgo)); - } - HCTR_LIB_THROW(cudaEventRecord(stop, get_gpu().get_stream())); - HCTR_LIB_THROW(cudaEventSynchronize(stop)); - HCTR_LIB_THROW(cudaEventElapsedTime(&time, start, stop)); - // Avg Time(ms) for this algorithm for fprop GEMM - time = time / repeat_num; - // Skip if the algorithm is supported for fprop configuration - if (status != CUBLAS_STATUS_SUCCESS) { - // HCTR_LOG(INFO, WORLD, "The algorithms %d is not supported for bprop_W, skipped.\n", - // testAlgo); - continue; - } - // Record the optimal time and algorithm - if (time < shortestTime) { - shortestTime = time; - // HCTR_LOG(INFO, WORLD, "wgrad cublasGemmEx algoIdx: %d, time: %f\n", testAlgo, - // shortestTime); - balgo_k_ = static_cast(testAlgo); - } - } - - // Reset shortestTime - shortestTime = std::numeric_limits::max(); - - // Search all the algorithm for balgo_b_ - for (int testAlgo = startAlgo; testAlgo <= endAlgo; testAlgo++) { - cublasStatus_t status = CUBLAS_STATUS_SUCCESS; - - const float alpha = 1.0f; - const float beta = 0.0f; - - // Record start event - HCTR_LIB_THROW(cudaEventRecord(start, get_gpu().get_stream())); - for (size_t i = 0; i < repeat_num && status == CUBLAS_STATUS_SUCCESS; ++i) { - status = cublasGemmEx(get_gpu().get_cublas_handle(), CUBLAS_OP_N, CUBLAS_OP_N, output_size, 1, - batch_size, &alpha, top, CUDA_R_16F, output_size, identity, CUDA_R_16F, - batch_size, &beta, bias_grad, CUDA_R_16F, output_size, CUDA_R_32F, - static_cast(testAlgo)); - } - HCTR_LIB_THROW(cudaEventRecord(stop, get_gpu().get_stream())); - 
HCTR_LIB_THROW(cudaEventSynchronize(stop)); - HCTR_LIB_THROW(cudaEventElapsedTime(&time, start, stop)); - // Avg Time(ms) for this algorithm for fprop GEMM - time = time / repeat_num; - // Skip if the algorithm is supported for fprop configuration - if (status != CUBLAS_STATUS_SUCCESS) { - // HCTR_LOG(INFO, WORLD, "The algorithms %d is not supported for bprop_W, skipped.\n", - // testAlgo); - continue; - } - // Record the optimal time and algorithm - if (time < shortestTime) { - shortestTime = time; - balgo_b_ = static_cast(testAlgo); - } - } - // Reset shortestTime - shortestTime = std::numeric_limits::max(); - - // Search all the algorithm for balgo_x_ - for (int testAlgo = startAlgo; testAlgo <= endAlgo; testAlgo++) { - cublasStatus_t status = CUBLAS_STATUS_SUCCESS; - - const __half alpha = 1.0f; - const __half beta = 0.0f; - - // Record start event - HCTR_LIB_THROW(cudaEventRecord(start, get_gpu().get_stream())); - for (size_t i = 0; i < repeat_num && status == CUBLAS_STATUS_SUCCESS; ++i) { - status = cublasGemmEx(get_gpu().get_cublas_handle(), CUBLAS_OP_T, CUBLAS_OP_N, input_size, - batch_size, output_size, &alpha, kernel, CUDA_R_16F, output_size, top, - CUDA_R_16F, output_size, &beta, bottom, CUDA_R_16F, input_size, - CUDA_R_32F, static_cast(testAlgo)); - } - - HCTR_LIB_THROW(cudaEventRecord(stop, get_gpu().get_stream())); - HCTR_LIB_THROW(cudaEventSynchronize(stop)); - HCTR_LIB_THROW(cudaEventElapsedTime(&time, start, stop)); - // Avg Time(ms) for this algorithm for fprop GEMM - time = time / repeat_num; - // Skip if the algorithm is supported for fprop configuration - if (status != CUBLAS_STATUS_SUCCESS) { - // HCTR_LOG(INFO, WORLD, "The algorithms %d is not supported for bprop_Xn, skipped.\n", - // testAlgo); - continue; - } - // Record the optimal time and algorithm - if (time < shortestTime) { - shortestTime = time; - balgo_x_ = static_cast(testAlgo); - } - } - - // Print selection information - // HCTR_LOG(INFO, WORLD, "The algorithm selection for falgo_k_, balgo_k_, balgo_x_ are: %d, %d and - // %d.\n", - // (int)falgo_k_ - CUBLAS_GEMM_DEFAULT_TENSOR_OP, - // (int)balgo_k_ - CUBLAS_GEMM_DEFAULT_TENSOR_OP, - // (int)balgo_x_ - CUBLAS_GEMM_DEFAULT_TENSOR_OP); - - // Output msg - // HCTR_LOG(INFO, ROOT, "The fully-connected layer has finished choosing the algorithm for cublas - // Gemm.\n"); Clean-up - HCTR_LIB_THROW(cudaEventDestroy(start)); - HCTR_LIB_THROW(cudaEventDestroy(stop)); -} // namespace HugeCTR - -std::unique_ptr FusedReluBiasFullyConnectedLayer::get_uniform_initializer( - const int index) { - size_t bottom_dim = get_bottom_tensor_fprop(true).get_dimensions()[1]; - size_t top_dim = train_out_tensor_.get_dimensions()[1]; - - float limit = 1.0f / ((0 == index ? bottom_dim : 0) + top_dim); - return std::make_unique(-1 * limit, limit); -} - -std::unique_ptr FusedReluBiasFullyConnectedLayer::get_xavier_uniform_initializer( - const int index) { - size_t bottom_dim = get_bottom_tensor_fprop(true).get_dimensions()[1]; - size_t top_dim = train_out_tensor_.get_dimensions()[1]; - - return std::make_unique(1.f, data_simu::Mode_t::Fan_avg, - data_simu::Distribution_t::Uniform, - 0 == index ? bottom_dim : 0, top_dim); -} - -std::unique_ptr FusedReluBiasFullyConnectedLayer::get_xavier_norm_initializer( - const int index) { - size_t bottom_dim = get_bottom_tensor_fprop(true).get_dimensions()[1]; - size_t top_dim = train_out_tensor_.get_dimensions()[1]; - - return std::make_unique(1.f, data_simu::Mode_t::Fan_avg, - data_simu::Distribution_t::Norm, - 0 == index ? 
bottom_dim : 0, top_dim); -} - -std::unique_ptr FusedReluBiasFullyConnectedLayer::get_default_initializer( - const int index) { - size_t bottom_dim = get_bottom_tensor_fprop(true).get_dimensions()[1]; - size_t top_dim = train_out_tensor_.get_dimensions()[1]; - - std::unique_ptr simu(nullptr); - if (0 == index) { - simu.reset(new VarianceScalingSimulator(1.f, data_simu::Mode_t::Fan_avg, - data_simu::Distribution_t::Norm, bottom_dim, top_dim)); - } else if (1 == index) { - float stddev = sqrt(1.f / top_dim); - simu.reset(new GaussianDataSimulator(0, stddev, -2 * stddev, 2 * stddev)); - } else { - HCTR_OWN_THROW(Error_t::OutOfBound, "index != {0, 1}."); - } - - return simu; -} - -Core23TempFusedReluBiasFullyConnectedLayer::Core23TempFusedReluBiasFullyConnectedLayer( const core23::Tensor& train_in_tensor, const core23::Tensor& mask_in_tensor, const core23::Tensor& dRelu_in_tensor, const core23::Tensor& db_in_tensor, const core23::Tensor& train_out_tensor, const core23::Tensor& mask_out_tensor, @@ -827,8 +59,8 @@ Core23TempFusedReluBiasFullyConnectedLayer::Core23TempFusedReluBiasFullyConnecte const std::shared_ptr& gpu_resource, const FcPosition_t& pos, const Activation_t& act, const bool& skip_dgrad, std::vector initializer_types, const bool async_mlp_wgrad, const bool head_mask_in, const bool fuse_wb) - : Core23TempTrainableLayer<__half>({train_in_tensor}, {train_out_tensor}, gpu_resource, - initializer_types), + : TrainableLayer<__half>({train_in_tensor}, {train_out_tensor}, gpu_resource, + initializer_types), balgo_k_(CUBLAS_GEMM_DEFAULT_TENSOR_OP), balgo_x_(CUBLAS_GEMM_DEFAULT_TENSOR_OP), balgo_b_(CUBLAS_GEMM_DEFAULT_TENSOR_OP), @@ -891,7 +123,7 @@ Core23TempFusedReluBiasFullyConnectedLayer::Core23TempFusedReluBiasFullyConnecte cublas_handle_wgrad_ = gpu_resource->get_cublas_handle(); } -void Core23TempFusedReluBiasFullyConnectedLayer::initialize() { +void FusedReluBiasFullyConnectedLayer::initialize() { CudaDeviceContext context(get_device_id()); HCTR_LIB_THROW(cudaEventCreate(&event_overlap_)); event_overlap_created_ = true; @@ -976,7 +208,7 @@ void Core23TempFusedReluBiasFullyConnectedLayer::initialize() { initialize_wgrad(); } -void Core23TempFusedReluBiasFullyConnectedLayer::initialize_dgrad() { +void FusedReluBiasFullyConnectedLayer::initialize_dgrad() { // TODO: We need different bottom desc based on is_train or not const auto& bottom_tensor_dim = get_bottom_tensor_fprop(true).shape(); const auto& top_tensor_dim = this->output_tensors_[0].shape(); @@ -1058,7 +290,7 @@ void Core23TempFusedReluBiasFullyConnectedLayer::initialize_dgrad() { } } -void Core23TempFusedReluBiasFullyConnectedLayer::initialize_wgrad() { +void FusedReluBiasFullyConnectedLayer::initialize_wgrad() { // TODO: We need different bottom desc based on is_train or not const auto& bottom_tensor_dim = get_bottom_tensor_fprop(true).shape(); const auto& top_tensor_dim = this->output_tensors_[0].shape(); @@ -1123,7 +355,7 @@ void Core23TempFusedReluBiasFullyConnectedLayer::initialize_wgrad() { } } -void Core23TempFusedReluBiasFullyConnectedLayer::fprop(bool is_train) { +void FusedReluBiasFullyConnectedLayer::fprop(bool is_train) { CudaDeviceContext context(get_device_id()); const __half* kernel = weights_half_[0].data<__half>(); @@ -1155,7 +387,7 @@ void Core23TempFusedReluBiasFullyConnectedLayer::fprop(bool is_train) { } } -void Core23TempFusedReluBiasFullyConnectedLayer::bprop() { +void FusedReluBiasFullyConnectedLayer::bprop() { CudaDeviceContext context(get_device_id()); const __half* kernel = 
weights_half_[0].data<__half>(); @@ -1235,7 +467,7 @@ void Core23TempFusedReluBiasFullyConnectedLayer::bprop() { } } -void Core23TempFusedReluBiasFullyConnectedLayer::search_algorithm() { +void FusedReluBiasFullyConnectedLayer::search_algorithm() { // Set to the CUDA device where this layer assigned to CudaDeviceContext context(get_device_id()); const int64_t repeat_num = 100; @@ -1537,7 +769,7 @@ void Core23TempFusedReluBiasFullyConnectedLayer::search_algorithm() { HCTR_LIB_THROW(cudaEventDestroy(stop)); } // namespace HugeCTR -std::unique_ptr Core23TempFusedReluBiasFullyConnectedLayer::get_uniform_initializer( +std::unique_ptr FusedReluBiasFullyConnectedLayer::get_uniform_initializer( const int index) { int64_t bottom_dim = get_bottom_tensor_fprop(true).shape().size(1); int64_t top_dim = this->output_tensors_[0].shape().size(1); @@ -1546,8 +778,8 @@ std::unique_ptr Core23TempFusedReluBiasFullyConnectedLayer::get_u return std::make_unique(-1 * limit, limit); } -std::unique_ptr -Core23TempFusedReluBiasFullyConnectedLayer::get_xavier_uniform_initializer(const int index) { +std::unique_ptr FusedReluBiasFullyConnectedLayer::get_xavier_uniform_initializer( + const int index) { int64_t bottom_dim = get_bottom_tensor_fprop(true).shape().size(1); int64_t top_dim = this->output_tensors_[0].shape().size(1); @@ -1556,8 +788,8 @@ Core23TempFusedReluBiasFullyConnectedLayer::get_xavier_uniform_initializer(const 0 == index ? bottom_dim : 0, top_dim); } -std::unique_ptr -Core23TempFusedReluBiasFullyConnectedLayer::get_xavier_norm_initializer(const int index) { +std::unique_ptr FusedReluBiasFullyConnectedLayer::get_xavier_norm_initializer( + const int index) { int64_t bottom_dim = get_bottom_tensor_fprop(true).shape().size(1); int64_t top_dim = this->output_tensors_[0].shape().size(1); @@ -1566,7 +798,7 @@ Core23TempFusedReluBiasFullyConnectedLayer::get_xavier_norm_initializer(const in 0 == index ? bottom_dim : 0, top_dim); } -std::unique_ptr Core23TempFusedReluBiasFullyConnectedLayer::get_default_initializer( +std::unique_ptr FusedReluBiasFullyConnectedLayer::get_default_initializer( const int index) { int64_t bottom_dim = get_bottom_tensor_fprop(true).shape().size(1); int64_t top_dim = this->output_tensors_[0].shape().size(1); diff --git a/HugeCTR/src/layers/gru_layer.cu b/HugeCTR/src/layers/gru_layer.cu index e06d69482a..cb7bb9eb21 100644 --- a/HugeCTR/src/layers/gru_layer.cu +++ b/HugeCTR/src/layers/gru_layer.cu @@ -31,386 +31,11 @@ namespace HugeCTR { template -GRULayer::GRULayer(const std::shared_ptr>& weight_buff, - const std::shared_ptr>& wgrad_buff, - const Tensor2& in_tensor, const Tensor2& out_tensor, size_t hiddenSize, - size_t batch_size, size_t SeqLength, size_t embedding_vec_size, - const std::shared_ptr& gpu_resource, +GRULayer::GRULayer(const core23::Tensor& in_tensor, const core23::Tensor& out_tensor, + int64_t hiddenSize, int64_t batch_size, int64_t SeqLength, + int64_t embedding_vec_size, const std::shared_ptr& gpu_resource, std::vector initializer_types) - : Layer(gpu_resource, initializer_types) { - try { - CudaDeviceContext context(this->get_device_id()); - // check the in_tensor and out_tensor - const auto& in_tensor_dim = in_tensor.get_dimensions(); - const auto& out_tensor_dim = out_tensor.get_dimensions(); - - // 2. dim match? 
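On the dimension question raised in the comments here: cuDNN is handed the GRU input as seqLength x miniBatch vectors of width embedding_vec_size and the output as the same layout with width hiddenSize, both via cudnnSetRNNDataDescriptor in the sequence-major unpacked layout (the removed calls appear a little further down this constructor). A minimal sketch of that descriptor setup, assuming every sample uses the full sequence length and FP32 data; make_rnn_data_desc and its arguments are illustrative names, not HugeCTR API:

#include <cudnn.h>
#include <vector>

// Sequence-major, unpacked RNN data descriptor. Pass embedding_vec_size as vector_size
// for the input descriptor and hidden_size for the output descriptor.
static cudnnRNNDataDescriptor_t make_rnn_data_desc(int max_seq_len, int batch_size,
                                                   int vector_size) {
  std::vector<int> seq_lens(batch_size, max_seq_len);  // all samples run the full sequence
  cudnnRNNDataDescriptor_t desc;
  cudnnCreateRNNDataDescriptor(&desc);
  cudnnSetRNNDataDescriptor(desc, CUDNN_DATA_FLOAT, CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_UNPACKED,
                            max_seq_len, batch_size, vector_size, seq_lens.data(),
                            /*paddingFill=*/nullptr);
  return desc;
}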
- // seqLength = in_tensor_dim[1]; - // m = out_tensor_dim[1]; - // miniBatch = in_tensor_dim[0]; - // HCTR_LOG(INFO, WORLD, "m %lu n %lu k %lu \n ", m, n,k); - hiddenSize_ = hiddenSize; - miniBatch = batch_size; - seqLength_ = SeqLength; - embedding_vec_size_ = embedding_vec_size; - - inputTensorSize = miniBatch * seqLength_ * embedding_vec_size_; - outputTensorSize = miniBatch * seqLength_ * hiddenSize_; - hiddenTensorSize = miniBatch * hiddenSize_; - - // weightSpaceSize = m*k + m*m + 1*m; //include W, U weight matrixs and bias vector. - - // HCTR_LIB_THROW(cudnnSetTensor4dDescriptorEx(hDesc, data_type, n, 1, 1, n, - // n, 1, 1, 1)); - - // HCTR_LIB_THROW(cudnnSetTensor4dDescriptorEx(cDesc, data_type, 1, n, m, n, - // n, 1, 1, 1)); - seqLengthArray = new int[miniBatch]; - - for (size_t i = 0; i < miniBatch; i++) { - seqLengthArray[i] = seqLength_; - } - - // cudnnHandle= get_gpu().get_cudnn_handle(); - HCTR_LIB_THROW(cudnnCreate(&cudnnHandle)); - data_type = CudnnDataType::getType(); - HCTR_LIB_THROW(cudnnCreateRNNDescriptor(&rnnDesc)); - HCTR_LIB_THROW(cudnnCreateRNNDataDescriptor(&in_Desc)); - HCTR_LIB_THROW(cudnnCreateRNNDataDescriptor(&out_Desc)); - HCTR_LIB_THROW(cudnnCreateTensorDescriptor(&cDesc)); - HCTR_LIB_THROW(cudnnCreateTensorDescriptor(&hDesc)); - HCTR_LIB_THROW(cudnnCreateDropoutDescriptor(&dropoutDesc)); - - HCTR_LIB_THROW(cudnnSetRNNDataDescriptor( - in_Desc, // cudnnRNNDataDescriptor_t RNNDataDesc, - data_type, // cudnnDataType_t dataType, - CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_UNPACKED, // CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_PACKED, - // //cudnnRNNDataLayout_t layout, - seqLength_, // int maxSeqLength, - miniBatch, // int batchSize, - embedding_vec_size_, // int vectorSize, - seqLengthArray, // const int seqLengthArray[], - NULL // void *paddingFill - )); - - HCTR_LIB_THROW(cudnnSetRNNDataDescriptor( - out_Desc, // cudnnRNNDataDescriptor_t RNNDataDesc, - data_type, // cudnnDataType_t dataType, - CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_UNPACKED, // CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_PACKED, - // //cudnnRNNDataLayout_t layout, - seqLength_, // int maxSeqLength, - miniBatch, // int batchSize, - hiddenSize_, // int vectorSize, - seqLengthArray, // const int seqLengthArray[], - NULL // void *paddingFill - )); - dimHidden[0] = 1 * 1; - dimHidden[1] = miniBatch; - dimHidden[2] = hiddenSize_; - strideHidden[0] = dimHidden[1] * dimHidden[2]; - strideHidden[1] = dimHidden[2]; - strideHidden[2] = 1; - HCTR_LIB_THROW(cudnnSetTensorNdDescriptor(hDesc, data_type, 3, dimHidden, strideHidden)); - HCTR_LIB_THROW(cudnnSetTensorNdDescriptor(cDesc, data_type, 3, dimHidden, strideHidden)); - - HCTR_LIB_THROW(cudnnDropoutGetStatesSize(cudnnHandle, &stateSize)); - HCTR_LIB_THROW(cudaMalloc(&states, stateSize)); - seed = 0; // 1337ull; - HCTR_LIB_THROW( - cudnnSetDropoutDescriptor(dropoutDesc, cudnnHandle, dropout, states, stateSize, seed)); - - HCTR_LIB_THROW(cudnnSetRNNDescriptor_v8( - rnnDesc, - CUDNN_RNN_ALGO_STANDARD, // cudnnRNNAlgo_t algo, - CUDNN_GRU, // cudnnRNNMode_t cellMode, - CUDNN_RNN_SINGLE_INP_BIAS, // cudnnRNNBiasMode_t biasMode, - CUDNN_UNIDIRECTIONAL, // cudnnDirectionMode_t dirMode, - CUDNN_LINEAR_INPUT, // CUDNN_SKIP_INPUT, //CUDNN_LINEAR_INPUT, //cudnnRNNInputMode_t - // inputMode, CUDNN_SKIP_INPUT :without multiplying input by the weight - // matrix - data_type, // cudnnDataType_t dataType, - data_type, // cudnnDataType_t mathPrec, - CUDNN_TENSOR_OP_MATH, // CUDNN_DEFAULT_MATH , //cudnnMathType_t mathType, - embedding_vec_size_, // int32_t embedding_vec_size, When the 
inputMode=CUDNN_SKIP_INPUT, - // the embedding_vec_size should match the hiddenSize value - hiddenSize_, // int32_t hiddenSize, - hiddenSize_, // int32_t projSize, - 1, // int32_t numLayers, BIDIRECTIONAL=2 - dropoutDesc, // cudnnDropoutDescriptor_t dropoutDesc, - CUDNN_RNN_PADDED_IO_DISABLED // uint32_t auxFlags - )); - - // const int seqLengthArray[in_tensor_dim[0]] = { [0...10] = int(in_tensor_dim[1]) }; - // const int seqLengthArray[m] ={n,n....n}; - // for(int i=0; i weight_dim = {weightSpaceSize/sizeof(T), 1}; - // std::vector dx_dim = {inputTensorSize, 1}; - // std::vector dy_dim = {outputTensorSize, 1}; - // std::vector dhx_dim = {hiddenTensorSize, 1}; - // std::vector dhy_dim = {hiddenTensorSize, 1}; - // std::vector dcx_dim = {hiddenTensorSize, 1}; - // std::vector dcy_dim = {hiddenTensorSize, 1}; - - std::vector weight_dim = {1, weightSpaceSize / sizeof(T)}; - std::vector hx_dim = {1, hiddenTensorSize}; - std::vector dx_dim = {1, inputTensorSize}; - std::vector dy_dim = {1, outputTensorSize}; - std::vector dhx_dim = {1, hiddenTensorSize}; - std::vector dhy_dim = {1, hiddenTensorSize}; - std::vector dweigths_dim = {1, weightSpaceSize / sizeof(T)}; - // HCTR_LOG(INFO, WORLD, "weighsize %zu\n", weightSpaceSize/sizeof(T)); - - { - Tensor2 tensor; - weight_buff->reserve(weight_dim, &tensor); - weights_.push_back(tensor); - } - - { - Tensor2 tensor; - weight_buff->reserve(hx_dim, &tensor); - weights_.push_back(tensor); - } - - { - Tensor2 tensor; - wgrad_buff->reserve(dx_dim, &tensor); - wgrad_.push_back(tensor); - } - { - Tensor2 tensor; - wgrad_buff->reserve(dy_dim, &tensor); - wgrad_.push_back(tensor); - } - { - Tensor2 tensor; - wgrad_buff->reserve(dhx_dim, &tensor); - wgrad_.push_back(tensor); - } - { - Tensor2 tensor; - wgrad_buff->reserve(dhy_dim, &tensor); - wgrad_.push_back(tensor); - } - { - Tensor2 tensor; - wgrad_buff->reserve(dweigths_dim, &tensor); - wgrad_.push_back(tensor); - } - - HCTR_LIB_THROW(cudaMalloc((void**)&devSeqLengthArray, miniBatch * sizeof(int))); - HCTR_LIB_THROW(cudaMemcpy(devSeqLengthArray, seqLengthArray, miniBatch * sizeof(int), - cudaMemcpyHostToDevice)); - HCTR_LIB_THROW(cudaMalloc((void**)&weightSpace, weightSpaceSize)); - HCTR_LIB_THROW(cudaMalloc((void**)&workSpace, workSpaceSize)); - HCTR_LIB_THROW(cudaMalloc((void**)&reserveSpace, reserveSpaceSize)); - // HCTR_LIB_THROW(cudaMalloc((void **)&dweightSpace, weightSpaceSize)); - - in_tensors_.push_back(in_tensor); - out_tensors_.push_back(out_tensor); - // Where should we create this cuBLAS handle? 
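Two notes on this setup block: the commented-out get_gpu().get_cudnn_handle() above is how other layers in this patch obtain their library handles (compare get_gpu().get_cublas_handle() in the fully connected layers), so a layer-local cudnnCreate is only needed if the layer truly wants its own handle; and the weightSpaceSize / workSpaceSize / reserveSpaceSize byte counts consumed by the cudaMalloc calls above are typically queried from cuDNN once the RNN and data descriptors exist. A hedged sketch of that query, reusing the rnnDesc and in_Desc members set up earlier in this constructor and assuming the cuDNN 8 API:

// Sketch only (cuDNN 8): query the buffer sizes that the cudaMalloc calls above consume.
size_t weight_bytes = 0, work_bytes = 0, reserve_bytes = 0;
cudnnGetRNNWeightSpaceSize(cudnnHandle, rnnDesc, &weight_bytes);
cudnnGetRNNTempSpaceSizes(cudnnHandle, rnnDesc, CUDNN_FWD_MODE_TRAINING, in_Desc,
                          &work_bytes, &reserve_bytes);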
- } catch (const std::runtime_error& rt_err) { - HCTR_LOG_S(ERROR, WORLD) << rt_err.what() << std::endl; - throw; - } -} - -//#define KERAS_CHECK -template -void GRULayer::fprop(bool is_train) { - CudaDeviceContext context(get_device_id()); - - Tensor2& in_tensor = get_in_tensors(is_train)[0]; - Tensor2& out_tensor = out_tensors_[0]; - - T* weight = weights_[0].get_ptr(); - T* hx = weights_[1].get_ptr(); - // T* Uweight = weights_[1].get_ptr(); - // T* bias = weights_[2].get_ptr(); - - T* in = in_tensor.get_ptr(); - T* out = out_tensor.get_ptr(); -// T* hx = weights_[0].get_ptr(); -// HCTR_LOG(INFO, WORLD, "datatype %lu\n", sizeof(data_type)); -// HCTR_LIB_THROW(cudaMalloc((void **)&in, inputTensorSize * sizeof(T))); - -// HCTR_LIB_THROW(cublasGemmEx(get_gpu().get_cublas_handle(), CUBLAS_OP_N, CUBLAS_OP_N, n, m, k, -// &alpha, weight, CUDA_R_32F, n, in, CUDA_R_32F, k, &beta, out, -// CUDA_R_32F, n, CUDA_R_32F, falgo_)); -#ifdef KERAS_CHECK - cudnnTensorDescriptor_t wDesc; - cudnnTensorDescriptor_t bDesc; - HCTR_LIB_THROW(cudnnCreateTensorDescriptor(&wDesc)); - HCTR_LIB_THROW(cudnnCreateTensorDescriptor(&bDesc)); - - // Tensor2 linLayerMat; - // Tensor2 linLayerBias; - numLinearLayers = 6; // cellMode == CUDNN_GRU - for (int linLayerID = 0; linLayerID < numLinearLayers; linLayerID++) { - T* linLayerMat = NULL; - T* linLayerBias = NULL; - int nbDims = 0; - int dim[3] = {0, 0, 0}, stride[3]; - int layer = 0; - // HCTR_LOG(INFO, WORLD, "weightSpaceSize %zu\n", weightSpaceSize); - HCTR_LIB_THROW(cudnnGetRNNWeightParams(cudnnHandle, rnnDesc, layer, weightSpaceSize, - weights_[0].get_ptr(), // weightSpace, - linLayerID, wDesc, - (void**)&linLayerMat, //.get_ptr(), - bDesc, - (void**)&linLayerBias //.get_ptr() - )); - - if (linLayerMat) { - HCTR_LIB_THROW(cudnnGetTensorNdDescriptor(wDesc, 3, &data_type, &nbDims, dim, stride)); - size_t w = dim[0] * dim[1] * dim[2]; - T* h_weights = new T[w]; - HCTR_LIB_THROW(cudaMemcpy(h_weights, linLayerMat, sizeof(T) * w, cudaMemcpyDeviceToHost)); - - HCTR_LOG(INFO, ROOT, "W_%d %zu ", linLayerID, w); - for (unsigned int i = 0; i < w; i++) { - HCTR_PRINT(INFO, "%f ", h_weights[i]); - } - HCTR_PRINT(INFO, "\n"); - - delete[] h_weights; - } - - if (linLayerBias) { - HCTR_LIB_THROW(cudnnGetTensorNdDescriptor(bDesc, 3, &data_type, &nbDims, dim, stride)); - size_t w = dim[0] * dim[1] * dim[2]; - T* h_weights = new T[w]; - HCTR_LIB_THROW(cudaMemcpy(h_weights, linLayerBias, sizeof(T) * w, cudaMemcpyDeviceToHost)); - - HCTR_LOG(INFO, ROOT, "B_%d %zu ", linLayerID, w); - for (unsigned int i = 0; i < w; i++) { - HCTR_PRINT(INFO, "%f ", h_weights[i]); - } - HCTR_PRINT(INFO, "\n"); - - delete[] h_weights; - } - } - - HCTR_LIB_THROW(cudnnDestroyTensorDescriptor(wDesc)); - HCTR_LIB_THROW(cudnnDestroyTensorDescriptor(bDesc)); -#endif - // CUDNN GRU - // T tmp[hiddenTensorSize]; - // HCTR_LIB_THROW(cudaMemcpy(tmp, weight + weightSpaceSize/sizeof(T), sizeof(T) * - // hiddenTensorSize, cudaMemcpyDeviceToHost)); for(size_t i=0;i