diff --git a/src/cudamatrix/cu-kernels-ansi.h b/src/cudamatrix/cu-kernels-ansi.h index 16ff20aa1b1..c0667f7a4d5 100644 --- a/src/cudamatrix/cu-kernels-ansi.h +++ b/src/cudamatrix/cu-kernels-ansi.h @@ -87,6 +87,10 @@ void cudaF_mul_cols_vec(dim3 Gr, dim3 Bl, float *mat, const float *scale, Matrix void cudaF_mul_rows_vec(dim3 Gr, dim3 Bl, float *mat, const float *scale, MatrixDim d); void cudaF_mul_rows_group_mat(dim3 Gr, dim3 Bl, float *y, const float *x, MatrixDim d, int src_stride, int group_size); void cudaF_calc_pnorm_deriv(dim3 Gr, dim3 Bl, float *y, const float *x1, const float *x2, MatrixDim d, int src_stride, int group_size, float power); +void cudaF_diff_group_pnorm(dim3 Gr, dim3 Bl, float *id, const float *iv, + const float *ov, const float* od, MatrixDim id_dim, + int iv_stride, int ov_stride, int od_stride, + int group_size, float power); void cudaF_calc_group_max_deriv(dim3 Gr, dim3 Bl, float *y, const float *x1, const float *x2, MatrixDim d, int src_stride, int group_size); void cudaF_div_rows_vec(dim3 Gr, dim3 Bl, float *mat, const float *vec_div, MatrixDim d); void cudaF_add_mat(dim3 Gr, dim3 Bl, float alpha, const float *src, float *dst, MatrixDim d, int src_stride, int A_trans); @@ -234,6 +238,10 @@ void cudaD_mul_cols_vec(dim3 Gr, dim3 Bl, double *mat, const double *scale, Matr void cudaD_mul_rows_vec(dim3 Gr, dim3 Bl, double *mat, const double *scale, MatrixDim d); void cudaD_mul_rows_group_mat(dim3 Gr, dim3 Bl, double *y, const double *x, MatrixDim d, int src_stride, int group_size); void cudaD_calc_pnorm_deriv(dim3 Gr, dim3 Bl, double *y, const double *x1, const double *x2, MatrixDim d, int src_stride, int group_size, double power); +void cudaD_diff_group_pnorm(dim3 Gr, dim3 Bl, double *id, const double *iv, + const double *ov, const double* od, + MatrixDim id_dim, int iv_stride, int ov_stride, + int od_stride, int group_size, double power); void cudaD_calc_group_max_deriv(dim3 Gr, dim3 Bl, double *y, const double *x1, const double *x2, MatrixDim d, int src_stride, int group_size); void cudaD_div_rows_vec(dim3 Gr, dim3 Bl, double *mat, const double *vec_div, MatrixDim d); void cudaD_add_mat(dim3 Gr, dim3 Bl, double alpha, const double *src, double *dst, MatrixDim d, int src_stride, int A_trans); diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu index c49eb5808ca..b916e1eef74 100644 --- a/src/cudamatrix/cu-kernels.cu +++ b/src/cudamatrix/cu-kernels.cu @@ -435,6 +435,55 @@ static void _calc_pnorm_deriv(Real *deriv, const Real *vec, const Real *norm, } } + +template +__global__ +void _diff_group_pnorm(Real *id, const Real *iv, const Real *ov, const Real* od, + MatrixDim id_dim, int iv_stride, int ov_stride, + int od_stride, int group_size, Real power) { + const int j = blockIdx.x * blockDim.x + threadIdx.x; + if (j < id_dim.cols) { + const int grid_stride = gridDim.y * blockDim.y; + const int src_j = j / group_size; + int i = blockIdx.y * blockDim.y + threadIdx.y; + for (; i < id_dim.rows; i += grid_stride) { + const int iv_index = j + i * iv_stride; + Real iv_ij = iv[iv_index]; + Real ans; + if (power == Real(2)) { + const int ov_index = src_j + i * ov_stride; + Real ov_ij = ov[ov_index]; + ans = ov_ij <= 0.0 ? 0.0 : iv_ij / ov_ij; + } else if (power == Real(1)) { + Real iv_ij_sign = (iv_ij >= 0 ? 1 : -1); + ans = (iv_ij == Real(0) ? 0.0 : iv_ij_sign); + } else if (power + == (sizeof(Real) == sizeof(float) ? CUDART_INF_F : CUDART_INF)) { + const int ov_index = src_j + i * ov_stride; + Real ov_ij = ov[ov_index]; + Real iv_ij_sign = (iv_ij >= 0 ? 1 : -1); + ans = + ov_ij <= 0.0 ? + 0.0 : (iv_ij_sign * (abs(iv_ij) == ov_ij ? 1.0 : 0.0)); + } else { + const int ov_index = src_j + i * ov_stride; + Real ov_ij = ov[ov_index]; + Real iv_ij_sign = (iv_ij >= 0 ? 1 : -1); + if (ov_ij <= 0.0) { + ans = 0.0; // The derivative is either zero or undefined at the origin. + } else { + ans = iv_ij_sign * pow(std::abs(iv_ij), power - 1) + * pow(ov_ij, 1 - power); + } + } + const int od_index = src_j + i * od_stride; + const int id_index = j + i * id_dim.stride; + id[id_index] = ans * od[od_index]; + } + } +} + + /// deriv is the derivative we will output; vec is the input we're computing /// the group max on, "maxv" is the previously computed group max. template @@ -1962,7 +2011,7 @@ static void _softmax_reduce(Real*y, const Real*x, MatrixDim d, int src_stride) { // find max element of the row // reduce to CU1DBLOCK elements per row. - Real tmax = Real(-1.0 / 0.0); + Real tmax = sizeof(Real) == sizeof(float) ? -CUDART_INF_F : -CUDART_INF; for (int j = tid; j < d.cols; j += CU1DBLOCK) { tmax = max(tmax, x[x_start + j]); } @@ -2534,6 +2583,14 @@ void cudaF_calc_pnorm_deriv(dim3 Gr, dim3 Bl, float *y, const float *x1, _calc_pnorm_deriv<<>>(y, x1, x2, d, src_stride, group_size, power); } +void cudaF_diff_group_pnorm(dim3 Gr, dim3 Bl, float *id, const float *iv, + const float *ov, const float* od, MatrixDim id_dim, + int iv_stride, int ov_stride, int od_stride, + int group_size, float power) { + _diff_group_pnorm<<>>(id, iv, ov, od, id_dim, iv_stride, ov_stride, + od_stride, group_size, power); +} + void cudaF_calc_group_max_deriv(dim3 Gr, dim3 Bl, float *y, const float *x1, const float *x2, MatrixDim d, int src_stride, int group_size) { @@ -3033,6 +3090,14 @@ void cudaD_calc_pnorm_deriv(dim3 Gr, dim3 Bl, double*y, const double* x1, _calc_pnorm_deriv<<>>(y, x1, x2, d, src_stride, group_size, power); } +void cudaD_diff_group_pnorm(dim3 Gr, dim3 Bl, double *id, const double *iv, + const double *ov, const double* od, + MatrixDim id_dim, int iv_stride, int ov_stride, + int od_stride, int group_size, double power) { + _diff_group_pnorm<<>>(id, iv, ov, od, id_dim, iv_stride, ov_stride, + od_stride, group_size, power); +} + void cudaD_calc_group_max_deriv(dim3 Gr, dim3 Bl, double*y, const double* x1, const double* x2, MatrixDim d, int src_stride, int group_size) { diff --git a/src/cudamatrix/cu-kernels.h b/src/cudamatrix/cu-kernels.h index 7cf9a8fa115..484227ec041 100644 --- a/src/cudamatrix/cu-kernels.h +++ b/src/cudamatrix/cu-kernels.h @@ -167,6 +167,14 @@ inline void cuda_mul_cols_vec(dim3 Gr, dim3 Bl, float *mat, const float *scale, inline void cuda_mul_rows_vec(dim3 Gr, dim3 Bl, float *mat, const float *scale, MatrixDim d) { cudaF_mul_rows_vec(Gr,Bl,mat,scale,d); } inline void cuda_mul_rows_group_mat(dim3 Gr, dim3 Bl, float *y, const float *x, MatrixDim d, int src_stride, int group_size) { cudaF_mul_rows_group_mat(Gr, Bl, y, x, d, src_stride, group_size); } inline void cuda_calc_pnorm_deriv(dim3 Gr, dim3 Bl, float *y, const float *x1, const float *x2, MatrixDim d, int src_stride, int group_size, float power) {cudaF_calc_pnorm_deriv(Gr, Bl, y, x1, x2, d, src_stride, group_size, power); } +inline void cuda_diff_group_pnorm(dim3 Gr, dim3 Bl, float *id, const float *iv, + const float *ov, const float* od, + MatrixDim id_dim, int iv_stride, + int ov_stride, int od_stride, int group_size, + float power) { + cudaF_diff_group_pnorm(Gr, Bl, id, iv, ov, od, id_dim, iv_stride, ov_stride, + od_stride, group_size, power); +} inline void cuda_calc_group_max_deriv(dim3 Gr, dim3 Bl, float *y, const float *x1, const float *x2, MatrixDim d, int src_stride, int group_size) {cudaF_calc_group_max_deriv(Gr, Bl, y, x1, x2, d, src_stride, group_size); } inline void cuda_add_mat(dim3 Gr, dim3 Bl, float alpha, const float *src, float *dst, MatrixDim d, int src_stride, int A_trans) { cudaF_add_mat(Gr,Bl,alpha,src,dst,d,src_stride, A_trans); } inline void cuda_add_mat_blocks(dim3 Gr, dim3 Bl, float alpha, const float *src, int32_cuda num_row_blocks, int32_cuda num_col_blocks, float *dst, MatrixDim d, int src_stride, int A_trans) { cudaF_add_mat_blocks(Gr, Bl, alpha, src, num_row_blocks, num_col_blocks, dst, d, src_stride, A_trans); } @@ -365,6 +373,14 @@ inline void cuda_mul_cols_vec(dim3 Gr, dim3 Bl, double *mat, const double *scale inline void cuda_mul_rows_vec(dim3 Gr, dim3 Bl, double *mat, const double *scale, MatrixDim d) { cudaD_mul_rows_vec(Gr,Bl,mat,scale,d); } inline void cuda_mul_rows_group_mat(dim3 Gr, dim3 Bl, double *y, const double *x, MatrixDim d, int src_stride, int group_size) { cudaD_mul_rows_group_mat(Gr, Bl, y, x, d, src_stride, group_size); } inline void cuda_calc_pnorm_deriv(dim3 Gr, dim3 Bl, double *y, const double *x1, const double *x2, MatrixDim d, int src_stride, int group_size, double power) {cudaD_calc_pnorm_deriv(Gr, Bl, y, x1, x2, d, src_stride, group_size, power); } +inline void cuda_diff_group_pnorm(dim3 Gr, dim3 Bl, double *id, + const double *iv, const double *ov, + const double* od, MatrixDim id_dim, + int iv_stride, int ov_stride, int od_stride, + int group_size, double power) { + cudaD_diff_group_pnorm(Gr, Bl, id, iv, ov, od, id_dim, iv_stride, ov_stride, + od_stride, group_size, power); +} inline void cuda_calc_group_max_deriv(dim3 Gr, dim3 Bl, double *y, const double *x1, const double *x2, MatrixDim d, int src_stride, int group_size) {cudaD_calc_group_max_deriv(Gr, Bl, y, x1, x2, d, src_stride, group_size); } inline void cuda_add_mat(dim3 Gr, dim3 Bl, double alpha, const double *src, double *dst, MatrixDim d, int src_stride, int A_trans) { cudaD_add_mat(Gr,Bl,alpha,src,dst,d,src_stride, A_trans); } inline void cuda_add_mat_blocks(dim3 Gr, dim3 Bl, double alpha, const double *src, int32_cuda num_row_blocks, int32_cuda num_col_blocks, double *dst, MatrixDim d, int src_stride, int A_trans) { cudaD_add_mat_blocks(Gr, Bl, alpha, src, num_row_blocks, num_col_blocks, dst, d, src_stride, A_trans); } diff --git a/src/cudamatrix/cu-matrix-speed-test.cc b/src/cudamatrix/cu-matrix-speed-test.cc index 5a9d3e1eb2f..f815f0b720f 100644 --- a/src/cudamatrix/cu-matrix-speed-test.cc +++ b/src/cudamatrix/cu-matrix-speed-test.cc @@ -547,6 +547,26 @@ template void TestCuMatrixGroupPnormDeriv(int32 dim) { << dim << ", speed was " << gflops << " gigaflops."; } +template void TestCuMatrixDiffGroupPnorm(int32 dim) { + BaseFloat time_in_secs = 0.025; + int32 group_size = 8; + CuMatrix iv(dim, dim), ov(dim, dim / group_size); + CuMatrix id(dim, dim), od(dim, dim / group_size); + iv.SetRandn(); + od.SetRandn(); + ov.GroupPnorm(iv, 2.0); + Timer tim; + int32 iter = 0; + + for (; tim.Elapsed() < time_in_secs; iter++) + id.DiffGroupPnorm(iv, ov, od, 2.0); + + BaseFloat fdim = dim; + BaseFloat gflops = (fdim * fdim * iter) / (tim.Elapsed() * 1.0e+09); + KALDI_LOG << "For CuMatrix::DiffGroupPnorm" << NameOf() << ", for dim = " + << dim << ", speed was " << gflops << " gigaflops."; +} + template void TestCuMatrixGroupMax(int32 dim) { BaseFloat time_in_secs = 0.025; int32 group_size = 4; @@ -1007,6 +1027,8 @@ template void CudaMatrixSpeedTest() { TestCuMatrixGroupPnorm(sizes[s]); for (int32 s = 0; s < ns; s++) TestCuMatrixGroupPnormDeriv(sizes[s]); + for (int32 s = 0; s < ns; s++) + TestCuMatrixDiffGroupPnorm(sizes[s]); for (int32 s = 0; s < ns; s++) TestCuMatrixGroupMax(sizes[s]); for (int32 s = 0; s < ns; s++) diff --git a/src/cudamatrix/cu-matrix-test.cc b/src/cudamatrix/cu-matrix-test.cc index e20ae324db2..3a247c5d298 100644 --- a/src/cudamatrix/cu-matrix-test.cc +++ b/src/cudamatrix/cu-matrix-test.cc @@ -955,6 +955,46 @@ static void UnitTestCuMatrixGroupPnormDeriv() { AssertEqual(Hr,Hr2); } +template +static void UnitTestCuMatrixDiffGroupPnorm() { + Real p[] = { 1.234, 2.345, 1, 2, std::numeric_limits::infinity() }; + for (int i = 0; i < 2 * sizeof(p) / sizeof(Real); i++) { + int32 dimM = 100 + Rand() % 200, dimNs = 100 + Rand() % 200; + int32 group_size = 1 + Rand() % 10; + BaseFloat power = p[i / 2]; + int32 dimN = group_size * dimNs; + Matrix Hiv(dimM, dimN); + Matrix Hov(dimM, dimNs); + Matrix Hid(dimM, dimN); + Matrix Hod(dimM, dimNs); + Hiv.SetRandn(); + Hod.SetRandn(); + if (i % 2 == 0) + Hiv.ApplyFloor(0.0); // will put some zeros in the matrix.. harder to + // do derivatives. + Hov.GroupPnorm(Hiv, power); + CuMatrix Div(dimM, dimN); + CuMatrix Dov(dimM, dimNs); + CuMatrix Did(dimM, dimN); + CuMatrix Dod(dimM, dimNs); + Div.CopyFromMat(Hiv); + Dod.CopyFromMat(Hod); + Dov.CopyFromMat(Hov); + + // GPU + Did.DiffGroupPnorm(Div, Dov, Dod, power); + + // CPU + Hid.GroupPnormDeriv(Hiv, Hov, power); + Hid.MulRowsGroupMat(Hod); + + Matrix Hid2(dimM, dimN); + Did.CopyToMat(&Hid2); + AssertEqual(Hid, Hid2); + } +} + + template static void UnitTestCuMatrixGroupMaxDeriv() { int32 dimM = 100 + Rand() % 200, dimNs = 100 + Rand() % 200; @@ -2580,6 +2620,7 @@ template void CudaMatrixUnitTest() { UnitTestCuDiffSoftmax(); UnitTestCuMatrixGroupPnorm(); UnitTestCuMatrixGroupPnormDeriv(); + UnitTestCuMatrixDiffGroupPnorm(); UnitTestCuMatrixGroupMax(); UnitTestCuMatrixGroupMaxDeriv(); UnitTestCuMatrixMulRowsVec(); diff --git a/src/cudamatrix/cu-matrix.cc b/src/cudamatrix/cu-matrix.cc index 68875ca1de6..d23e1a41802 100644 --- a/src/cudamatrix/cu-matrix.cc +++ b/src/cudamatrix/cu-matrix.cc @@ -822,6 +822,39 @@ void CuMatrixBase::GroupPnormDeriv(const CuMatrixBase &src1, } } +template +void CuMatrixBase::DiffGroupPnorm(const CuMatrixBase &in_value, + const CuMatrixBase &out_value, + const CuMatrixBase &out_deriv, + Real power) { + KALDI_ASSERT(out_value.NumCols() > 0); + KALDI_ASSERT(out_value.NumCols() == out_deriv.NumCols()); + int group_size = this->NumCols() / out_value.NumCols(); + KALDI_ASSERT(this->NumCols() == out_value.NumCols() * group_size); +#if HAVE_CUDA == 1 + if (CuDevice::Instantiate().Enabled()) { + Timer tim; + const int kWarpSize = 32; + dim3 dimBlock(kWarpSize, CU1DBLOCK / kWarpSize); + dim3 dimGrid(n_blocks(NumCols(), dimBlock.x), + n_blocks(NumRows(), dimBlock.y)); + if (dimGrid.x * dimGrid.y > 1024) { + dimGrid.y = std::max(1024 / dimGrid.x, unsigned(1)); + } + cuda_diff_group_pnorm(dimGrid, dimBlock, this->data_, in_value.Data(), + out_value.Data(), out_deriv.Data(), Dim(), + in_value.Stride(), out_value.Stride(), + out_deriv.Stride(), group_size, power); + CU_SAFE_CALL(cudaGetLastError()); + CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); + } else +#endif + { + GroupPnormDeriv(in_value, out_value, power); + MulRowsGroupMat(out_deriv); + } +} + template void CuMatrixBase::GroupMaxDeriv(const CuMatrixBase &src1, const CuMatrixBase &src2) { diff --git a/src/cudamatrix/cu-matrix.h b/src/cudamatrix/cu-matrix.h index 836af860696..3b0a0dc23f3 100644 --- a/src/cudamatrix/cu-matrix.h +++ b/src/cudamatrix/cu-matrix.h @@ -279,6 +279,12 @@ class CuMatrixBase { void GroupPnormDeriv(const CuMatrixBase &input, const CuMatrixBase &output, Real power); + /// Differentiate backward through the GroupPnorm function. + /// It is a combination of GroupPnormDeriv and MulRowsGroupMax. + void DiffGroupPnorm(const CuMatrixBase &in_value, + const CuMatrixBase &out_value, + const CuMatrixBase &out_deriv, Real power); + /// Apply the function y(i) = (max_{j = i*G}^{(i+1)*G-1} x_j /// where G = x.NumCols() / y.NumCols() must be an integer. /// [note: y corresponds to *this and x to src, so diff --git a/src/matrix/kaldi-matrix.cc b/src/matrix/kaldi-matrix.cc index 3fa909f36d3..51bf4522124 100644 --- a/src/matrix/kaldi-matrix.cc +++ b/src/matrix/kaldi-matrix.cc @@ -1105,6 +1105,17 @@ void MatrixBase::GroupPnormDeriv(const MatrixBase &input, (*this)(i, j) = (input_val == 0 ? 0 : (input_val > 0 ? 1 : -1)); } } + } else if (power == std::numeric_limits::infinity()) { + for (MatrixIndexT i = 0; i < num_rows; i++) { + for (MatrixIndexT j = 0; j < num_cols; j++) { + Real output_val = output(i, j / group_size), input_val = input(i, j); + if (output_val == 0) + (*this)(i, j) = 0; + else + (*this)(i, j) = (std::abs(input_val) == output_val ? 1.0 : 0.0) + * (input_val >= 0 ? 1 : -1); + } + } } else { for (MatrixIndexT i = 0; i < num_rows; i++) { for (MatrixIndexT j = 0; j < num_cols; j++) { diff --git a/src/nnet2/nnet-component.cc b/src/nnet2/nnet-component.cc index f59a0c4ebea..62713126007 100644 --- a/src/nnet2/nnet-component.cc +++ b/src/nnet2/nnet-component.cc @@ -531,8 +531,7 @@ void PnormComponent::Backprop(const ChunkInfo &, // in_info, // may be identical to "this". CuMatrix *in_deriv) const { in_deriv->Resize(in_value.NumRows(), in_value.NumCols(), kSetZero); - in_deriv->GroupPnormDeriv(in_value, out_value, p_); - in_deriv->MulRowsGroupMat(out_deriv); + in_deriv->DiffGroupPnorm(in_value, out_value, out_deriv, p_); } void PnormComponent::Read(std::istream &is, bool binary) { diff --git a/src/nnet3/nnet-simple-component.cc b/src/nnet3/nnet-simple-component.cc index bc6a122c3bd..b28c24a613c 100644 --- a/src/nnet3/nnet-simple-component.cc +++ b/src/nnet3/nnet-simple-component.cc @@ -52,7 +52,7 @@ void PnormComponent::Propagate(const ComponentPrecomputedIndexes *indexes, const CuMatrixBase &in, CuMatrixBase *out) const { BaseFloat p = 2.0; - out->GroupPnorm(in, p); // TODO: when done, replace with Group2Norm function. + out->GroupPnorm(in, p); } void PnormComponent::Backprop(const std::string &debug_info, @@ -62,11 +62,10 @@ void PnormComponent::Backprop(const std::string &debug_info, const CuMatrixBase &out_deriv, Component *to_update, CuMatrixBase *in_deriv) const { - if (!in_deriv) return; + if (!in_deriv) + return; BaseFloat p = 2.0; - // TODO: use Group2NormDeriv when done. - in_deriv->GroupPnormDeriv(in_value, out_value, p); - in_deriv->MulRowsGroupMat(out_deriv); + in_deriv->DiffGroupPnorm(in_value, out_value, out_deriv, p); } void PnormComponent::Read(std::istream &is, bool binary) {