Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement L1Loss #3401

Open
wants to merge 28 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
8de41ef
add kernel for l1lossreducedforward5d
cognaiger9 May 13, 2024
0d00d6d
draft for utilities
cognaiger9 May 16, 2024
005fd3c
add 3 files in include/miopen/l1loss
cognaiger9 May 16, 2024
c3ba011
pull new driver code
cognaiger9 May 17, 2024
a508bbb
Merge branch 'develop-moreh' of github.com:ngoccoder/MIOpen into impl…
cognaiger9 May 17, 2024
de7c0e6
add driver code
cognaiger9 May 17, 2024
7e2014f
fix bug related to workspace
cognaiger9 May 21, 2024
463df2b
add driver for small sized tensor, need to investigate more
cognaiger9 May 22, 2024
415c191
add gtest script
cognaiger9 May 22, 2024
9b1c403
complete gtest cpu and gpu
cognaiger9 May 23, 2024
3bd6980
draft backward phase of l1loss
cognaiger9 May 24, 2024
256c2e0
add driver
cognaiger9 May 24, 2024
4524b09
complete driver for l1loss
cognaiger9 May 27, 2024
8b2f2f5
fix bug related to bfp16 data type in gtest
cognaiger9 May 27, 2024
f2c0750
add filter for forward case
cognaiger9 May 27, 2024
5bc0dfb
add only l1loss forward reduced
cognaiger9 May 30, 2024
3609cb0
remove redundant part
cognaiger9 Jun 4, 2024
8851320
merge rocm develop
cognaiger9 Jul 30, 2024
cdf3853
update benchmark method
cognaiger9 Aug 1, 2024
eeb971d
commit change
cognaiger9 Aug 5, 2024
eb48e6f
Merge branch 'develop' of https://github.com/ROCm/MIOpen into impl_l1…
cognaiger9 Nov 20, 2024
5fa31e6
change reduction procedure, still get inf result
cognaiger9 Nov 22, 2024
6e05582
fix gtest and driver
cognaiger9 Nov 22, 2024
ce8be4f
Merge branch 'develop' of https://github.com/ROCm/MIOpen into impl_l1…
cognaiger9 Nov 22, 2024
675fe1c
Merge branch 'develop' of https://github.com/ROCm/MIOpen into impl_l1…
cognaiger9 Nov 25, 2024
bd01023
Merge branch 'develop' into impl_l1loss_merged
long10024070 Dec 3, 2024
aa1e986
Merge branch 'develop' into impl_l1loss_merged
cognaiger9 Dec 12, 2024
e113fc8
Merge branch 'develop' into impl_l1loss_merged
long10024070 Dec 23, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
fix bug related to workspace
  • Loading branch information
cognaiger9 committed May 21, 2024
commit 7e2014f3432480ad6ae35204325ec3de8e76a500
8 changes: 4 additions & 4 deletions driver/dm_l1loss.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,13 +29,13 @@

// Factory for the standalone MIOpenDriver tool: maps the base command-line
// argument ("l1loss", "l1lossfp16", "l1lossbfp16") to the matching
// L1LossDriver instantiation (GPU type / reference type).
// Returns nullptr when the argument does not name an L1Loss driver, so the
// framework can try the other registered makers.
// NOTE(review): the returned raw pointer is presumably owned and deleted by
// the driver framework — the Driver* interface is fixed by
// REGISTER_DRIVER_MAKER, so a smart pointer cannot be used here.
static Driver* makeDriver(const std::string& base_arg)
{
    if(base_arg == "l1loss")
        return new L1LossDriver<float, float>();
    if(base_arg == "l1lossfp16")
        return new L1LossDriver<float16, float>();
    if(base_arg == "l1lossbfp16")
        return new L1LossDriver<bfloat16, float>();
    return nullptr;
}

REGISTER_DRIVER_MAKER(makeDriver);
3 changes: 2 additions & 1 deletion driver/driver.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -176,7 +176,8 @@ inline std::string ParseBaseArg(int argc, char* argv[])
arg != "layernormfp16" && arg != "layernormbfp16" && arg != "sum" && arg != "sumfp16" &&
arg != "sumbfp16" && arg != "argmax" && arg != "argmaxfp16" && arg != "argmaxbfp16" &&
arg != "groupnorm" && arg != "groupnormfp16" && arg != "groupnormbfp16" && arg != "cat" &&
arg != "catfp16" && arg != "catbfp16" && arg != "l1loss" && arg != "l1lossfp16" && arg != "l1lossbfp16" && arg != "--version")
arg != "catfp16" && arg != "catbfp16" && arg != "l1loss" && arg != "l1lossfp16" &&
arg != "l1lossbfp16" && arg != "--version")
{
printf("FAILED: Invalid Base Input Argument\n");
Usage();
Expand Down
92 changes: 52 additions & 40 deletions driver/l1loss_driver.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@
#define MLO_L1LOSSMHOST_H_

template <typename Tgpu, typename Tcheck>
int32_t mloSmoothL1LossReducedForwardRunHost(const miopenTensorDescriptor_t iDesc,
int32_t mloL1LossReducedForwardRunHost(const miopenTensorDescriptor_t iDesc,
const miopenTensorDescriptor_t tDesc,
const Tgpu* input,
const Tgpu* target,
Expand All @@ -63,18 +63,20 @@ int32_t mloSmoothL1LossReducedForwardRunHost(const miopenTensorDescriptor_t iDes
int32_t divisor = (reduction == MIOPEN_L1LOSS_MEAN_REDUCTION) ? size : 1;

/* Phase 1: Calc loss for each element. */
for (size_t i = 0; i < size; i++) {
for(size_t i = 0; i < size; i++)
{
uint64_t n[5];
GET_NCDHW(n[0], n[1], n[2], n[3], n[4], i, I_tv);
uint64_t Iidx = TV5D_IDX(I_tv, n[0], n[1], n[2], n[3], n[4]);
uint64_t Tidx = TV5D_IDX(T_tv, n[0], n[1], n[2], n[3], n[4]);
uint64_t Iidx = TV5D_IDX(I_tv, n[0], n[1], n[2], n[3], n[4]);
uint64_t Tidx = TV5D_IDX(T_tv, n[0], n[1], n[2], n[3], n[4]);
workspacehost[Iidx] = abs(input[Iidx] - target[Tidx]) / divisor;
}

/* Phase 2: Reduce */
double output = 0.0;
for (size_t i = 0; i < size; i++) {
output += workspacehost[i];
for(size_t i = 0; i < size; i++)
{
output += workspacehost[i];
}
outputhost[0] = output;

Expand Down Expand Up @@ -198,28 +200,34 @@ int L1LossDriver<Tgpu, Tref>::GetandSetData()
reduction = static_cast<miopenL1LossReduction_t>(inflags.GetValueInt("Reduction"));

auto length = GetTensorLengthsFromCmdLine();
auto in_strides = GetStrides(length, 1);
auto tar_strides = GetStrides(length, inflags.GetValueInt("Contiguous"));
//auto in_strides = GetStrides(length, 1);
//auto tar_strides = GetStrides(length, inflags.GetValueInt("Contiguous"));

SetTensorNd(inputDesc, length, in_strides, data_type);
SetTensorNd(targetDesc, length, tar_strides, data_type);
//SetTensorNd(inputDesc, length, in_strides, data_type);
//SetTensorNd(targetDesc, length, tar_strides, data_type);
SetTensorNd(inputDesc, length, data_type);
SetTensorNd(targetDesc, length, data_type);

if(reduction == MIOPEN_L1LOSS_NONE_REDUCTION)
{
SetTensorNd(outputDesc, length, in_strides, data_type);
//SetTensorNd(outputDesc, length, in_strides, data_type);
SetTensorNd(outputDesc, length, data_type);
}
else
{
std::vector<int> out_lens = {1};
SetTensorNd(outputDesc, out_lens, data_type);
}

SetTensorNd(diDesc, length, in_strides, data_type);
SetTensorNd(dtDesc, length, tar_strides, data_type);
//SetTensorNd(diDesc, length, in_strides, data_type);
//SetTensorNd(dtDesc, length, tar_strides, data_type);
SetTensorNd(diDesc, length, data_type);
SetTensorNd(dtDesc, length, data_type);

if(reduction == MIOPEN_L1LOSS_NONE_REDUCTION)
{
SetTensorNd(doDesc, length, in_strides, data_type);
//SetTensorNd(doDesc, length, in_strides, data_type);
SetTensorNd(doDesc, length, data_type);
}
else
{
Expand Down Expand Up @@ -251,7 +259,7 @@ int L1LossDriver<Tgpu, Tref>::AddCmdLineArgs()
"(Default=0)",
"int");
inflags.AddInputFlag("iter", 'i', "10", "Number of Iterations (Default=10)", "int");
inflags.AddInputFlag("verify", 'V', "0", "Verify Each Layer (Default=0)", "int");
inflags.AddInputFlag("verify", 'V', "1", "Verify Each Layer (Default=1)", "int");
inflags.AddInputFlag("time", 't', "0", "Time Each Layer (Default=0)", "int");
inflags.AddInputFlag(
"wall", 'w', "0", "Wall-clock Time Each Layer, Requires time == 1 (Default=0)", "int");
Expand Down Expand Up @@ -301,13 +309,15 @@ int L1LossDriver<Tgpu, Tref>::AllocateBuffersAndCopy()
size_t in_sz = GetTensorSize(inputDesc);
size_t tar_sz = GetTensorSize(targetDesc);
size_t out_sz = GetTensorSize(outputDesc);
size_t ws_sz = GetTensorSize(inputDesc);

miopenGetL1LossForwardWorkspaceSize(GetHandle(), reduction, inputDesc, targetDesc, outputDesc, &ws_sizeInBytes);
miopenGetL1LossForwardWorkspaceSize(
GetHandle(), reduction, inputDesc, targetDesc, outputDesc, &ws_sizeInBytes);

if(ws_sizeInBytes == static_cast<size_t>(-1))
return miopenStatusAllocFailed;

size_t ws_sz = ws_sizeInBytes / sizeof(Tgpu);

uint32_t ctx = 0;

in_dev = std::unique_ptr<GPUMem>(new GPUMem(ctx, in_sz, sizeof(Tgpu)));
Expand Down Expand Up @@ -368,16 +378,16 @@ int L1LossDriver<Tgpu, Tref>::RunForwardGPU()

for(int i = 0; i < inflags.GetValueInt("iter"); i++)
{
miopenSmoothL1LossForward(GetHandle(),
reduction,
workspace_dev->GetMem(),
ws_sizeInBytes,
inputDesc,
in_dev->GetMem(),
targetDesc,
tar_dev->GetMem(),
outputDesc,
out_dev->GetMem());
miopenL1LossForward(GetHandle(),
reduction,
workspace_dev->GetMem(),
ws_sizeInBytes,
inputDesc,
in_dev->GetMem(),
targetDesc,
tar_dev->GetMem(),
outputDesc,
out_dev->GetMem());

float time = 0.0;
miopenGetKernelTime(GetHandle(), &time);
Expand Down Expand Up @@ -411,22 +421,22 @@ int L1LossDriver<Tgpu, Tref>::RunForwardCPU()
{
if(reduction == MIOPEN_L1LOSS_MEAN_REDUCTION || reduction == MIOPEN_L1LOSS_SUM_REDUCTION)
{
mloSmoothL1LossReducedForwardRunHost<Tgpu, Tref>(inputDesc,
targetDesc,
in.data(),
tar.data(),
workspacehost.data(),
outhost.data(),
reduction);
mloL1LossReducedForwardRunHost<Tgpu, Tref>(inputDesc,
targetDesc,
in.data(),
tar.data(),
workspacehost.data(),
outhost.data(),
reduction);
}

return miopenStatusSuccess;
}

/*
template <typename Tgpu, typename Tref>
int L1LossDriver<Tgpu, Tref>::RunBackwardGPU()
{
/*
float kernel_total_time = 0;
float kernel_first_time = 0;

Expand Down Expand Up @@ -478,13 +488,15 @@ int L1LossDriver<Tgpu, Tref>::RunBackwardGPU()
std::cerr << "Error copying (dI_dev) from GPU, size: " << dI_dev->GetSize() << std::endl;
if(dT_dev->FromGPU(GetStream(), dT.data()) != 0)
std::cerr << "Error copying (dT_dev) from GPU, size: " << dT_dev->GetSize() << std::endl;
*/

return miopenStatusSuccess;
}

template <typename Tgpu, typename Tref>
int L1LossDriver<Tgpu, Tref>::RunBackwardCPU()
{
/*
if(!std::isnan(divisor))
{
mloSmoothL1LossReducedBackwardRunHost<Tgpu, Tref>(inputDesc,
Expand All @@ -499,10 +511,10 @@ int L1LossDriver<Tgpu, Tref>::RunBackwardCPU()
beta,
divisor);
}
*/

return miopenStatusSuccess;
}
*/

template <typename Tgpu, typename Tref>
Tref L1LossDriver<Tgpu, Tref>::GetTolerance()
Expand Down Expand Up @@ -531,17 +543,17 @@ int L1LossDriver<Tgpu, Tref>::VerifyForward()
}
else
{
std::cout << "Forward L1Loss Verifies OK on CPU reference (" << error << " < "
<< tolerance << ')' << std::endl;
std::cout << "Forward L1Loss Verifies OK on CPU reference (" << error << " < " << tolerance
<< ')' << std::endl;
}

return miopenStatusSuccess;
}

/*
template <typename Tgpu, typename Tref>
int L1LossDriver<Tgpu, Tref>::VerifyBackward()
{
/*
RunBackwardCPU();
const Tref tolerance = GetTolerance();
auto error_dI = miopen::rms_range(dIhost, dI);
Expand All @@ -559,9 +571,9 @@ int L1LossDriver<Tgpu, Tref>::VerifyBackward()
std::cout << "Backward SmoothL1Loss Verifies OK on CPU reference ({" << error_dI << ","
<< error_dT << "} < " << tolerance << ')' << std::endl;
}
*/

return miopenStatusSuccess;
}
*/

#endif // GUARD_MIOPEN_L1LOSS_DRIVER_HPP
6 changes: 6 additions & 0 deletions src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,9 @@ set( MIOpen_Source
invoker_cache.cpp
kernel_build_params.cpp
kernel_warnings.cpp
l1loss.cpp
l1loss_api.cpp
l1loss/problem_description.cpp
layernorm_api.cpp
layernorm/problem_description.cpp
load_file.cpp
Expand Down Expand Up @@ -260,6 +263,7 @@ set( MIOpen_Source
solver/gemm_bwd.cpp
solver/gemm_wrw.cpp
solver/groupnorm/forward_groupnorm.cpp
solver/l1loss/forward_l1loss.cpp
solver/layernorm/forward_layernorm.cpp
solver/layernorm/forward_layernorm2d_ck.cpp
solver/layernorm/forward_layernorm4d_ck.cpp
Expand Down Expand Up @@ -421,6 +425,7 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN
kernels/neuron.inc
kernels/rocm_version.inc
kernels/stride_array.hpp
kernels/tensor_view_5d.hpp
kernels/utilities.inc
kernels/workaround_issue_1431.hpp
kernels/xform_bidirect_winograd_code.inc
Expand Down Expand Up @@ -455,6 +460,7 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN
kernels/MIOpenConvDirBatchNormActiv.cl
kernels/MIOpenConvDirGenFwd.cl
kernels/MIOpenGroupNorm.cpp
kernels/MIOpenL1Loss.cpp
kernels/MIOpenLayerNorm.cpp
kernels/MIOpenLRNBwd.cl
kernels/MIOpenLRNFwd.cl
Expand Down
2 changes: 1 addition & 1 deletion src/include/miopen/l1loss/problem_description.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ struct L1LossFwdProblemDescription : ProblemDescriptionBase
}
}

miopenL1LossReduction_t GetReduction_() const { return reduction; }
miopenL1LossReduction_t GetReduction() const { return reduction; }
const TensorDescriptor& GetIDesc() const { return iDesc; }
const TensorDescriptor& GetTDesc() const { return tDesc; }
const TensorDescriptor& GetODesc() const { return oDesc; }
Expand Down
1 change: 1 addition & 0 deletions src/include/miopen/l1loss/solvers.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ struct L1LossForward5d final : L1LossForwardSolverBase
std::size_t
GetWorkspaceSize(const ExecutionContext& context,
const miopen::l1loss::L1LossFwdProblemDescription& problem) const override;
bool MayNeedWorkspace() const override { return true; }
};

/*
Expand Down
3 changes: 2 additions & 1 deletion src/include/miopen/solver_id.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,8 @@ enum class Primitive
Reduce,
Cat,
Mha,
Softmax
Softmax,
L1Loss
};

struct MIOPEN_EXPORT Id
Expand Down
2 changes: 0 additions & 2 deletions src/kernels/MIOpenL1Loss.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,6 @@
* SOFTWARE.
*
*******************************************************************************/
#include <__clang_hip_runtime_wrapper.h>
#include <cstddef>
#ifndef MIOPEN_DONT_USE_HIP_RUNTIME_HEADERS
#include <hip/hip_fp16.h>
#include <hip/hip_runtime.h>
Expand Down
18 changes: 10 additions & 8 deletions src/l1loss.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
*
*******************************************************************************/

#include "miopen/l1loss/problem_description.hpp"
#include "miopen/miopen.h"
#include <miopen/datatype.hpp>
#include <miopen/find_solution.hpp>
Expand All @@ -37,23 +38,24 @@
namespace miopen {

// Computes the workspace size, in bytes, required by the L1Loss forward
// solver for the given reduction mode and tensor descriptors.
//
// @param handle    MIOpen handle used to build the execution context.
// @param reduction Reduction mode (none / sum / mean) forwarded into the
//                  problem description.
// @param iDesc     Input tensor descriptor.
// @param tDesc     Target tensor descriptor.
// @param oDesc     Output tensor descriptor.
// @return Workspace size in bytes of the first applicable solver, or
//         static_cast<size_t>(-1) when no solver applies — callers treat
//         that sentinel as an allocation failure.
size_t GetL1LossForwardWorkspaceSize(Handle& handle,
                                     miopenL1LossReduction_t reduction,
                                     const TensorDescriptor& iDesc,
                                     const TensorDescriptor& tDesc,
                                     const TensorDescriptor& oDesc)
{
    auto ctx           = ExecutionContext{&handle};
    const auto problem = l1loss::L1LossFwdProblemDescription{iDesc, tDesc, oDesc, reduction};

    const auto algo    = AlgorithmName{"L1LossForward"};
    const auto solvers = solver::SolverContainer<solver::l1loss::L1LossForward5d>{};

    // Each entry pairs a solver name with its workspace requirement; the
    // front entry belongs to the first (preferred) applicable solver.
    auto pair_size_vector = solvers.GetWorkspaceSizes(ctx, problem);

    return pair_size_vector.empty() ? static_cast<size_t>(-1) : pair_size_vector.front().second;
}

miopenStatus_t SmoothL1LossForward(Handle& handle,
miopenStatus_t L1LossForward(Handle& handle,
miopenL1LossReduction_t reduction,
Data_t workspace,
size_t workspaceSizeInBytes,
Expand All @@ -64,10 +66,10 @@ miopenStatus_t SmoothL1LossForward(Handle& handle,
const TensorDescriptor& oDesc,
Data_t o)
{
const auto problem = l1loss::ReducedForwardProblemDescription{iDesc, tDesc, oDesc};
const auto problem = l1loss::L1LossFwdProblemDescription{iDesc, tDesc, oDesc, reduction};

const auto invoke_params = [&]() {
auto tmp = smoothl1loss::InvokeParams{};
auto tmp = l1loss::InvokeParams{};
tmp.type = InvokeType::Run;
tmp.iDesc = &iDesc;
tmp.tDesc = &tDesc;
Expand All @@ -80,9 +82,9 @@ miopenStatus_t SmoothL1LossForward(Handle& handle,
return tmp;
}();

const auto algo = AlgorithmName{"SmoothL1LossReducedForward"};
const auto algo = AlgorithmName{"L1LossForward"};
const auto solvers =
solver::SolverContainer<solver::smoothl1loss::SmoothL1LossReducedForward5d>{};
solver::SolverContainer<solver::l1loss::L1LossForward5d>{};

solvers.ExecutePrimitive(handle, problem, algo, invoke_params);

Expand Down
Loading