[onert] Support quantize operator (int8 to uint8) (Samsung#5470)
It supports the quantize operator (int8 to uint8).

ONE-DCO-1.0-Signed-off-by: Sanggyu Lee <[email protected]>
glistening authored Dec 22, 2020
1 parent b7e7af9 commit 490aa3c
Showing 3 changed files with 117 additions and 11 deletions.
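
For reference, the new path re-expresses int8 values on the uint8 tensor's scale and zero point: out = clamp(round((in - in_zp) * in_scale / out_scale) + out_zp, 0, 255). A minimal floating-point sketch of that mapping (illustrative only; the kernel added in Quantize.h below uses a precomputed fixed-point multiplier and shift instead):

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>

// Illustrative scalar version of int8 -> uint8 requantization (float math);
// the actual cker kernel works with a fixed-point multiplier/shift pair.
uint8_t requantize_int8_to_uint8(int8_t in, float in_scale, int32_t in_zp,
                                 float out_scale, int32_t out_zp)
{
  const float real = (static_cast<int32_t>(in) - in_zp) * in_scale;               // dequantize
  const int32_t q = static_cast<int32_t>(std::lround(real / out_scale)) + out_zp; // requantize
  return static_cast<uint8_t>(std::min(255, std::max(0, q)));                     // clamp to uint8
}

int main()
{
  // Parameters of the new Int8toUint8 test below: input scale 2.0 / zero point -10,
  // output scale 1.0 / zero point 128. Input -50 should requantize to 48.
  std::printf("%u\n", static_cast<unsigned>(requantize_int8_to_uint8(-50, 2.f, -10, 1.f, 128)));
  return 0;
}
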
73 changes: 73 additions & 0 deletions compute/cker/include/cker/operation/Quantize.h
@@ -123,6 +123,79 @@ inline void Requantize<uint8_t, int8_t>(const uint8_t *input_data, int32_t size,
}
}

template <>
inline void Requantize<int8_t, uint8_t>(const int8_t *input_data, int32_t size,
int32_t effective_scale_multiplier,
int32_t effective_scale_shift, int32_t input_zeropoint,
int32_t output_zeropoint, uint8_t *output_data)
{
static constexpr int32_t kMinOutput = std::numeric_limits<uint8_t>::min();
static constexpr int32_t kMaxOutput = std::numeric_limits<uint8_t>::max();

int i = 0;
#ifdef USE_NEON
// Constants.
const int32x4_t input_zero_point_dup = vdupq_n_s32(-input_zeropoint);
const int32x4_t output_zero_point_dup = vdupq_n_s32(output_zeropoint);
const int32x4_t min_val_dup = vdupq_n_s32(kMinOutput);
const int32x4_t max_val_dup = vdupq_n_s32(kMaxOutput);

for (; i <= size - 16; i += 16)
{
const int8x16_t input_vec = vld1q_s8(input_data + i);
const int16x8_t first_half = vmovl_s8(vget_low_s8(input_vec));
const int16x8_t second_half = vmovl_s8(vget_high_s8(input_vec));
int32x4x4_t input;
input.val[0] = vmovl_s16(vget_low_s16(first_half));
input.val[1] = vmovl_s16(vget_high_s16(first_half));
input.val[2] = vmovl_s16(vget_low_s16(second_half));
input.val[3] = vmovl_s16(vget_high_s16(second_half));
input.val[0] = vaddq_s32(input.val[0], input_zero_point_dup);
input.val[1] = vaddq_s32(input.val[1], input_zero_point_dup);
input.val[2] = vaddq_s32(input.val[2], input_zero_point_dup);
input.val[3] = vaddq_s32(input.val[3], input_zero_point_dup);

int32x4x4_t result =
MultiplyByQuantizedMultiplier4Rows(input, effective_scale_multiplier, effective_scale_shift);

result.val[0] = vaddq_s32(result.val[0], output_zero_point_dup);
result.val[1] = vaddq_s32(result.val[1], output_zero_point_dup);
result.val[2] = vaddq_s32(result.val[2], output_zero_point_dup);
result.val[3] = vaddq_s32(result.val[3], output_zero_point_dup);
result.val[0] = vmaxq_s32(vminq_s32(result.val[0], max_val_dup), min_val_dup);
result.val[1] = vmaxq_s32(vminq_s32(result.val[1], max_val_dup), min_val_dup);
result.val[2] = vmaxq_s32(vminq_s32(result.val[2], max_val_dup), min_val_dup);
result.val[3] = vmaxq_s32(vminq_s32(result.val[3], max_val_dup), min_val_dup);

const uint32x4_t result_val_1_unsigned = vreinterpretq_u32_s32(result.val[0]);
const uint32x4_t result_val_2_unsigned = vreinterpretq_u32_s32(result.val[1]);
const uint32x4_t result_val_3_unsigned = vreinterpretq_u32_s32(result.val[2]);
const uint32x4_t result_val_4_unsigned = vreinterpretq_u32_s32(result.val[3]);

const uint16x4_t narrowed_val_1 = vqmovn_u32(result_val_1_unsigned);
const uint16x4_t narrowed_val_2 = vqmovn_u32(result_val_2_unsigned);
const uint16x4_t narrowed_val_3 = vqmovn_u32(result_val_3_unsigned);
const uint16x4_t narrowed_val_4 = vqmovn_u32(result_val_4_unsigned);
const uint16x8_t output_first_half = vcombine_u16(narrowed_val_1, narrowed_val_2);
const uint16x8_t output_second_half = vcombine_u16(narrowed_val_3, narrowed_val_4);
const uint8x8_t narrowed_first_half = vqmovn_u16(output_first_half);
const uint8x8_t narrowed_second_half = vqmovn_u16(output_second_half);
const uint8x16_t narrowed_result = vcombine_u8(narrowed_first_half, narrowed_second_half);
vst1q_u8(output_data + i, narrowed_result);
}

#endif
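// Scalar fallback: finishes any elements left over from the 16-wide NEON loop,
// and processes the whole range when USE_NEON is not defined.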
for (; i < size; ++i)
{
const int32_t input = input_data[i] - input_zeropoint;
const int32_t output =
MultiplyByQuantizedMultiplier(input, effective_scale_multiplier, effective_scale_shift) +
output_zeropoint;
const int32_t clamped_output = std::max(std::min(output, kMaxOutput), kMinOutput);
output_data[i] = static_cast<uint8_t>(clamped_output);
}
}

} // namespace cker
} // namespace nnfw
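
Both the NEON path and the scalar tail above funnel through MultiplyByQuantizedMultiplier(4Rows). Assuming the usual TFLite-style convention of a Q31 multiplier plus a signed power-of-two shift, its effect can be approximated in floating point as below; this is an illustrative reference, not the cker implementation, which uses saturating rounding fixed-point arithmetic:

#include <cmath>
#include <cstdint>

// Illustrative reference: (quantized_multiplier, shift) encodes the real scale
// m = quantized_multiplier * 2^(shift - 31), so the op computes roughly round(x * m).
int32_t multiply_by_quantized_multiplier_ref(int32_t x, int32_t quantized_multiplier, int shift)
{
  const double m = std::ldexp(static_cast<double>(quantized_multiplier), shift - 31);
  return static_cast<int32_t>(std::lround(x * m));
}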

15 changes: 13 additions & 2 deletions runtime/onert/backend/cpu/ops/QuantizeLayer.cc
@@ -52,8 +52,10 @@ void QuantizeLayer::configure(const IPortableTensor *input, IPortableTensor *out
{
// DO NOTHING
}
-  else if ((input->data_type() == OperandType::QUANT_UINT8_ASYMM) &&
-           (output->data_type() == OperandType::QUANT_INT8_ASYMM))
+  else if (((input->data_type() == OperandType::QUANT_UINT8_ASYMM) &&
+            (output->data_type() == OperandType::QUANT_INT8_ASYMM)) ||
+           ((input->data_type() == OperandType::QUANT_INT8_ASYMM) &&
+            (output->data_type() == OperandType::QUANT_UINT8_ASYMM)))
{
const double effective_output_scale =
static_cast<double>(input->data_scale()) / static_cast<double>(output->data_scale());
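
configure() then folds this effective scale into the (_output_multiplier, _output_shift) pair consumed by run(); that call is truncated in the hunk above. A self-contained sketch of such a decomposition, in the spirit of the usual QuantizeMultiplier helper (the function name here is hypothetical, not the exact onert/cker API):

#include <cmath>
#include <cstdint>

// Split a positive real scale into a Q31 multiplier and a power-of-two shift
// such that scale ~= quantized_multiplier * 2^(shift - 31).
void quantize_multiplier_sketch(double scale, int32_t *quantized_multiplier, int *shift)
{
  if (scale == 0.0)
  {
    *quantized_multiplier = 0;
    *shift = 0;
    return;
  }
  const double q = std::frexp(scale, shift); // scale = q * 2^shift with q in [0.5, 1)
  int64_t q_fixed = static_cast<int64_t>(std::llround(q * (1LL << 31)));
  if (q_fixed == (1LL << 31)) // rounding pushed q up to 1.0; renormalize
  {
    q_fixed /= 2;
    ++(*shift);
  }
  *quantized_multiplier = static_cast<int32_t>(q_fixed);
}
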
@@ -80,6 +82,15 @@ void QuantizeLayer::run()
_output_shift, _input->data_offset(), _output->data_offset(),
reinterpret_cast<int8_t *>(_output->buffer()));
}
else if ((_input->data_type() == OperandType::QUANT_INT8_ASYMM) &&
(_output->data_type() == OperandType::QUANT_UINT8_ASYMM))
{
nnfw::cker::Requantize<int8_t, uint8_t>(
reinterpret_cast<const int8_t *>(_input->buffer()),
MatchingFlatSize(getTensorShape(_input), getTensorShape(_output)), _output_multiplier,
_output_shift, _input->data_offset(), _output->data_offset(),
reinterpret_cast<uint8_t *>(_output->buffer()));
}
else
{
throw std::runtime_error{"Quantize: Unsupported data type"};
40 changes: 31 additions & 9 deletions tests/nnfw_api/src/one_op_tests/Quantize.cc
@@ -18,19 +18,21 @@

#include <memory>

-CircleGen genSimpleQuantizeModel(circle::TensorType from_t, circle::TensorType to_t)
+CircleGen genSimpleQuantizeModel(circle::TensorType from_t, float input_scale, int input_zeropoint,
+                                 circle::TensorType to_t, float output_scale, int output_zeropoint)
{
CircleGen cgen;
-  int in = cgen.addTensor({{1, 4, 4, 1}, from_t}, 1, 128);
-  int out = cgen.addTensor({{1, 4, 4, 1}, to_t}, 2, -10);
+  int in = cgen.addTensor({{1, 4, 4, 1}, from_t}, input_scale, input_zeropoint);
+  int out = cgen.addTensor({{1, 4, 4, 1}, to_t}, output_scale, output_zeropoint);
cgen.addOperatorQuantize({{in}, {out}});
cgen.setInputsAndOutputs({in}, {out});
return cgen;
}

TEST_F(GenModelTest, OneOp_Quantize_Uint8toInt8)
{
-  CircleGen cgen = genSimpleQuantizeModel(circle::TensorType_UINT8, circle::TensorType_INT8);
+  CircleGen cgen =
+    genSimpleQuantizeModel(circle::TensorType_UINT8, 1., 128, circle::TensorType_INT8, 2., -10);
_context = std::make_unique<GenModelTestContext>(cgen.finish());
_context->addTestCase(
TestCaseData{}
@@ -41,15 +41,35 @@ TEST_F(GenModelTest, OneOp_Quantize_Uint8toInt8)
SUCCEED();
}

-TEST_F(GenModelTest, neg_OneOp_Quantize_Uint8toInt16)
+TEST_F(GenModelTest, OneOp_Quantize_Int8toUint8)
{
-  CircleGen cgen = genSimpleQuantizeModel(circle::TensorType_UINT8, circle::TensorType_INT16);
+  CircleGen cgen =
+    genSimpleQuantizeModel(circle::TensorType_INT8, 2., -10, circle::TensorType_UINT8, 1., 128);
_context = std::make_unique<GenModelTestContext>(cgen.finish());
_context->addTestCase(
TestCaseData{}
-      .addInput<uint8_t>({127, 48, 151, 232, 56, 176, 47, 37, 51, 52, 39, 94, 1, 128, 142, 243})
-      .addOutput<int16_t>(
-        {-1, -80, 23, 104, -72, 48, -81, -91, -77, -76, -89, -34, -127, 0, 14, 115}));
+      .addInput<int8_t>({-10, -50, 2, 42, -46, 14, -50, -55, -48, -48, -54, -27, -66, -20, -3, 48})
+      .addOutput<uint8_t>({128, 48, 152, 232, 56, 176, 48, 38, 52, 52, 40, 94, 16, 108, 142, 244}));
_context->setBackends({"cpu"});
SUCCEED();
}
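
The expected outputs in the OneOp_Quantize_Int8toUint8 test above follow directly from its quantization parameters: out = (in - (-10)) * 2.0 / 1.0 + 128, i.e. (in + 10) * 2 + 128, so -50 maps to 48 and 42 maps to 232. A small standalone check under exactly those assumptions:

#include <cassert>
#include <cstdint>

int main()
{
  // Input tensor: scale 2.0, zero point -10 (int8); output tensor: scale 1.0, zero point 128 (uint8).
  const int8_t in[] = {-10, -50, 2, 42, -46, 14, -50, -55, -48, -48, -54, -27, -66, -20, -3, 48};
  const uint8_t expected[] = {128, 48, 152, 232, 56, 176, 48, 38, 52, 52, 40, 94, 16, 108, 142, 244};
  for (int i = 0; i < 16; ++i)
  {
    const int out = (in[i] + 10) * 2 + 128; // (in - in_zp) * in_scale / out_scale + out_zp
    assert(out == expected[i]);
  }
  return 0;
}
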

TEST_F(GenModelTest, neg_OneOp_Quantize_Uint8toInt16)
{
CircleGen cgen =
genSimpleQuantizeModel(circle::TensorType_UINT8, 1., 128, circle::TensorType_INT16, 2., -10);
_context = std::make_unique<GenModelTestContext>(cgen.finish());
_context->setBackends({"acl_cl", "acl_neon", "cpu"});
_context->expectFailModelLoad();

SUCCEED();
}

TEST_F(GenModelTest, neg_OneOp_Quantize_Int8toInt16)
{
CircleGen cgen =
genSimpleQuantizeModel(circle::TensorType_INT8, 2., -10, circle::TensorType_INT16, 1., 128);
_context = std::make_unique<GenModelTestContext>(cgen.finish());
_context->setBackends({"acl_cl", "acl_neon", "cpu"});
_context->expectFailModelLoad();
