Skip to content

Commit

Permalink
[PyTorch Edge] Add OwnedOrBorrowedVector for QNNPack BCSR Indices/Val…
Browse files Browse the repository at this point in the history
…ues (pytorch#80476)

OwnedOrBorrowedVector allows us to create a BCSR which uses preexisting data pointers for its indices and values, so we can avoid copying data when loading. It also supports creating a BCSR which owns indices and values data in vectors as before.

Differential Revision: [D36956640](https://our.internmc.facebook.com/intern/diff/D36956640/)

**NOTE FOR REVIEWERS**: This PR has internal Facebook specific changes or comments, please review them on [Phabricator](https://our.internmc.facebook.com/intern/diff/D36956640/)!
Pull Request resolved: pytorch#80476
Approved by: https://github.com/qihqi
  • Loading branch information
salilsdesai authored and pytorchmergebot committed Jul 7, 2022
1 parent 5c12cd2 commit 98e4524
Show file tree
Hide file tree
Showing 7 changed files with 141 additions and 83 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ using BCSRSerializationType =
bool, // Quantization Scheme (true: per tensor, false: per channel)
at::Tensor, // Wrapper for Row Block Indices (int32_t)
at::Tensor, // Wrapper for Column Block Indices (int32_t)
at::Tensor, // Wrapper for Non-Zero Weight Values (int8_t)
at::Tensor, // Wrapper for Non-Zero Weight Values, each +128 (uint8_t)
int64_t, // Number of Output Channels
int64_t // Number of Input Channels
>;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -109,13 +109,28 @@ c10::intrusive_ptr<LinearPackedParamsBase> PackedLinearWeight::deserialize(
device(c10::kCPU).dtype(c10::kQInt8));
}

const at::Tensor loaded_weight_values =
std::get<weight_values_index>(serialized);
const uint8_t* loaded_weight_values_ptr =
loaded_weight_values.data_ptr<uint8_t>();
const int64_t loaded_weight_values_size = loaded_weight_values.numel();
// Subtract 128 because we serialize as +128, which is best for
// minimizing memory footprint for QNNPack
std::vector<int8_t> weight_values(loaded_weight_values_size);
std::transform(
loaded_weight_values_ptr,
loaded_weight_values_ptr + loaded_weight_values_size,
weight_values.begin(),
[](uint8_t v) {
return static_cast<int8_t>(static_cast<int16_t>(v) - 128);
});

// Unpack as non backend specific untiled BCSR then pack as Fbgemm tiled BCSR
// because untiled Fbgemm BCSR currently doesn't exist
unpack_bcsr(
reinterpret_cast<int8_t*>(weight_origin.data_ptr<c10::qint8>()),
ao::sparse::BCSR(
unwrap_vector<int8_t, int8_t>(
std::get<weight_values_index>(serialized)), // Weight Values
std::move(weight_values),
unwrap_vector<int32_t, int32_t>(
std::get<row_block_indices_index>(serialized)), // Row Indices
unwrap_vector<int32_t, int32_t>(
Expand Down Expand Up @@ -219,19 +234,19 @@ PackedLinearWeightQnnp::PackedLinearWeightQnnp(
TORCH_CHECK(false, "Unsupported quantization scheme.");
}

const at::Tensor& row_block_indices =
deserialized_bcsr_row_block_indices_ =
std::get<row_block_indices_index>(serialized);
const at::Tensor& col_block_indices =
deserialized_bcsr_col_block_indices_ =
std::get<col_block_indices_index>(serialized);
const at::Tensor& weight_values = std::get<weight_values_index>(serialized);
deserialized_bcsr_weight_values_ = std::get<weight_values_index>(serialized);

bcsr_matrix_ = qnnpack::generateBlockCSRMatrix(
col_block_indices.data_ptr<int32_t>(),
row_block_indices.data_ptr<int32_t>(),
weight_values.data_ptr<int8_t>(),
col_block_indices.numel(),
row_block_indices.numel(),
weight_values.numel(),
(uint32_t*)deserialized_bcsr_col_block_indices_.data_ptr<int32_t>(),
(uint32_t*)deserialized_bcsr_row_block_indices_.data_ptr<int32_t>(),
deserialized_bcsr_weight_values_.data_ptr<uint8_t>(),
deserialized_bcsr_col_block_indices_.numel(),
deserialized_bcsr_row_block_indices_.numel(),
deserialized_bcsr_weight_values_.numel(),
out_features_block_size_,
in_features_block_size_);
}
Expand Down
44 changes: 26 additions & 18 deletions aten/src/ATen/native/ao_sparse/quantized/cpu/qlinear_serialize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,17 @@ namespace ao {
namespace sparse {

namespace {
/**
 * Wrap a vector in a Tensor, copying its contents into the Tensor's own
 * data pointer.
 *
 * The type of vec is T& (not std::vector<T>&) so this works with any
 * vector-like datastructure which has .data() and .size().
 *
 * UNDERLYING_DTYPE must be the C++ type corresponding to `dtype`, since the
 * destination pointer is obtained via data_ptr<UNDERLYING_DTYPE>().
 */
template <typename UNDERLYING_DTYPE, typename T>
at::Tensor wrap_vector(T& vec, c10::ScalarType dtype) {
  // Size is cast to int64_t: at::empty takes IntArrayRef (int64_t elements),
  // and `long` is only 32 bits on LLP64 platforms such as Windows.
  at::Tensor t = at::empty(
      {static_cast<int64_t>(vec.size())}, at::device(c10::kCPU).dtype(dtype));
  // Copy through raw pointers (.data()) rather than iterators so any
  // container exposing .data()/.size() is accepted.
  std::copy(
      vec.data(), vec.data() + vec.size(), t.data_ptr<UNDERLYING_DTYPE>());
  return t;
}

Expand Down Expand Up @@ -110,6 +115,21 @@ BCSRSerializationType PackedLinearWeight::serialize() {
zero_points.data_ptr<int8_t>(),
qscheme_per_tensor);

std::vector<int8_t>& packed_weight_values = std::get<0>(untiled_bcsr);
// Add 128 to each weight value. This serialization format is best for
// minimizing memory footprint for QNNPack

at::Tensor weight_values = at::empty(
{static_cast<long>(packed_weight_values.size())},
at::device(c10::kCPU).dtype(c10::kByte));
std::transform(
packed_weight_values.begin(),
packed_weight_values.end(),
weight_values.data_ptr<uint8_t>(),
[](int8_t v) {
return static_cast<uint8_t>(static_cast<int16_t>(v) + 128);
});

return BCSRSerializationType(
SPARSE_LINEAR_PACKED_PARAM_SERIALIZATION_VERSION,
bias_,
Expand All @@ -124,8 +144,7 @@ BCSRSerializationType PackedLinearWeight::serialize() {
std::get<1>(untiled_bcsr), c10::kInt), // Row block indices
wrap_vector<int>(
std::get<2>(untiled_bcsr), c10::kInt), // Col block indices
wrap_vector<int8_t>(
std::get<0>(untiled_bcsr), c10::kChar), // Weight values
std::move(weight_values),
w->R,
w->C);
}
Expand Down Expand Up @@ -174,17 +193,6 @@ BCSRSerializationType PackedLinearWeightQnnp::serialize() {
TORCH_CHECK(false, "Unsupported quantization scheme.");
}

// Subtract 128 from each weight value, to reverse addition done during
// prepacking
at::Tensor weight_values = at::empty(
{static_cast<long>(bcsr_matrix_->values.size())},
at::device(c10::kCPU).dtype(c10::kChar));
std::transform(
bcsr_matrix_->values.begin(),
bcsr_matrix_->values.end(),
weight_values.data_ptr<int8_t>(),
subtract_128);

return BCSRSerializationType(
SPARSE_LINEAR_PACKED_PARAM_SERIALIZATION_VERSION,
orig_bias_,
Expand All @@ -197,7 +205,7 @@ BCSRSerializationType PackedLinearWeightQnnp::serialize() {
bcsr_matrix_->row_values, c10::kInt), // Casting from uint32_t to int
wrap_vector<int>(
bcsr_matrix_->col_indices, c10::kInt), // Casting from uint32_t to int
std::move(weight_values),
wrap_vector<uint8_t>(bcsr_matrix_->values, c10::kByte),
output_channels_,
input_channels_);
}
Expand Down
7 changes: 7 additions & 0 deletions aten/src/ATen/native/ao_sparse/quantized/cpu/qnnpack_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,13 @@ struct TORCH_API PackedLinearWeightQnnp
sparse_linear_op_{nullptr};
int64_t output_channels_;
int64_t input_channels_;
// Deserialized Tensors are stored to maintain the lifetime of underlying
// BCSR data.
// These are left empty if PackedLinearWeightQnnp is created via prepacking
// rather than deserializing.
at::Tensor deserialized_bcsr_row_block_indices_;
at::Tensor deserialized_bcsr_col_block_indices_;
at::Tensor deserialized_bcsr_weight_values_;

at::Tensor apply(
const at::Tensor& input,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,10 @@
*/

#pragma once
#include <cassert>
#include <cstdint>
#include <memory>
#include <vector>
#include <cassert>

#ifndef _WIN32
#include <qnnpack/AlignedAllocator.h>
Expand All @@ -20,16 +20,60 @@

namespace qnnpack {

typedef struct BCSRMatrix {
/**
 * A vector-like container that either owns its storage (a real vector) or
 * borrows a preexisting (pointer, size) pair, so BCSR data loaded from
 * serialized Tensors can be used without copying.
 */
template <typename T>
struct OwnedOrBorrowedVector {
  // 16-byte-aligned storage on non-Windows (AlignedAllocator); plain
  // std::vector on Windows.
  using VECTOR_T =
#ifndef _WIN32
      std::vector<T, AlignedAllocator<T, 16>>;
#else
      std::vector<T>;
#endif

  // Only one of owned_vec_data_ or borrowed_tuple_data_ will be meaningfully
  // populated.
  // A union could potentially be used here to reduce memory usage.
  // std::variant is not used here because it causes internal build errors
  // due to incompatibility.
  VECTOR_T owned_vec_data_;
  std::tuple<T*, uint32_t> borrowed_tuple_data_;
  bool owned;

  // Mutable access to the underlying vector; only valid in owned mode
  // (asserts otherwise).
  VECTOR_T& vector() {
    assert(owned);
    return owned_vec_data_;
  }

  // Number of elements, regardless of ownership mode.
  uint32_t size() const {
    if (owned) {
      return owned_vec_data_.size();
    } else {
      return std::get<1>(borrowed_tuple_data_);
    }
  }

  // Read-only pointer to the first element, regardless of ownership mode.
  const T* data() const {
    if (owned) {
      return owned_vec_data_.data();
    } else {
      return std::get<0>(borrowed_tuple_data_);
    }
  }

  // Unchecked read-only element access.
  const T& operator[](int i) const {
    return data()[i];
  }

  // Default construction yields an empty owned vector.
  OwnedOrBorrowedVector() : owned(true) {}

  // Borrowing constructor: wraps preexisting data without copying. The
  // caller must keep data_ptr alive for the lifetime of this object.
  OwnedOrBorrowedVector(T* data_ptr, const uint32_t size)
      : borrowed_tuple_data_(std::tuple<T*, uint32_t>(data_ptr, size)),
        owned(false) {}
};

typedef struct BCSRMatrix {
OwnedOrBorrowedVector<uint32_t> col_indices;
OwnedOrBorrowedVector<uint32_t> row_values;
OwnedOrBorrowedVector<uint8_t> values;
uint32_t col_block_size; // input features block size
uint32_t row_block_size; // output features block size
void print() const;
Expand All @@ -56,9 +100,9 @@ std::unique_ptr<BCSRMatrix> generateBlockCSRMatrix(
const uint8_t* zero_points);

std::unique_ptr<BCSRMatrix> generateBlockCSRMatrix(
const int32_t* col_indices,
const int32_t* row_values,
const int8_t* values,
uint32_t* col_indices,
uint32_t* row_values,
uint8_t* values,
const int64_t col_indices_size,
const int64_t row_values_size,
const int64_t values_size,
Expand Down
66 changes: 25 additions & 41 deletions aten/src/ATen/native/quantized/cpu/qnnpack/src/pack_block_sparse.cc
Original file line number Diff line number Diff line change
Expand Up @@ -24,13 +24,17 @@ std::unique_ptr<BCSRMatrix> generateBlockCSRMatrix(
assert(K > 0);
std::unique_ptr<BCSRMatrix> bcsr_mat_ptr = std::make_unique<BCSRMatrix>();
auto& bcsr_mat = *bcsr_mat_ptr;
auto& row_values = bcsr_mat.row_values.vector();
auto& col_indices = bcsr_mat.col_indices.vector();
auto& values = bcsr_mat.values.vector();

const uint32_t num_row_blocks = (N + row_block_size - 1) / row_block_size;
// K must be > 0
const uint32_t num_col_blocks = (K + col_block_size - 1) / col_block_size;

bcsr_mat.row_values.reserve(num_row_blocks);
row_values.reserve(num_row_blocks);
uint32_t num_nnz_blocks{0};
bcsr_mat.row_values.push_back(num_nnz_blocks);
row_values.push_back(num_nnz_blocks);
for (uint32_t i = 0; i < num_row_blocks; ++i) {
for (uint32_t j = 0; j < num_col_blocks; ++j) {
bool block_zero{true};
Expand All @@ -52,73 +56,53 @@ std::unique_ptr<BCSRMatrix> generateBlockCSRMatrix(
}
block_scanned:
if (!block_zero) {
bcsr_mat.col_indices.push_back(j);
col_indices.push_back(j);
num_nnz_blocks++;
for (uint32_t ib = 0; ib < row_block_size; ++ib) {
uint32_t row_index = i * row_block_size + ib;
if PYTORCH_QNNP_UNLIKELY(row_index >= N) {
for (; row_index < (num_row_blocks * row_block_size); row_index++) {
for (uint32_t jb = 0; jb < col_block_size; ++jb) {
bcsr_mat.values.push_back(zero_points[N-1]);
values.push_back(zero_points[N-1]);
}
}
break;
}
for (uint32_t jb = 0; jb < col_block_size; ++jb) {
uint32_t col_index = j * col_block_size + jb;
if PYTORCH_QNNP_UNLIKELY(col_index >= K) {
bcsr_mat.values.push_back(zero_points[row_index]);
values.push_back(zero_points[row_index]);
} else {
uint8_t val = *(a + row_index * K + col_index);
bcsr_mat.values.push_back(val);
values.push_back(val);
}
}
}
}
}
bcsr_mat.row_values.push_back(num_nnz_blocks);
row_values.push_back(num_nnz_blocks);
}
bcsr_mat.row_block_size = row_block_size;
bcsr_mat.col_block_size = col_block_size;
return bcsr_mat_ptr;
}

std::unique_ptr<BCSRMatrix> generateBlockCSRMatrix(
const int32_t* col_indices,
const int32_t* row_values,
const int8_t* values,
uint32_t* col_indices,
uint32_t* row_values,
uint8_t* values,
const int64_t col_indices_size,
const int64_t row_values_size,
const int64_t values_size,
const int64_t row_block_size,
const int64_t col_block_size) {
std::unique_ptr<BCSRMatrix> bcsr_mat_ptr = std::make_unique<BCSRMatrix>();
BCSRMatrix& bcsr_mat = *bcsr_mat_ptr;
const auto make_unsigned = [](int32_t v) { return static_cast<uint32_t>(v); };
const auto add_128 = [](int8_t v) {
return static_cast<uint8_t>(static_cast<int16_t>(v) + 128);
};

bcsr_mat_ptr->col_indices.reserve(col_indices_size);
bcsr_mat_ptr->row_values.reserve(row_values_size);
bcsr_mat_ptr->values.reserve(values_size);

std::transform(
col_indices,
col_indices + col_indices_size,
std::back_inserter(bcsr_mat_ptr->col_indices),
make_unsigned);
std::transform(
row_values,
row_values + row_values_size,
std::back_inserter(bcsr_mat_ptr->row_values),
make_unsigned);
std::transform(
values,
values + values_size,
std::back_inserter(bcsr_mat_ptr->values),
add_128);

bcsr_mat.col_indices =
OwnedOrBorrowedVector<uint32_t>(col_indices, col_indices_size);
bcsr_mat.row_values =
OwnedOrBorrowedVector<uint32_t>(row_values, row_values_size);
bcsr_mat.values = OwnedOrBorrowedVector<uint8_t>(values, values_size);
bcsr_mat.row_block_size = row_block_size;
bcsr_mat.col_block_size = col_block_size;
return bcsr_mat_ptr;
Expand All @@ -128,18 +112,18 @@ void BCSRMatrix::print() const {
std::cout << "row block size:" << row_block_size << std::endl;
std::cout << "col block size:" << col_block_size << std::endl;
std::cout << "row ptr\n";
for (const auto& t : row_values) {
std::cout << t << ", ";
for (int i = 0; i < row_values.size(); i++) {
std::cout << row_values[i] << ", ";
}
std::cout << std::endl;
std::cout << "col indices\n";
for (const auto& t : col_indices) {
std::cout << t << ", ";
for (int i = 0; i < col_indices.size(); i++) {
std::cout << col_indices[i] << ", ";
}
std::cout << std::endl;
std::cout << "Actual values\n";
for (const auto& t : values) {
std::cout << (uint32_t)t << ", ";
for (int i = 0; i < values.size(); i++) {
std::cout << (uint32_t)values[i] << ", ";
}
std::cout << std::endl;
}
Expand Down
2 changes: 1 addition & 1 deletion test/ao/sparsity/test_qlinear_packed_params.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,7 @@ def test_qlinear_packed_params(self, allow_non_zero_zero_points=False):
self.assertEqual(quantization_scheme_, is_per_tensor_quantized)
self.assertEqual(row_block_indices_, expected_row_block_indices)
self.assertEqual(col_block_indices_, expected_col_block_indices)
self.assertEqual(weights_.tolist(), expected_weights)
self.assertEqual(weights_.tolist(), [v + 128 for v in expected_weights]) # weights are serialized as +128
self.assertEqual(output_channels_, weight.shape[0])
self.assertEqual(input_channels_, weight.shape[1])

Expand Down

0 comments on commit 98e4524

Please sign in to comment.