/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/
#pragma once

#include <cstdint>
#include <functional>
#include <memory>
#include <vector>

#include "fbgemm/FbgemmBuild.h"
#include "fbgemm/UtilsAvx2.h"
#include "fbgemm/spmmUtilsAvx2.h"

namespace fbgemm {

template <typename T>
struct FBGEMM_API CSRMatrix {
  std::vector<int> rowPtr;
  std::vector<int> colIdx;
  std::vector<T> values;
};

/**
 * Tiled block CSR format.
 * Partial blocks are zero-filled.
 */
template <typename T = std::int8_t, int ROW_BLOCK = 1, int COL_BLOCK = 4>
struct FBGEMM_API BCSRMatrix {
  using DTYPE = T;
  static constexpr int RB = ROW_BLOCK; // Block size for rows
  static constexpr int CB = COL_BLOCK; // Block size for cols
  // We only tile in the column dimension currently.
  // COLTILE must be a multiple of COL_BLOCK.
  static constexpr int COLTILE = 4000;
  std::vector<int> rowBPtr; // rowPtr for blocks
  std::vector<int> colBIdx; // colIdx for blocks
  std::vector<DTYPE> values;
  // Sum of all elements in a row
  std::vector<int32_t> row_offsets;
  int R;
  int C;

  BCSRMatrix(int Rows, int Cols) {
    R = Rows;
    C = Cols;
    row_offsets.resize(R, 0);
  }

  /**
   * @brief Pack from dense to tiled block CSR format.
   * @param src the source matrix (R x C) with data type DTYPE
   * @param ld the leading dimension of src
   */
  void pack(const DTYPE* src, size_t ld);

  /**
   * @brief Pack from dense to tiled block CSR format.
   * @param src the source matrix (R x C) with data type DTYPE
   *
   * The leading dimension of the matrix is assumed to be equal to C.
   */
  void pack(const DTYPE* src);

  /**
   * @brief Unpack from tiled block CSR to dense.
   * @param dst destination buffer; must be able to hold R*C elements of
   *            type DTYPE
   * @param ld the leading dimension of dst
   */
  void unpack(DTYPE* dst, size_t ld);

  /**
   * @brief Unpack from tiled block CSR to dense.
   * @param dst destination buffer; must be able to hold R*C elements of
   *            type DTYPE
   *
   * The leading dimension of the matrix is assumed to be equal to C.
   */
  void unpack(DTYPE* dst);
};
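
// Illustrative usage sketch (not part of this header's API): packing a dense
// int8 weight matrix into the default 1x4 block CSR format and unpacking it
// back. The dimensions and buffer names below are hypothetical; because K is
// not a multiple of CB, the last column block of each row is zero-filled as
// described above.
//
//   constexpr int M = 2; // rows
//   constexpr int K = 6; // columns (not a multiple of CB == 4)
//   std::vector<std::int8_t> W(M * K);      // dense weights, leading dim K
//   std::vector<std::int8_t> W_back(M * K); // buffer for round-tripping
//
//   fbgemm::BCSRMatrix<> bcsr(M, K);
//   bcsr.pack(W.data());        // leading dimension assumed to equal K
//   bcsr.unpack(W_back.data()); // W_back should now match W
//   // bcsr.row_offsets[i] holds the sum of all elements in row i.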

template <typename T>
FBGEMM_API std::unique_ptr<CSRMatrix<T>>
fbgemmDenseToCSR(int R, int C, const T* inp, int ld);

template <typename T>
FBGEMM_API std::unique_ptr<CSRMatrix<T>>
fbgemmDenseToCSR(int R, int C, const T* inp);
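
// Illustrative usage sketch (hypothetical values): converting a small dense
// fp32 matrix to CSR with the overload that assumes ld == C. For the 2x3
// matrix below, the resulting arrays are expected to hold the standard CSR
// representation.
//
//   std::vector<float> dense = {1.0f, 0.0f, 0.0f,  // row 0
//                               0.0f, 2.0f, 3.0f}; // row 1
//   auto csr = fbgemm::fbgemmDenseToCSR<float>(2, 3, dense.data());
//   // Expected (standard CSR):
//   //   csr->rowPtr == {0, 1, 3}
//   //   csr->colIdx == {0, 1, 2}
//   //   csr->values == {1.0f, 2.0f, 3.0f}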

template <typename T = std::int8_t, int RB = 1, int CB = 4>
FBGEMM_API std::unique_ptr<BCSRMatrix<T, RB, CB>>
fbgemmDenseToBCSR(int R, int C, const T* inp, int ld);

template <typename T = std::int8_t, int RB = 1, int CB = 4>
FBGEMM_API std::unique_ptr<BCSRMatrix<T, RB, CB>>
fbgemmDenseToBCSR(int R, int C, const T* inp);

/**
 * @param accum Controls accumulation: when true, the result is accumulated
 *              into the C matrix instead of overwriting it.
 *
 * Note on matrix order and layout:
 * Unlike other fbgemm functions that follow the PyTorch convention, where the
 * A matrix is the activation (uint8_t for quantized FC/Conv, or fp32) and the
 * B matrix is the weight (int8_t for quantized FC/Conv, or fp32), here A is
 * the weight matrix. This is because we mostly target sparsity in weights,
 * and with a row-major layout it is more efficient to have A as the sparse
 * matrix: for each non-zero of A at the ith row and kth column, we can access
 * the kth row of B, whose elements are contiguous in memory. If the B matrix
 * were sparse, for each non-zero of B at the kth row and jth column, we would
 * need to access the kth column of A, whose elements are not contiguous in
 * memory with C/C++'s row-major layout.
 * Alternatively, we can call this function as if we were computing
 * C^T = B^T * A^T while maintaining PyTorch's convention that the left-hand
 * side matrix B is the activation. If the B matrix is in column-major layout,
 * no extra transposition is needed. The C matrix will be output in
 * column-major layout, so for back-to-back sparse-dense matrix
 * multiplications, the B matrix of the subsequent multiplication will already
 * be in column-major layout. Refer to SparseDenseMMFP32Benchmark.cc for an
 * example.
 */
FBGEMM_API void SparseDenseMM(
    int M,
    int N,
    const int* row_ptr,
    const int* col_idx,
    const float* values,
    const float* B,
    int ldb,
    float* C,
    int ldc,
    bool accum = false);
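
// Illustrative usage sketch (hypothetical sizes and buffers): computing
// C = A * B with a sparse fp32 weight matrix A (M x K, CSR) and a dense
// row-major activation matrix B (K x N), following the layout note above.
// The inner dimension K is implied by the CSR column indices.
//
//   constexpr int M = 2, K = 3, N = 4;
//   std::vector<float> A_dense(M * K); // mostly-zero weight matrix
//   auto A = fbgemm::fbgemmDenseToCSR<float>(M, K, A_dense.data());
//   std::vector<float> B(K * N);       // dense activations, row-major
//   std::vector<float> C(M * N, 0.0f); // output
//
//   fbgemm::SparseDenseMM(
//       M,
//       N,
//       A->rowPtr.data(),
//       A->colIdx.data(),
//       A->values.data(),
//       B.data(),
//       /*ldb=*/N,
//       C.data(),
//       /*ldc=*/N,
//       /*accum=*/false);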

template <bool FUSE_RELU, QuantizationGranularity Q_GRAN>
FBGEMM_API void fbgemmSparseDenseInt8MM(
    int N,
    const std::unique_ptr<BCSRMatrix<>>& bcsr,
    const uint8_t* B,
    int ldb,
    int32_t* C_i32,
    uint8_t* C_u8,
    int ldc,
    trRequantizationParams_t& rParams,
    bool accum = false,
    int thread_id = 0,
    int num_threads = 1);
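
// Illustrative call shape (a sketch only; M, K, N and the buffers W_int8,
// B_u8, C_i32, C_u8 are hypothetical): multiplying a BCSR-packed int8 weight
// matrix by a dense uint8 activation matrix, with per-tensor quantization and
// no fused ReLU. rParams must be populated with valid requantization
// parameters (zero points, scales, bias, row/column offsets) as defined in
// fbgemm/spmmUtilsAvx2.h before the call; it is left default-initialized here
// only to keep the sketch short.
//
//   auto bcsr = fbgemm::fbgemmDenseToBCSR<std::int8_t>(M, K, W_int8.data());
//   fbgemm::trRequantizationParams_t rParams{}; // fill in before use
//   fbgemm::fbgemmSparseDenseInt8MM<
//       /*FUSE_RELU=*/false,
//       fbgemm::QuantizationGranularity::TENSOR>(
//       N,
//       bcsr,
//       B_u8.data(),
//       /*ldb=*/N,
//       C_i32.data(),
//       C_u8.data(),
//       /*ldc=*/N,
//       rParams);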

namespace internal {

void SparseDenseMMAvx2(
    int M,
    int N,
    const int* row_ptr,
    const int* col_idx,
    const float* values,
    const float* B,
    int ldb,
    float* C,
    int ldc,
    bool accum = false);

void SparseDenseMMAvx512(
    int M,
    int N,
    const int* row_ptr,
    const int* col_idx,
    const float* values,
    const float* B,
    int ldb,
    float* C,
    int ldc,
    bool accum = false);

template <bool FUSE_RELU, QuantizationGranularity Q_GRAN>
void SparseDenseInt8MMAvx2(
    int N,
    const std::unique_ptr<BCSRMatrix<>>& bcsr,
    const uint8_t* B,
    int ldb,
    int32_t* C_i32,
    uint8_t* C_u8,
    int ldc,
    trRequantizationParams_t& rParams,
    bool accum = false,
    int thread_id = 0,
    int num_threads = 1);

template <bool FUSE_RELU, QuantizationGranularity Q_GRAN>
void SparseDenseInt8MMAvx512(
    int N,
    const std::unique_ptr<BCSRMatrix<>>& bcsr,
    const uint8_t* B,
    int ldb,
    int32_t* C_i32,
    uint8_t* C_u8,
    int ldc,
    trRequantizationParams_t& rParams,
    bool accum = false,
    int thread_id = 0,
    int num_threads = 1);

template <bool FUSE_RELU, QuantizationGranularity Q_GRAN>
void SparseDenseInt8MVAvx512(
    const std::unique_ptr<BCSRMatrix<>>& bcsr,
    const uint8_t* B,
    int ldb,
    int32_t* C_i32,
    uint8_t* C_u8,
    trRequantizationParams_t& rParams,
    bool accum = false,
    int thread_id = 0,
    int num_threads = 1);

} // namespace internal

} // namespace fbgemm