// webgpu_util.ts
/**
 * @license
 * Copyright 2019 Google LLC. All Rights Reserved.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * =============================================================================
 */

import {DataType, TensorInfo, util} from '@tensorflow/tfjs-core';

const arrayProduct = (arr: number[]) => {
  let product = 1;
  for (let i = 0; i < arr.length; i++) {
    product *= arr[i];
  }
  return product;
};

export function tilesFitEvenlyIntoShape(
    tileSize: number[], shape: number[]): boolean {
  if (tileSize.length !== shape.length) {
    throw new Error(
        `Cannot compute whether rank ${tileSize.length}` +
        ` tiles fit evenly into rank ${shape.length} shape` +
        ` - ranks must match.`);
  }
  return shape.every(
      (dim: number, dimIdx: number) => dim % tileSize[dimIdx] === 0);
}
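
// Illustrative usage (not part of the original file): a tile fits evenly when
// every shape dimension is divisible by the corresponding tile dimension.
//
//   tilesFitEvenlyIntoShape([2, 2], [4, 8]);  // true
//   tilesFitEvenlyIntoShape([2, 3], [4, 8]);  // false (8 % 3 !== 0)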

// Computes dispatch geometry based on layout of output dimensions and
// workgroupSize.
export function computeDispatch(
    layout: {x: number[], y?: number[], z?: number[]}, outputShape: number[],
    workgroupSize: [number, number, number] = [1, 1, 1],
    elementsPerThread: [number, number, number] =
        [1, 1, 1]): [number, number, number] {
  const [dispatchX, dispatchY, dispatchZ] = [
    Math.ceil(
        arrayProduct(layout.x.map(d => outputShape[d])) /
        (workgroupSize[0] * elementsPerThread[0])),
    layout.y ? Math.ceil(
                   arrayProduct(layout.y.map(d => outputShape[d])) /
                   (workgroupSize[1] * elementsPerThread[1])) :
               1,
    layout.z ? Math.ceil(
                   arrayProduct(layout.z.map(d => outputShape[d])) /
                   (workgroupSize[2] * elementsPerThread[2])) :
               1
  ];
  return [dispatchX, dispatchY, dispatchZ];
}
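
// Worked example (illustrative, not part of the original file): with a flat
// layout over a [2, 128, 128] output, every output dimension maps to the x
// axis, so the x dispatch count is ceil(2 * 128 * 128 / (64 * 1)) = 512
// workgroups; y and z stay at 1 because their layouts are absent.
//
//   computeDispatch({x: [0, 1, 2]}, [2, 128, 128], [64, 1, 1]);
//   // => [512, 1, 1]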

export type WorkgroupInfo = {
  workgroupSize: [number, number, number],
  elementsPerThread: [number, number, number],
};

export function computeWorkgroupInfoForMatMul(
    dimAOuter: number, dimInner: number, dimBOuter: number,
    transposeA = false): WorkgroupInfo {
  // These are experimental values. Usually we need to adjust the workgroup
  // size based on the input shapes to improve EU occupancy.
  // TODO: WebGPU limits the maximum allowed shared memory size to 16K. To
  // make sure it doesn't exceed this limitation, temporarily reduce the
  // workgroup size to [8, 8, 1] and the work-per-thread size to [4, 4, 1].
  // We should revisit this and find the right balance between workgroup size
  // and work-per-thread size.
  const workgroupSize: [number, number, number] = [8, 8, 1];
  const elementsPerThread: [number, number, number] = [4, 4, 1];
  if (!transposeA) {
    if (dimAOuter <= 8) {
      elementsPerThread[1] = 1;
    }
    if (dimInner <= 16 && dimBOuter <= 16) {
      workgroupSize[0] = 4;
    }
  }
  return {workgroupSize, elementsPerThread};
}
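
// Illustrative example (not part of the original file): for a small
// 8x16 * 16x16 matmul, dimAOuter <= 8 drops elementsPerThread[1] to 1, and
// dimInner <= 16 with dimBOuter <= 16 shrinks workgroupSize[0] to 4.
//
//   computeWorkgroupInfoForMatMul(8, 16, 16);
//   // => {workgroupSize: [4, 8, 1], elementsPerThread: [4, 1, 1]}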

export function computeWorkgroupSizeForConv2d(
    layout: {x: number[], y?: number[], z?: number[]}, outputShape: number[],
    isVec4 = false): [number, number, number] {
  if (isVec4) {
    return [8, 8, 1];
  }

  const dim0 = arrayProduct(layout.x.map(d => outputShape[d]));
  const dim1 = arrayProduct(layout.y.map(d => outputShape[d]));
  // TODO: Fine-tune further based on outputShape.
  // These are experimental values. Usually we need to adjust the workgroup
  // size based on the output shape. For example, when one dimension is smaller
  // than 4, it is wasteful to assign a larger size to that dimension: lots of
  // threads end up doing useless work, which reduces the parallelism of the
  // hardware threads. But it is always a balance between workgroup size and
  // shared memory: if one dimension is too small, such as 1, shared memory
  // won't be fully utilized.
  if (dim0 <= 4) {
    return [4, 16, 1];
  }
  if (dim1 <= 4) {
    return [16, 4, 1];
  }
  return [16, 16, 1];
}

export function computeWorkPerThreadForConv2d(
    layout: {x: number[], y?: number[], z?: number[]}, outputShape: number[],
    isVec4 = false): [number, number, number] {
  if (isVec4) {
    return [4, 4, 1];
  }

  const dim0 = arrayProduct(layout.x.map(d => outputShape[d]));
  const dim1 = arrayProduct(layout.y.map(d => outputShape[d]));
  // TODO: Fine-tune further based on outputShape.
  // The following conditions correspond to the values set in
  // computeWorkgroupSizeForConv2d.
  if (dim0 <= 4) {
    return [1, 2, 1];
  }
  if (dim1 <= 4) {
    return [2, 1, 1];
  }
  return [2, 2, 1];
}
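
// Note (illustrative, not part of the original file): taken together, each
// workgroup covers workgroupSize * workPerThread output elements per
// dimension. For example, the default [16, 16, 1] workgroup paired with
// [2, 2, 1] work per thread computes a 32x32 output tile, while the
// dim0 <= 4 case pairs [4, 16, 1] with [1, 2, 1] for a 4x32 tile.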

export function flatDispatchLayout(shape: number[]) {
  return {x: shape.map((d, i) => i)};
}
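
// Illustrative usage (not part of the original file): every dimension of the
// output shape is assigned to the x axis of the dispatch.
//
//   flatDispatchLayout([2, 3, 4]);  // => {x: [0, 1, 2]}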

export function GPUBytesPerElement(dtype: DataType): number {
  if (dtype === 'float32' || dtype === 'int32' || dtype === 'bool' ||
      dtype === 'string') {
    return 4;
  } else if (dtype === 'complex64') {
    return 8;
  } else {
    throw new Error(`Unknown dtype ${dtype}`);
  }
}
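
// Note (assumption, not stated in the original file): 'bool' and 'string'
// values appear to occupy 32-bit slots in GPU buffers, which is why they
// report 4 bytes here; 'complex64' packs two 32-bit floats, hence 8 bytes.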

export function isWebGPUSupported(): boolean {
  return ((typeof window !== 'undefined') ||
          //@ts-ignore
          (typeof WorkerGlobalScope !== 'undefined')) &&
      !!navigator.gpu;
}

export function assertNotComplex(
    tensor: TensorInfo|TensorInfo[], opName: string): void {
  if (!Array.isArray(tensor)) {
    tensor = [tensor];
  }
  tensor.forEach(t => {
    if (t != null) {
      util.assert(
          t.dtype !== 'complex64',
          () => `${opName} does not support complex64 tensors ` +
              'in the WebGPU backend.');
    }
  });
}

export enum MatMulProgramType {
  MatMulReduceProgram,
  MatMulSplitKProgram,
  MatMulSmallOutputSizeProgram,
  MatMulPackedProgram,
  MatMulMax
}