// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2018 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#ifndef NCNN_GPU_H
#define NCNN_GPU_H

#include "platform.h"

#if NCNN_VULKAN

#include "mat.h"

#include <vulkan/vulkan.h>

namespace ncnn {

// instance
int create_gpu_instance();
void destroy_gpu_instance();
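//
// Expected pairing (usage sketch, not a declaration): call create_gpu_instance()
// once before any other gpu functions and destroy_gpu_instance() once at shutdown.
//   ncnn::create_gpu_instance();
//   // ... query devices, run compute ...
//   ncnn::destroy_gpu_instance();
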
// instance extension capability
extern int support_VK_KHR_external_memory_capabilities;
extern int support_VK_KHR_get_physical_device_properties2;
extern int support_VK_KHR_get_surface_capabilities2;
extern int support_VK_KHR_surface;
extern int support_VK_EXT_debug_utils;
#if __ANDROID_API__ >= 26
extern int support_VK_KHR_android_surface;
#endif // __ANDROID_API__ >= 26
// VK_KHR_external_memory_capabilities
extern PFN_vkGetPhysicalDeviceExternalBufferPropertiesKHR vkGetPhysicalDeviceExternalBufferPropertiesKHR;
// VK_KHR_get_physical_device_properties2
extern PFN_vkGetPhysicalDeviceFeatures2KHR vkGetPhysicalDeviceFeatures2KHR;
extern PFN_vkGetPhysicalDeviceProperties2KHR vkGetPhysicalDeviceProperties2KHR;
extern PFN_vkGetPhysicalDeviceFormatProperties2KHR vkGetPhysicalDeviceFormatProperties2KHR;
extern PFN_vkGetPhysicalDeviceImageFormatProperties2KHR vkGetPhysicalDeviceImageFormatProperties2KHR;
extern PFN_vkGetPhysicalDeviceQueueFamilyProperties2KHR vkGetPhysicalDeviceQueueFamilyProperties2KHR;
extern PFN_vkGetPhysicalDeviceMemoryProperties2KHR vkGetPhysicalDeviceMemoryProperties2KHR;
extern PFN_vkGetPhysicalDeviceSparseImageFormatProperties2KHR vkGetPhysicalDeviceSparseImageFormatProperties2KHR;
// VK_KHR_get_surface_capabilities2
extern PFN_vkGetPhysicalDeviceSurfaceCapabilities2KHR vkGetPhysicalDeviceSurfaceCapabilities2KHR;
extern PFN_vkGetPhysicalDeviceSurfaceFormats2KHR vkGetPhysicalDeviceSurfaceFormats2KHR;
// VK_KHR_surface
extern PFN_vkDestroySurfaceKHR vkDestroySurfaceKHR;
extern PFN_vkGetPhysicalDeviceSurfaceSupportKHR vkGetPhysicalDeviceSurfaceSupportKHR;
extern PFN_vkGetPhysicalDeviceSurfaceCapabilitiesKHR vkGetPhysicalDeviceSurfaceCapabilitiesKHR;
extern PFN_vkGetPhysicalDeviceSurfaceFormatsKHR vkGetPhysicalDeviceSurfaceFormatsKHR;
extern PFN_vkGetPhysicalDeviceSurfacePresentModesKHR vkGetPhysicalDeviceSurfacePresentModesKHR;
#if __ANDROID_API__ >= 26
// VK_KHR_android_surface
extern PFN_vkCreateAndroidSurfaceKHR vkCreateAndroidSurfaceKHR;
#endif // __ANDROID_API__ >= 26

// get info
int get_gpu_count();
int get_default_gpu_index();
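//
// Usage sketch (assumes create_gpu_instance() already succeeded):
//   int gpu_count = ncnn::get_gpu_count();
//   int default_index = ncnn::get_default_gpu_index();
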
class GpuInfo
{
public:
// vulkan physical device
VkPhysicalDevice physical_device;
// memory properties
VkPhysicalDeviceMemoryProperties physicalDeviceMemoryProperties;
// info
uint32_t api_version;
uint32_t driver_version;
uint32_t vendor_id;
uint32_t device_id;
uint8_t pipeline_cache_uuid[VK_UUID_SIZE];
// 0 = discrete gpu
// 1 = integrated gpu
// 2 = virtual gpu
// 3 = cpu
int type;
// hardware limit
uint32_t max_shared_memory_size;
uint32_t max_workgroup_count[3];
uint32_t max_workgroup_invocations;
uint32_t max_workgroup_size[3];
size_t memory_map_alignment;
size_t buffer_offset_alignment;
size_t non_coherent_atom_size;
size_t buffer_image_granularity;
uint32_t max_image_dimension_1d;
uint32_t max_image_dimension_2d;
uint32_t max_image_dimension_3d;
float timestamp_period;
// runtime
uint32_t compute_queue_family_index;
uint32_t graphics_queue_family_index;
uint32_t transfer_queue_family_index;
uint32_t compute_queue_count;
uint32_t graphics_queue_count;
uint32_t transfer_queue_count;
// property
bool unified_compute_transfer_queue;
// bug is not feature
bool bug_storage_buffer_no_l1;
bool bug_layout_binding_id_alias;
bool bug_corrupted_online_pipeline_cache;
// but sometimes bug is a feature
bool bug_implicit_fp16_arithmetic;
// fp16 and int8 feature
bool support_fp16_packed;
bool support_fp16_storage;
bool support_fp16_arithmetic;
bool support_int8_storage;
bool support_int8_arithmetic;
// ycbcr conversion feature
bool support_ycbcr_conversion;
// extension capability
int support_VK_KHR_8bit_storage;
int support_VK_KHR_16bit_storage;
int support_VK_KHR_bind_memory2;
int support_VK_KHR_dedicated_allocation;
int support_VK_KHR_descriptor_update_template;
int support_VK_KHR_external_memory;
int support_VK_KHR_get_memory_requirements2;
int support_VK_KHR_maintenance1;
int support_VK_KHR_push_descriptor;
int support_VK_KHR_sampler_ycbcr_conversion;
int support_VK_KHR_shader_float16_int8;
int support_VK_KHR_shader_float_controls;
int support_VK_KHR_storage_buffer_storage_class;
int support_VK_KHR_swapchain;
int support_VK_EXT_memory_budget;
int support_VK_EXT_queue_family_foreign;
#if __ANDROID_API__ >= 26
int support_VK_ANDROID_external_memory_android_hardware_buffer;
#endif // __ANDROID_API__ >= 26
};

const GpuInfo& get_gpu_info(int device_index = get_default_gpu_index());
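//
// Usage sketch: inspect the default device, using the type codes documented
// in GpuInfo above (0 = discrete gpu, 1 = integrated gpu):
//   const ncnn::GpuInfo& info = ncnn::get_gpu_info();
//   bool discrete = (info.type == 0);
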
class VkAllocator;
class VkCompute;
class Layer;
class Packing_vulkan;
class Option;
class PipelineCache;

class VulkanDevice
{
public:
VulkanDevice(int device_index = get_default_gpu_index());
~VulkanDevice();
const GpuInfo& info;
VkDevice vkdevice() const
{
    return device;
}
#if !NCNN_VULKAN_ONLINE_SPIRV
// with fixed workgroup size
VkShaderModule create_shader_module(int shader_type_index, uint32_t local_size_x, uint32_t local_size_y, uint32_t local_size_z) const;
#endif
VkShaderModule compile_shader_module(const uint32_t* spv_data, size_t spv_data_size) const;
// with fixed workgroup size
VkShaderModule compile_shader_module(const uint32_t* spv_data, size_t spv_data_size, uint32_t local_size_x, uint32_t local_size_y, uint32_t local_size_z) const;
// helper for creating pipeline
int create_descriptorset_layout(int binding_count, const int* binding_types, VkDescriptorSetLayout* descriptorset_layout) const;
int create_pipeline_layout(int push_constant_count, VkDescriptorSetLayout descriptorset_layout, VkPipelineLayout* pipeline_layout) const;
int create_pipeline(VkShaderModule shader_module, VkPipelineLayout pipeline_layout, const std::vector<vk_specialization_type>& specializations, VkPipeline* pipeline) const;
int create_descriptor_update_template(int binding_count, const int* binding_types, VkDescriptorSetLayout descriptorset_layout, VkPipelineLayout pipeline_layout, VkDescriptorUpdateTemplateKHR* descriptor_update_template) const;
uint32_t find_memory_index(uint32_t memory_type_bits, VkFlags required, VkFlags preferred, VkFlags preferred_not) const;
bool is_mappable(uint32_t memory_type_index) const;
bool is_coherent(uint32_t memory_type_index) const;
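//
// Usage sketch for find_memory_index ("vkdev" is a hypothetical VulkanDevice*;
// memory_type_bits would come from a VkMemoryRequirements query, and the flag
// choices here are illustrative):
//   uint32_t idx = vkdev->find_memory_index(memory_type_bits,
//       VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT,   // required
//       VK_MEMORY_PROPERTY_HOST_COHERENT_BIT,  // preferred
//       0);                                    // preferred_not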
VkQueue acquire_queue(uint32_t queue_family_index) const;
void reclaim_queue(uint32_t queue_family_index, VkQueue queue) const;
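//
// Queue pooling usage sketch (pairing assumed from the acquire/reclaim naming;
// "vkdev" is a hypothetical VulkanDevice*):
//   VkQueue queue = vkdev->acquire_queue(vkdev->info.compute_queue_family_index);
//   // ... vkQueueSubmit(...) on queue ...
//   vkdev->reclaim_queue(vkdev->info.compute_queue_family_index, queue);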
// allocator on this device
VkAllocator* acquire_blob_allocator() const;
void reclaim_blob_allocator(VkAllocator* allocator) const;
VkAllocator* acquire_staging_allocator() const;
void reclaim_staging_allocator(VkAllocator* allocator) const;
// immutable sampler for texelfetch
const VkSampler* immutable_texelfetch_sampler() const;
// dummy buffer image
VkMat get_dummy_buffer() const;
VkImageMat get_dummy_image() const;
// pipeline cache on this device
const PipelineCache* get_pipeline_cache() const;
// test image allocation
bool shape_support_image_storage(const Mat& shape) const;
// current gpu heap memory budget in MB
uint32_t get_heap_budget() const;
// utility operator
void convert_packing(const VkMat& src, VkMat& dst, int dst_elempack, VkCompute& cmd, const Option& opt) const;
void convert_packing(const VkImageMat& src, VkImageMat& dst, int dst_elempack, VkCompute& cmd, const Option& opt) const;
void convert_packing(const VkMat& src, VkImageMat& dst, int dst_elempack, VkCompute& cmd, const Option& opt) const;
void convert_packing(const VkImageMat& src, VkMat& dst, int dst_elempack, VkCompute& cmd, const Option& opt) const;
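//
// Packing conversion usage sketch ("src", "cmd" and "opt" are assumed to exist;
// dst_elempack follows the pack1/pack4/pack8 scheme noted near uop_packing below):
//   ncnn::VkMat dst;
//   vkdev->convert_packing(src, dst, 4, cmd, opt); // repack to pack4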
// VK_KHR_bind_memory2
PFN_vkBindBufferMemory2KHR vkBindBufferMemory2KHR;
PFN_vkBindImageMemory2KHR vkBindImageMemory2KHR;
// VK_KHR_descriptor_update_template
PFN_vkCreateDescriptorUpdateTemplateKHR vkCreateDescriptorUpdateTemplateKHR;
PFN_vkDestroyDescriptorUpdateTemplateKHR vkDestroyDescriptorUpdateTemplateKHR;
PFN_vkUpdateDescriptorSetWithTemplateKHR vkUpdateDescriptorSetWithTemplateKHR;
// VK_KHR_get_memory_requirements2
PFN_vkGetImageMemoryRequirements2KHR vkGetImageMemoryRequirements2KHR;
PFN_vkGetBufferMemoryRequirements2KHR vkGetBufferMemoryRequirements2KHR;
PFN_vkGetImageSparseMemoryRequirements2KHR vkGetImageSparseMemoryRequirements2KHR;
// VK_KHR_maintenance1
PFN_vkTrimCommandPoolKHR vkTrimCommandPoolKHR;
// VK_KHR_push_descriptor
PFN_vkCmdPushDescriptorSetWithTemplateKHR vkCmdPushDescriptorSetWithTemplateKHR;
PFN_vkCmdPushDescriptorSetKHR vkCmdPushDescriptorSetKHR;
// VK_KHR_sampler_ycbcr_conversion
PFN_vkCreateSamplerYcbcrConversionKHR vkCreateSamplerYcbcrConversionKHR;
PFN_vkDestroySamplerYcbcrConversionKHR vkDestroySamplerYcbcrConversionKHR;
// VK_KHR_swapchain
PFN_vkCreateSwapchainKHR vkCreateSwapchainKHR;
PFN_vkDestroySwapchainKHR vkDestroySwapchainKHR;
PFN_vkGetSwapchainImagesKHR vkGetSwapchainImagesKHR;
PFN_vkAcquireNextImageKHR vkAcquireNextImageKHR;
PFN_vkQueuePresentKHR vkQueuePresentKHR;
#if __ANDROID_API__ >= 26
// VK_ANDROID_external_memory_android_hardware_buffer
PFN_vkGetAndroidHardwareBufferPropertiesANDROID vkGetAndroidHardwareBufferPropertiesANDROID;
PFN_vkGetMemoryAndroidHardwareBufferANDROID vkGetMemoryAndroidHardwareBufferANDROID;
#endif // __ANDROID_API__ >= 26
protected:
// device extension
int init_device_extension();
// dummy buffer and image
int create_dummy_buffer_image();
void destroy_dummy_buffer_image();
// utility operator
const ncnn::Packing_vulkan* get_utility_operator(int storage_type_from, int storage_type_to, int cast_type_from_index, int cast_type_to_index, int packing_type_to_index) const;
void destroy_utility_operator();
private:
VkDevice device;
// hardware queue
mutable std::vector<VkQueue> compute_queues;
mutable std::vector<VkQueue> graphics_queues;
mutable std::vector<VkQueue> transfer_queues;
mutable Mutex queue_lock;
// default blob allocator for each queue
mutable std::vector<VkAllocator*> blob_allocators;
mutable Mutex blob_allocator_lock;
// default staging allocator for each queue
mutable std::vector<VkAllocator*> staging_allocators;
mutable Mutex staging_allocator_lock;
// nearest sampler for texelfetch
VkSampler texelfetch_sampler;
// dummy buffer and image
VkAllocator* dummy_allocator;
VkMat dummy_buffer;
VkImageMat dummy_image;
// device-wide pipeline cache
PipelineCache* pipeline_cache;
// utility operator
// from buffer | image
// to buffer | image
// from fp32-b/i | fp16p-b/i | fp16s-b/i
// to fp32-b/i | fp16p-b/i | fp16s-b/i
// to pack1 | pack4 | pack8
mutable ncnn::Packing_vulkan* uop_packing[2][2][3][3][3];
mutable Mutex uop_lock;
};

VulkanDevice* get_gpu_device(int device_index = get_default_gpu_index());
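//
// Usage sketch (the returned pointer is taken to be borrowed from the library,
// not owned by the caller -- an assumption based on the getter shape):
//   ncnn::VulkanDevice* vkdev = ncnn::get_gpu_device();
//   VkDevice device = vkdev->vkdevice();
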
#if NCNN_VULKAN_ONLINE_SPIRV
// online spirv compilation
int compile_spirv_module(int shader_type_index, const Option& opt, std::vector<uint32_t>& spirv);
#endif
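//
// Online compilation usage sketch (NCNN_VULKAN_ONLINE_SPIRV path;
// shader_type_index and opt are assumed inputs, 0 is taken as success, and
// spv_data_size is assumed to be in bytes, hence the * 4):
//   std::vector<uint32_t> spirv;
//   if (ncnn::compile_spirv_module(shader_type_index, opt, spirv) == 0)
//   {
//       VkShaderModule sm = vkdev->compile_shader_module(spirv.data(), spirv.size() * 4);
//   }
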
// info from spirv
class ShaderInfo
{
public:
int specialization_count;
int binding_count;
int push_constant_count;
// 0 = null
// 1 = storage buffer
// 2 = storage image
// 3 = combined image sampler
int binding_types[16]; // 16 is large enough I think ...
};

#if !NCNN_VULKAN_ONLINE_SPIRV
const ShaderInfo& get_shader_info(int shader_type_index);
#endif

int resolve_shader_info(const uint32_t* spv_data, size_t spv_data_size, ShaderInfo& shader_info);
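//
// Usage sketch (spv_data / spv_data_size are an assumed SPIR-V blob; a 0 return
// is taken as success, following the usual ncnn convention):
//   ncnn::ShaderInfo si;
//   if (ncnn::resolve_shader_info(spv_data, spv_data_size, si) == 0)
//   {
//       // si.binding_count entries are valid in si.binding_types[]
//   }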
} // namespace ncnn

#endif // NCNN_VULKAN

#endif // NCNN_GPU_H