forked from tenstorrent/tt-metal
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathhost_api.hpp
387 lines (347 loc) · 28 KB
/
host_api.hpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
//
// SPDX-License-Identifier: Apache-2.0
#pragma once
#include <optional>
#include <variant>
#include <vector>
#include "common/core_coord.h"
#include "tt_metal/impl/program/program.hpp"
#include "tt_metal/impl/buffers/buffer.hpp"
/** @file */
/** \mainpage tt-metal Internal C++ Documentation
*
* Welcome. Please navigate using the Files menu. All APIs are documented
* under the files listed in the Files menu.
*
* If you want to contribute to the documentation and are looking for a good
* resource for generating Markdown tables, refer to
* https://www.tablesgenerator.com/markdown_tables
* */
namespace tt {
namespace tt_metal {
// Forward declarations of runtime object types used by the host API.
// NOTE(review): Host and CircularBuffer are not referenced by any declaration
// visible in this header — confirm they are still needed here.
class Program;
class Host;
class Device;
class CommandQueue;
class Trace;
class CircularBuffer;
// ==================================================
// HOST API: Device management
// ==================================================
/**
* Returns the number of Tenstorrent devices that can be targeted.
*
* Takes no arguments.
*
* Return value: size_t
*/
size_t GetNumAvailableDevices();
/**
* Returns the number of Tenstorrent devices that are connected to the host via PCIe and can be targeted.
*
* Takes no arguments.
*
* Return value: size_t
*/
size_t GetNumPCIeDevices();
/**
* Instantiates a device object.
*
* Return value: Device *
*
* | Argument | Description | Type | Valid Range | Required |
* |---------------|----------------------------|-------------------------------|-----------------------------------|----------|
* | device_id | ID of the device to target | chip_id_t (int) | 0 to (GetNumAvailableDevices - 1) | Yes |
* | num_hw_cqs | NOTE(review): previously undocumented; presumably the number of hardware command queues to set up — confirm | const uint8_t | | No (default 1) |
* | l1_bank_remap | NOTE(review): previously undocumented; presumably a remapping of L1 banks — confirm | const std::vector<uint32_t> & | | No (default empty) |
* */
Device *CreateDevice(chip_id_t device_id, const uint8_t num_hw_cqs = 1, const std::vector<uint32_t>& l1_bank_remap = {});
/**
* Resets the device and closes it.
*
* Return value: bool
* NOTE(review): the meaning of the returned bool is not stated here — presumably success; confirm.
*
* | Argument | Description | Type | Valid Range | Required |
* |----------|----------------------------|----------|-------------|----------|
* | device | Pointer to a device object | Device * | | Yes |
*/
bool CloseDevice(Device *device);
// ==================================================
// HOST API: program & kernels
// ==================================================
/**
* Creates a Program object, which is the main container that bundles kernels, circular buffers, and/or semaphores for execution on device.
*
* Takes no arguments.
*
* Return value: Program
*/
Program CreateProgram();
/**
* Creates a kernel and adds it to the program. The kind of kernel (data movement, compute, or ethernet) is selected by the alternative held in `config`.
*
* Return value: Kernel ID (KernelHandle)
*
* | Argument | Description | Type | Valid Range | Required |
* |--------------|--------------------------------------------------------------------------------------------------------------------------------------|----------------------------------------------------------|-------------|----------|
* | program | The program to which this kernel will be added | Program & | | Yes |
* | file_name | Path to kernel src | const std::string & | | Yes |
* | core_spec | Either a single logical core, a range of logical cores or a set of logical core ranges that indicate which cores kernel is placed on | const std::variant<CoreCoord, CoreRange, CoreRangeSet> & | | Yes |
* | config | Config for data movement, compute, or ethernet kernel | const std::variant<DataMovementConfig,ComputeConfig,experimental::EthernetConfig> & | | Yes |
*/
KernelHandle CreateKernel(Program &program, const std::string &file_name, const std::variant<CoreCoord, CoreRange, CoreRangeSet> &core_spec, const std::variant<DataMovementConfig,ComputeConfig,experimental::EthernetConfig> & config);
// ==================================================
// HOST API: buffers
// ==================================================
/**
* Creates a Circular Buffer (CB) in L1 memory of all cores within core ranges (inclusive) and adds it to the program. There can be a total of NUM_CIRCULAR_BUFFERS (32) circular buffers per core.
* Circular buffers hold data and have an associated config which indicates usage of the address space.
* If the config is specified for multiple buffer indices, the circular buffer address space is shared and each buffer index can potentially have a unique view of the shared space.
*
* Circular buffers can be dynamically allocated or program-local allocated. If the config is created with an L1 buffer or sets a globally allocated address, it is dynamic and shares the same address space as the L1 buffer.
* Otherwise, the circular buffer address space is managed by the program. Address space for program-local circular buffers does not persist across programs.
*
* Return value: Circular Buffer ID, CBHandle (uintptr_t)
*
* | Argument | Description | Type | Valid Range | Required |
* |-----------|---------------------------------------------------------------------------------------------------------------------------------------------------|----------------------------------------------------------|-------------|----------|
* | program | The program to which the buffer will be added | Program & | | Yes |
* | core_spec | Either a single logical core, a range of logical cores or a set of logical core ranges that indicate where the circular buffer will be configured | const std::variant<CoreCoord, CoreRange, CoreRangeSet> & | | Yes |
* | config | Config for circular buffer | const CircularBufferConfig & | | Yes |
*/
CBHandle CreateCircularBuffer(Program &program, const std::variant<CoreCoord, CoreRange, CoreRangeSet> &core_spec, const CircularBufferConfig &config);
/**
* Gets a reference to the config owned by the circular buffer at the given circular buffer ID.
*
* Return value: const CircularBufferConfig &
*
* | Argument | Description | Type | Valid Range | Required |
* |-----------|----------------------------------------------------------------|------------------------------|-------------|----------|
* | program | The program containing the circular buffer | Program & | | Yes |
* | cb_handle | ID of the circular buffer, returned by `CreateCircularBuffer` | CBHandle (uintptr_t) | | Yes |
*/
const CircularBufferConfig &GetCircularBufferConfig(Program &program, CBHandle cb_handle);
/**
* Updates the total size of the circular buffer at the given circular buffer handle. Updating a program-local circular buffer requires all circular buffers in the program to be reallocated.
*
* Return value: void
*
* | Argument | Description | Type | Valid Range | Required |
* |------------|----------------------------------------------------------------|------------------------------|-------------|----------|
* | program | The program containing the circular buffer | Program & | | Yes |
* | cb_handle | ID of the circular buffer, returned by `CreateCircularBuffer` | CBHandle (uintptr_t) | | Yes |
* | total_size | New size of the circular buffer in bytes | uint32_t | | Yes |
*/
void UpdateCircularBufferTotalSize(Program &program, CBHandle cb_handle, uint32_t total_size);
/**
* Updates the page size at the specified `buffer_index` of the circular buffer at the given circular buffer handle.
*
* Return value: void
*
* | Argument | Description | Type | Valid Range | Required |
* |--------------|----------------------------------------------------------------------------------------------------------------------------|------------------------------|-------------------------------|----------|
* | program | The program containing the circular buffer | Program & | | Yes |
* | cb_handle | ID of the circular buffer, returned by `CreateCircularBuffer` | CBHandle (uintptr_t) | | Yes |
* | buffer_index | Circular buffer index to update page size. `cb_handle` must be a circular buffer that had previously programmed this index | uint8_t | 0 to NUM_CIRCULAR_BUFFERS - 1 | Yes |
* | page_size | Updated page size in bytes | uint32_t | | Yes |
*/
void UpdateCircularBufferPageSize(Program &program, CBHandle cb_handle, uint8_t buffer_index, uint32_t page_size);
/**
* Updates the address of a dynamic circular buffer. Dynamic circular buffers share the same address space as L1 buffers.
*
* Return value: void
*
* | Argument | Description | Type | Valid Range | Required |
* |-----------|------------------------------------------------------------------------------------------|------------------------------|-------------|----------|
* | program | The program containing the circular buffer | Program & | | Yes |
* | cb_handle | ID of the circular buffer, returned by `CreateCircularBuffer` | CBHandle (uintptr_t) | | Yes |
* | buffer | Dynamically allocated L1 buffer that shares address space of circular buffer `cb_handle` | const Buffer & | L1 buffer | Yes |
*/
void UpdateDynamicCircularBufferAddress(Program &program, CBHandle cb_handle, const Buffer &buffer);
/**
* Initializes a semaphore on all cores within the core range (inclusive). Each core can have up to four 32B semaphores.
*
* Return value: Semaphore address (uint32_t)
*
* | Argument | Description | Type | Valid Range | Required |
* |---------------|------------------------------------------------------|-----------------------------------------------------------|--------------|----------|
* | program | The program to which the semaphore will be added | Program & | | Yes |
* | core_spec | Range of the Tensix co-ordinates using the semaphore | const std::variant<CoreRange,CoreRangeSet> & | | Yes |
* | initial_value | Initial value of the semaphore | uint32_t | | Yes |
*/
uint32_t CreateSemaphore(Program &program, const std::variant<CoreRange,CoreRangeSet> &core_spec, uint32_t initial_value);
/**
* Allocates a DRAM or L1 buffer on device.
*
* Return value: Buffer
*
* | Argument | Description | Type | Valid Range | Required |
* |-----------------|---------------------------------------- |--------------------------|-------------|----------|
* | config | Config for the buffer; holds either an interleaved or a sharded buffer config | const std::variant<InterleavedBufferConfig, ShardedBufferConfig> & | | Yes |
*/
Buffer CreateBuffer(const std::variant<InterleavedBufferConfig, ShardedBufferConfig> & config);
/**
* Deallocates the buffer from device by marking its memory as free.
*
* Return value: void
*
* | Argument | Description | Type | Valid Range | Required |
* |----------|--------------------------------------|----------|-------------|----------|
* | buffer | The buffer to deallocate from device | Buffer & | | Yes |
*/
void DeallocateBuffer(Buffer &buffer);
// ==================================================
// COMPILE & EXECUTE KERNELS
// ==================================================
/**
* Sets runtime args for a kernel that are sent to the core during runtime. This API needs to be called to update the runtime args for the kernel.
* The same `runtime_args` vector is supplied for every core covered by `core_spec`.
*
* Return value: void
*
* | Argument | Description | Type | Valid Range | Required |
* |--------------|------------------------------------------------------------------------|--------------------------------------------------------|---------------------------------------------------------------------|----------|
* | program | The program containing kernels, circular buffers, semaphores | const Program & | | Yes |
* | kernel | ID of the kernel that will receive the runtime args | KernelHandle (uint64_t) | | Yes |
* | core_spec | Location of Tensix core(s) where the runtime args will be written | const std::variant<CoreCoord,CoreRange,CoreRangeSet> & | Any logical Tensix core coordinate(s) on which the kernel is placed | Yes |
* | runtime_args | The runtime args to be written | const std::vector<uint32_t> & | | Yes |
*/
void SetRuntimeArgs(const Program &program, KernelHandle kernel, const std::variant<CoreCoord, CoreRange, CoreRangeSet> &core_spec, const std::vector<uint32_t> &runtime_args);
/**
* Sets multiple runtime arguments of a kernel at once during runtime, each mapping to a specific core. The runtime args for each core may be unique.
*
* Return value: void
*
* | Argument | Description | Type | Valid Range | Required |
* |--------------|------------------------------------------------------------------------|--------------------------------------------------------|----------------------------------------------------------------------------|----------|
* | program | The program containing kernels, circular buffers, semaphores | const Program & | | Yes |
* | kernel | ID of the kernel that will receive the runtime args | KernelHandle (uint64_t) | | Yes |
* | core_spec | Location of Tensix core(s) where the runtime args will be written | const std::vector<CoreCoord> & | Any set of logical Tensix core coordinates on which the kernel is placed | Yes |
* | runtime_args | The runtime args to be written | const std::vector<std::vector<uint32_t>> & | Outer vector size must be equal to size of core_spec vector | Yes |
*/
void SetRuntimeArgs(const Program &program, KernelHandle kernel, const std::vector< CoreCoord > & core_spec, const std::vector< std::vector<uint32_t> > &runtime_args);
/**
* Gets the runtime args for a kernel on a given core.
*
* Return value: std::vector<uint32_t> &
* Note that the returned reference is non-const even though `program` is taken by const reference.
*
* | Argument | Description | Type | Valid Range | Required |
* |--------------|------------------------------------------------------------------------|-------------------------------|------------------------------------|----------|
* | program | The program containing kernels, circular buffers, semaphores | const Program & | | Yes |
* | kernel_id | ID of the kernel whose runtime args are being queried | KernelHandle (uint64_t) | | Yes |
* | logical_core | The location of the Tensix core whose runtime args are being queried | const CoreCoord & | Any logical Tensix core coordinate | Yes |
*/
std::vector<uint32_t>& GetRuntimeArgs(const Program &program, KernelHandle kernel_id, const CoreCoord &logical_core);
/**
* Reads a buffer from the device
*
* Return value: void
*
* | Argument | Description | Type | Valid Range | Required |
* |--------------|------------------------------------------------------------------------|-------------------------------|----------------------------------------|----------|
* | cq | The command queue object which dispatches the command to the hardware | CommandQueue & | | Yes |
* | buffer | The device buffer we are reading from | Buffer & | | Yes |
* | dst | The vector where the results that are read will be stored | vector<uint32_t> & | | Yes |
* | blocking | Whether or not this is a blocking operation | bool | Only blocking mode supported currently | Yes |
*/
void EnqueueReadBuffer(CommandQueue& cq, Buffer& buffer, vector<uint32_t>& dst, bool blocking);
/**
* Reads a buffer from the device into caller-provided memory.
*
* Return value: void
*
* | Argument | Description | Type | Valid Range | Required |
* |--------------|------------------------------------------------------------------------|-------------------------------|----------------------------------------|----------|
* | cq | The command queue object which dispatches the command to the hardware | CommandQueue & | | Yes |
* | buffer | The device buffer we are reading from | Buffer & | | Yes |
* | dst | The memory where the result will be stored | void* | | Yes |
* | blocking | Whether or not this is a blocking operation | bool | Only blocking mode supported currently | Yes |
*
* NOTE(review): no size is passed for `dst`; presumably it must be large enough to hold the whole buffer — confirm.
*/
void EnqueueReadBuffer(CommandQueue& cq, Buffer& buffer, void* dst, bool blocking);
/**
* Writes a buffer to the device
*
* Return value: void
*
* | Argument | Description | Type | Valid Range | Required |
* |--------------|------------------------------------------------------------------------|-------------------------------|------------------------------------|----------|
* | cq | The command queue object which dispatches the command to the hardware | CommandQueue & | | Yes |
* | buffer | The device buffer we are writing to | Buffer & | | Yes |
* | src | The vector we are writing to the device | vector<uint32_t> & | | Yes |
* | blocking | Whether or not this is a blocking operation | bool | | Yes |
*/
void EnqueueWriteBuffer(CommandQueue& cq, Buffer& buffer, vector<uint32_t>& src, bool blocking);
/**
* Writes caller-provided memory to a buffer on the device.
*
* Return value: void
*
* | Argument | Description | Type | Valid Range | Required |
* |--------------|------------------------------------------------------------------------|-------------------------------|------------------------------------|----------|
* | cq | The command queue object which dispatches the command to the hardware | CommandQueue & | | Yes |
* | buffer | The device buffer we are writing to | Buffer & | | Yes |
* | src | The memory we are writing to the device | const void* | | Yes |
* | blocking | Whether or not this is a blocking operation | bool | | Yes |
*
* NOTE(review): no size is passed for `src`; presumably the amount written is determined by `buffer` — confirm.
*/
void EnqueueWriteBuffer(CommandQueue& cq, Buffer& buffer, const void* src, bool blocking);
/**
* Writes a program to the device and launches it.
*
* Return value: void
*
* | Argument | Description | Type | Valid Range | Required |
* |--------------|------------------------------------------------------------------------|-------------------------------|------------------------------------|----------|
* | cq | The command queue object which dispatches the command to the hardware | CommandQueue & | | Yes |
* | program | The program that will be executed on the device that cq is bound to | Program & | | Yes |
* | blocking | Whether or not this is a blocking operation | bool | | Yes |
* | trace | The trace object which represents the history of previously issued | std::optional<std::reference_wrapper<Trace>> | | No (defaults to empty) |
* | | commands | | | |
*/
void EnqueueProgram(CommandQueue& cq, Program& program, bool blocking, std::optional<std::reference_wrapper<Trace>> trace = {});
/**
* Blocks until all previously dispatched commands on the device have completed.
*
* Return value: void
*
* | Argument | Description | Type | Valid Range | Required |
* |--------------|------------------------------------------------------------------------|-------------------------------|------------------------------------|----------|
* | cq | The command queue object which dispatches the command to the hardware | CommandQueue & | | Yes |
*/
void Finish(CommandQueue& cq);
/**
* Creates a trace object which can be used to record commands that have been run. This
* trace can later be replayed without the further need to create more commands.
*
* Return value: Trace
*
* | Argument | Description | Type | Valid Range | Required |
* |--------------|------------------------------------------------------------------------|-------------------------------|------------------------------------|----------|
* | cq | The command queue object which dispatches the command to the hardware | CommandQueue & | | Yes |
*/
Trace BeginTrace(CommandQueue& cq);
/**
* Completes a trace and allows it to be replayed. WARNING: Once a trace has been
* completed for a given command queue, the command queue can no longer be used in eager
* mode (the default, non-tracing mode). Doing so would be undefined behaviour.
*
* Return value: void
*
* | Argument | Description | Type | Valid Range | Required |
* |--------------|------------------------------------------------------------------------|-------------------------------|------------------------------------|----------|
* | trace | The trace object which represents the history of previously issued | Trace & | | Yes |
* | | commands | | | |
*/
void EndTrace(Trace& trace);
/**
* Enqueues a trace of previously generated commands and data.
*
* Return value: void
*
* | Argument | Description | Type | Valid Range | Required |
* |--------------|------------------------------------------------------------------------|-------------------------------|------------------------------------|----------|
* | trace | The trace object which represents the history of previously issued | Trace & | | Yes |
* | | commands | | | |
* | blocking | Whether or not this is a blocking operation | bool | | Yes |
*/
void EnqueueTrace(Trace& trace, bool blocking);
/**
* Reads device-side profiler data and dumps the results into the device-side CSV log.
*
* This function only works in PROFILER builds. Please refer to the "Device Program Profiler" section for more information.
*
* Return value: void
*
* | Argument | Description | Type | Valid Range | Required |
* |---------------|---------------------------------------------------|-----------------|---------------------------|----------|
* | device | The device holding the program being profiled. | Device * | | Yes |
* | program | The program being profiled. | const Program & | | Yes |
* */
void DumpDeviceProfileResults(Device *device, const Program &program);
} // namespace tt_metal
} // namespace tt