-
Notifications
You must be signed in to change notification settings - Fork 511
/
Copy patharm_vela.py
125 lines (100 loc) · 4.6 KB
/
arm_vela.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
# Copyright 2023-2025 Arm Limited and/or its affiliates.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
# pyre-unsafe
import os
import struct
import tempfile
from typing import List
import numpy as np
try:
from ethosu.vela import vela # type: ignore
has_vela = True
except ImportError:
has_vela = False
# Pack either input or output tensor block, compose the related arrays into
# per-io structs to simplify runtime use.
def vela_bin_pack_io(prefix, data, shape_order=None):
    """Serialise the Vela IO descriptors selected by ``prefix`` into bytes.

    The result starts with a little-endian int32 holding the number of
    entries in ``data[prefix + "_shape"]``, followed by one 28-byte record
    per visited tensor: four int32 shape dimensions (zero-padded on the
    right when the shape has fewer than four dimensions), then the int32
    element size, offset and region.

    Args:
        prefix: Key prefix choosing which arrays to read, e.g. ``"input"``
            or ``"output"``.
        data: Mapping providing ``<prefix>_shape``, ``<prefix>_elem_size``,
            ``<prefix>_offset`` and ``<prefix>_region``, each indexable by
            tensor index. Every shape entry must support ``len()`` and
            ``tolist()``.
        shape_order: Optional sequence of tensor indices giving the order in
            which records are emitted. When falsy, records are emitted in
            natural index order.

    Returns:
        The packed count header followed by the packed records.
    """
    shapes = data[prefix + "_shape"]
    indices = shape_order if shape_order else range(len(shapes))

    # The count header always reflects the number of available shapes.
    chunks = [struct.pack("<i", len(shapes))]
    for idx in indices:
        dims = shapes[idx]
        elem_size = data[prefix + "_elem_size"][idx]
        offset = data[prefix + "_offset"][idx]
        region = data[prefix + "_region"][idx]
        assert len(dims) <= 4
        # Right-pad the shape with zeros so every record has four dimensions.
        padded_dims = dims.tolist() + [0] * (4 - len(dims))
        chunks.append(
            struct.pack("<iiiiiii", *padded_dims, elem_size, offset, region)
        )
    return b"".join(chunks)
# Output via Vela to binary stream for ArmBackendEthosU
# WARNING: Do not change this without changing VelaBinStream.cpp as that
# function consumes this format and the two need to align.
def vela_compile(
    tosa_flatbuffer: bytes, args: List[str], shape_order=None, verbose: bool = False
):
    """
    Compile a TOSA graph to a binary stream for ArmBackendEthosU using Vela.

    The flatbuffer is written to a temporary directory, Vela is invoked on
    it, and the ``.npz`` file Vela produces is repacked into a flat sequence
    of named blocks. Each block is laid out as:

    - 16 byte null terminated block name (padded to 16 if the name is shorter)
    - 4 bytes of little-endian int32 block length and 12 bytes of 0's
    - block data (zero padded to 16 byte alignment at the end)

    Blocks are emitted in this order: ``vela_bin_stream``, ``cmd_data``,
    ``weight_data``, ``scratch_data``, ``inputs``, ``outputs``,
    ``vela_end_stream``.

    Args:
        tosa_flatbuffer: Serialised TOSA graph handed to Vela as ``out.tosa``.
        args: Vela command line flags. An entry may hold several
            space-separated flags. The list is not modified.
        shape_order: Optional order of input indices, forwarded to
            ``vela_bin_pack_io`` for the ``inputs`` block only.
        verbose: When true, ``--verbose-all`` is passed to Vela.

    Returns:
        The concatenated blocks as ``bytes``.

    Raises:
        RuntimeError: If the ethos-u-vela package could not be imported, or
            if the first element of ``scratch_shape`` in Vela's output is
            not an ``np.int64``.
    """
    if not has_vela:
        raise RuntimeError(
            "ethos-u-vela pip package couldn't be imported. Make sure it's installed!"
        )

    # Split multi-flag entries into individual tokens. Building a new list
    # keeps the caller's ``args`` untouched, and empty tokens caused by
    # repeated spaces are dropped rather than handed to Vela.
    user_args = [token for arg in args for token in arg.split(" ") if token]

    with tempfile.TemporaryDirectory() as tmpdir:
        tosaname = "out.tosa"
        tosa_path = os.path.join(tmpdir, tosaname)
        with open(tosa_path, "wb") as f:
            f.write(tosa_flatbuffer)

        # invoke vela
        output_dir = os.path.join(tmpdir, "output")
        # The paths are added after splitting so that a temporary directory
        # whose name contains spaces is still passed as a single argument.
        vela_args = user_args + [f"--output-dir={output_dir}", tosa_path]
        if verbose:
            vela_args.append("--verbose-all")
        vela.main(vela_args)

        # The name of the produced npz depends on the flags. Only the
        # caller's flags are inspected so the temporary path cannot match.
        if any("ethos-u85" in arg for arg in user_args) or any(
            "debug-force-regor" in arg for arg in user_args
        ):
            np_path = os.path.join(output_dir, "out_vela.npz")
        else:
            np_path = os.path.join(output_dir, "out_sg0_vela.npz")

        with np.load(np_path, allow_pickle=False) as data:
            # Construct our modified output_blocks with data in a form easily
            # digested on the device side
            bin_blocks = {"vela_bin_stream": b""}

            # copy command data through unmodified
            bin_blocks["cmd_data"] = data["cmd_data"].tobytes()

            # copy weight data through unmodified
            bin_blocks["weight_data"] = data["weight_data"].tobytes()

            # Add a block for scratch, inputs and outputs; scratch shape is a 1 element
            # array giving us size in bytes so extract this and add a block of 0's.
            # Currently we preallocated this on the host to provide SRAM for computation.
            scratch_size = data["scratch_shape"][0]
            if not isinstance(scratch_size, np.int64):
                raise RuntimeError("Expected scratch to be int64")
            bin_blocks["scratch_data"] = b"\x00" * int(scratch_size)

            # Capture inputs and outputs
            bin_blocks["inputs"] = vela_bin_pack_io("input", data, shape_order)
            bin_blocks["outputs"] = vela_bin_pack_io("output", data)

            bin_blocks["vela_end_stream"] = b""

        # Emit every block in insertion order; a single join avoids
        # re-copying the (potentially large) weight data per block.
        return b"".join(
            _vela_bin_block(name, payload) for name, payload in bin_blocks.items()
        )


def _vela_bin_block(name: str, payload: bytes) -> bytes:
    """Serialise one block: 16 byte name, 16 byte length header, padded payload."""
    # Names are cut to 15 bytes so the field is always null terminated.
    name_field = bytes(name, "utf8")[:15]
    name_field = name_field + b"\x00" * (16 - len(name_field))
    # We need the actual unpadded block lengths for hw setup
    length_field = struct.pack("<iiii", len(payload), 0, 0, 0)
    # Pad block data to multiple of 16 bytes (no padding when already aligned)
    padding = b"\x00" * (-len(payload) % 16)
    return name_field + length_field + payload + padding