inference_trt.py (forked from leoxiaobin/deep-high-resolution-net.pytorch)
import argparse

import numpy as np
import pycuda.autoinit  # noqa: F401 -- importing this initializes the CUDA driver context
import pycuda.driver as cuda
import tensorrt as trt


def parse_args():
    parser = argparse.ArgumentParser(description='Train keypoints network')
    # general
    parser.add_argument('--cfg', type=str, required=True)
    parser.add_argument('--videoFile', type=str, required=True)
    parser.add_argument('--outputDir', type=str, default='/output/')
    parser.add_argument('--inferenceFps', type=int, default=10)
    parser.add_argument('--writeBoxFrames', action='store_true')
    parser.add_argument('opts',
                        help='Modify config options using the command-line',
                        default=None,
                        nargs=argparse.REMAINDER)
    args = parser.parse_args()
    return args


def allocate_buffers(engine, batch_size, data_type):
    """
    Allocate page-locked host buffers and device buffers for the engine's input and output.

    Args:
        engine: The deserialized TensorRT engine.
        batch_size: The batch size used at execution time.
        data_type: The data type of the input and output, for example trt.float32.

    Returns:
        h_input_1: Input buffer on the host.
        d_input_1: Input buffer on the device.
        h_output: Output buffer on the host.
        d_output: Output buffer on the device.
        stream: The CUDA stream used for transfers and execution.
    """
    # Determine dimensions and create page-locked memory buffers
    # (which won't be swapped to disk) to hold host inputs/outputs.
    h_input_1 = cuda.pagelocked_empty(batch_size * trt.volume(engine.get_binding_shape(0)),
                                      dtype=trt.nptype(data_type))
    h_output = cuda.pagelocked_empty(batch_size * trt.volume(engine.get_binding_shape(1)),
                                     dtype=trt.nptype(data_type))
    # Allocate device memory for inputs and outputs.
    d_input_1 = cuda.mem_alloc(h_input_1.nbytes)
    d_output = cuda.mem_alloc(h_output.nbytes)
    # Create a stream in which to copy inputs/outputs and run inference.
    stream = cuda.Stream()
    return h_input_1, d_input_1, h_output, d_output, stream
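

# Illustrative helper (a sketch, not part of the original script): shows how a
# serialized engine plan, produced elsewhere (for example by trtexec or an ONNX
# export step), could be deserialized before calling allocate_buffers(). The
# helper and its engine_path argument are hypothetical additions.
def load_engine(engine_path):
    """Deserialize a TensorRT engine from a serialized plan file."""
    trt_logger = trt.Logger(trt.Logger.WARNING)
    with open(engine_path, 'rb') as f, trt.Runtime(trt_logger) as runtime:
        return runtime.deserialize_cuda_engine(f.read())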


def load_images_to_buffer(pics, pagelocked_buffer):
    """Flatten the input images and copy them into the page-locked host buffer."""
    preprocessed = np.asarray(pics).ravel()
    np.copyto(pagelocked_buffer, preprocessed)


def do_inference(engine, pics_1, h_input_1, d_input_1, h_output, d_output, stream, batch_size, height, width):
    """
    Run inference on the TensorRT engine.

    Args:
        engine: The deserialized TensorRT engine.
        pics_1: Input images for the model.
        h_input_1: Input buffer on the host.
        d_input_1: Input buffer on the device.
        h_output: Output buffer on the host.
        d_output: Output buffer on the device.
        stream: The CUDA stream.
        batch_size: The batch size used at execution time.
        height: Height of the output maps.
        width: Width of the output maps.

    Returns:
        The output reshaped to (batch_size, channels, height, width).
    """
    load_images_to_buffer(pics_1, h_input_1)

    with engine.create_execution_context() as context:
        # Transfer input data to the GPU.
        cuda.memcpy_htod_async(d_input_1, h_input_1, stream)
        # Run inference (attach a profiler so per-layer timings are reported).
        context.profiler = trt.Profiler()
        context.execute(batch_size=batch_size, bindings=[int(d_input_1), int(d_output)])
        # Transfer predictions back from the GPU.
        cuda.memcpy_dtoh_async(h_output, d_output, stream)
        # Synchronize the stream.
        stream.synchronize()
        # Reshape the flat host output into (batch, channels, height, width) and return it.
        out = h_output.reshape((batch_size, -1, height, width))
        return out
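

# Example usage (a sketch, not part of the original file): assumes a serialized
# HRNet pose engine saved at 'pose_hrnet.engine' with a 1x3x256x192 float32
# input binding and 64x48 output heatmaps. The path, input shape and heatmap
# size are illustrative assumptions, not values taken from the repository config.
if __name__ == '__main__':
    trt_engine = load_engine('pose_hrnet.engine')
    h_in, d_in, h_out, d_out, stream = allocate_buffers(trt_engine, batch_size=1, data_type=trt.float32)
    # A random batch stands in for a preprocessed video frame.
    dummy_batch = np.random.rand(1, 3, 256, 192).astype(np.float32)
    heatmaps = do_inference(trt_engine, dummy_batch, h_in, d_in, h_out, d_out, stream,
                            batch_size=1, height=64, width=48)
    print('Output heatmap tensor shape:', heatmaps.shape)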