Skip to content

Commit

Permalink
PyTorch NNAPI integration prototype (pytorch#46780)
Browse files Browse the repository at this point in the history
Summary:
Pull Request resolved: pytorch#46780

This is in prototype status, but pretty functional.  There are two major
parts.

- Model converter.  This is a pure Python component that consumes a
  model in TorchScript format, converts the operations into NNAPI
  semantics, and serializes the model in a custom format.  It then wraps
  the result in a new TorchScript model that can invoke NNAPI under the
  hood.
- Runtime.  This is a TorchBind object that deserializes the model and
  sends the result to NNAPI.  This is fairly simple since the serialized
  format is basically just a list of NNAPI calls to make, so most of the
  code is spent on bounds checking.

A few notes on the design.
- Currently, all tensor sizes need to be fixed, and those fixed sizes
  are burned directly into the serialized model.  This will probably
  need to change.  NNAPI supports variable-sized tensors, but the
  important hardware backends do not.  However, we're seeing use cases
  crop up where the input size is not known until around the time that
  the model is loaded (for example, it might depend on the camera aspect
  ratio).  I think the proper fix here is to remove the code in the
  converter that eagerly calculates the sizes of the intermediate
  tensors and replace it with a code generator that will generate some
  TorchScript code that will perform those calculations at model load
  time.  This way, we will be able to support models that have
  variable-sized inputs while still only showing fixed-sized operands to
  NNAPI.
- The important hardware backends want operands to be in NHWC order, but
  PyTorch natively represents all tensors as NCHW.  The strategy for
  this is to keep NCHW during most of the conversion process, but track
  an additional value per operand representing the "dimension order".
  The dimension order gets propagated through convolutions and pointwise
  ops.  When we're ready to serialize the model, we reorder the
  dimensions for "channels last" operands to NHWC.

Test Plan:
Some local testing with FB prod models.  I'll need to add some examples
and automated tests.

Reviewed By: iseeyuan

Differential Revision: D24574040

Pulled By: dreiss

fbshipit-source-id: 6adc8571b234877ee3666ec0c0de24da35c38a1f
  • Loading branch information
dreiss authored and facebook-github-bot committed Nov 6, 2020
1 parent ad8c0e5 commit 9a9383e
Show file tree
Hide file tree
Showing 14 changed files with 2,712 additions and 1 deletion.
3 changes: 2 additions & 1 deletion aten/src/ATen/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,8 @@ append_filelist("jit_core_headers" ATen_CORE_HEADERS)
append_filelist("jit_core_sources" ATen_CORE_SRCS)

add_subdirectory(quantized)
set(all_cpu_cpp ${base_cpp} ${ATen_CORE_SRCS} ${native_cpp} ${native_sparse_cpp} ${native_quantized_cpp} ${native_mkl_cpp} ${native_mkldnn_cpp} ${native_utils_cpp} ${native_xnnpack} ${generated_cpp} ${core_generated_cpp} ${ATen_CPU_SRCS} ${ATen_QUANTIZED_SRCS} ${cpu_kernel_cpp})
add_subdirectory(nnapi)
set(all_cpu_cpp ${base_cpp} ${ATen_CORE_SRCS} ${native_cpp} ${native_sparse_cpp} ${native_quantized_cpp} ${native_mkl_cpp} ${native_mkldnn_cpp} ${native_utils_cpp} ${native_xnnpack} ${generated_cpp} ${core_generated_cpp} ${ATen_CPU_SRCS} ${ATen_QUANTIZED_SRCS} ${ATen_NNAPI_SRCS} ${cpu_kernel_cpp})
if(AT_MKL_ENABLED)
set(all_cpu_cpp ${all_cpu_cpp} ${mkl_cpp})
endif()
Expand Down
21 changes: 21 additions & 0 deletions aten/src/ATen/nnapi/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# Build configuration for the NNAPI binding. Two mutually exclusive modes:
#   * PYTORCH_NNAPI_STANDALONE set:   build a separate shared library
#     (pytorch_nnapi) against an already-built Torch found via find_package.
#   * PYTORCH_NNAPI_STANDALONE unset: only collect the sources and hand them
#     to the parent directory through ATen_NNAPI_SRCS.
#
# Define this to build the NNAPI binding out of tree.
if(PYTORCH_NNAPI_STANDALONE)
cmake_minimum_required(VERSION 3.5 FATAL_ERROR)
project(pytorch_nnapi)

set(CMAKE_CXX_STANDARD 14)
find_package(Torch REQUIRED)

# Standalone builds list their sources explicitly rather than globbing.
set(NNAPI_SRCS
nnapi_bind.cpp
nnapi_wrapper.cpp
nnapi_model_loader.cpp
)

add_library(pytorch_nnapi SHARED ${NNAPI_SRCS})
target_link_libraries(pytorch_nnapi torch)
else()
# Building within the PyTorch tree.
# Every .cpp file in this directory is picked up; PARENT_SCOPE makes the list
# visible to the CMakeLists.txt that called add_subdirectory() on this one.
file(GLOB ATen_NNAPI_SRCS "*.cpp")
set(ATen_NNAPI_SRCS ${ATen_NNAPI_SRCS} PARENT_SCOPE)
endif()
84 changes: 84 additions & 0 deletions aten/src/ATen/nnapi/NeuralNetworks.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
/*
* Copyright (C) 2017 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

/*
Most of NeuralNetworks.h has been stripped for simplicity.
We don't need any of the function declarations since
we call them all through dlopen/dlsym.
Operation codes are pulled directly from serialized models.
*/

#ifndef MINIMAL_NEURAL_NETWORKS_H
#define MINIMAL_NEURAL_NETWORKS_H

#include <stdint.h>

/*
 * Result codes for NNAPI calls.
 * ANEURALNETWORKS_NO_ERROR (0) is the success value; every other enumerator
 * names a distinct failure. The numeric values are spelled out explicitly.
 * NOTE(review): presumably they must stay in sync with the Android NDK's
 * NeuralNetworks.h that this header was stripped down from - confirm before
 * adding or renumbering entries.
 */
typedef enum {
ANEURALNETWORKS_NO_ERROR = 0,
ANEURALNETWORKS_OUT_OF_MEMORY = 1,
ANEURALNETWORKS_INCOMPLETE = 2,
ANEURALNETWORKS_UNEXPECTED_NULL = 3,
ANEURALNETWORKS_BAD_DATA = 4,
ANEURALNETWORKS_OP_FAILED = 5,
ANEURALNETWORKS_BAD_STATE = 6,
ANEURALNETWORKS_UNMAPPABLE = 7,
ANEURALNETWORKS_OUTPUT_INSUFFICIENT_SIZE = 8,
ANEURALNETWORKS_UNAVAILABLE_DEVICE = 9,
} ResultCode;

/*
 * Operand type codes.
 * NOTE(review): presumably these are the values carried in
 * ANeuralNetworksOperandType::type, with the TENSOR_* entries describing
 * tensor operands and the remaining entries describing scalars - confirm
 * against the Android NNAPI reference.
 */
typedef enum {
ANEURALNETWORKS_FLOAT32 = 0,
ANEURALNETWORKS_INT32 = 1,
ANEURALNETWORKS_UINT32 = 2,
ANEURALNETWORKS_TENSOR_FLOAT32 = 3,
ANEURALNETWORKS_TENSOR_INT32 = 4,
ANEURALNETWORKS_TENSOR_QUANT8_ASYMM = 5,
ANEURALNETWORKS_BOOL = 6,
ANEURALNETWORKS_TENSOR_QUANT16_SYMM = 7,
ANEURALNETWORKS_TENSOR_FLOAT16 = 8,
ANEURALNETWORKS_TENSOR_BOOL8 = 9,
ANEURALNETWORKS_FLOAT16 = 10,
ANEURALNETWORKS_TENSOR_QUANT8_SYMM_PER_CHANNEL = 11,
ANEURALNETWORKS_TENSOR_QUANT16_ASYMM = 12,
ANEURALNETWORKS_TENSOR_QUANT8_SYMM = 13,
} OperandCode;

/*
 * Execution preference codes.
 * NOTE(review): presumably the values accepted as the `preference` argument
 * of ANeuralNetworksCompilation_setPreference - confirm against the Android
 * NNAPI reference.
 */
typedef enum {
ANEURALNETWORKS_PREFER_LOW_POWER = 0,
ANEURALNETWORKS_PREFER_FAST_SINGLE_ANSWER = 1,
ANEURALNETWORKS_PREFER_SUSTAINED_SPEED = 2,
} PreferenceCode;

/*
 * Opaque NNAPI handle types. The structs are only declared here, never
 * defined, so code including this header can pass pointers to them around
 * but cannot look inside or allocate them by value.
 */
typedef struct ANeuralNetworksMemory ANeuralNetworksMemory;
typedef struct ANeuralNetworksModel ANeuralNetworksModel;
typedef struct ANeuralNetworksDevice ANeuralNetworksDevice;
typedef struct ANeuralNetworksCompilation ANeuralNetworksCompilation;
typedef struct ANeuralNetworksExecution ANeuralNetworksExecution;
typedef struct ANeuralNetworksEvent ANeuralNetworksEvent;

/*
 * Integer code identifying an operation. No enum of operations is declared
 * in this stripped header because, as noted at the top of the file, operation
 * codes are pulled directly from serialized models.
 */
typedef int32_t ANeuralNetworksOperationType;

/*
 * Describes the type of one operand.
 * NOTE(review): the field layout presumably has to match the struct of the
 * same name in the Android NDK, since pointers to it are handed to functions
 * resolved from libneuralnetworks.so - do not reorder or resize fields.
 */
typedef struct ANeuralNetworksOperandType {
/* Type code; presumably one of the OperandCode values - confirm. */
int32_t type;
/* Number of entries in `dimensions`. */
uint32_t dimensionCount;
/* Not owned by this struct; presumably points at dimensionCount values. */
const uint32_t* dimensions;
/* scale / zeroPoint: presumably quantization parameters used only by the
   quantized tensor types - confirm against the NNAPI reference. */
float scale;
int32_t zeroPoint;
} ANeuralNetworksOperandType;

#endif // MINIMAL_NEURAL_NETWORKS_H
161 changes: 161 additions & 0 deletions aten/src/ATen/nnapi/codegen.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,161 @@
#!/usr/bin/env python3
"""
Code generator for NNAPI wrapper. We can't link directly against
libneuralnetworks.so because we want PyTorch to work on Android
devices that don't have it available. Instead, we generate a wrapper
that opens libneuralnetworks.so with dlopen and finds the functions
we need with dlsym. We also generate a "check" wrapper that checks
return values and throws C++ exceptions on errors.
"""
import sys
import re
import pathlib
import textwrap


# Text placed at the very top of both generated files (nnapi_wrapper.h and
# nnapi_wrapper.cpp): a license block followed by a "this file is generated"
# marker. It is emitted verbatim, so any edit here changes the generated output.
PREFIX = """\
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// This file is generated by nnapi/codegen.py
"""


# Table of the NNAPI entry points to wrap. Each entry is a 3-tuple of strings:
#   (C return type, exported symbol name, C parameter list)
# main() unpacks entries in exactly that order. The parameter list is pasted
# verbatim into the generated declarations, and the argument names forwarded
# to the real function are recovered from it with a regex that takes the last
# identifier before each comma / end of string - so every parameter must be
# named and must end with its name.
# main() only emits a check wrapper for the return types "int" and "void".
NNAPI_FUNCTIONS = [
("int", "ANeuralNetworks_getDeviceCount", "uint32_t* numDevices"), # noqa: B950
("int", "ANeuralNetworks_getDevice", "uint32_t devIndex, ANeuralNetworksDevice** device"), # noqa: B950
("int", "ANeuralNetworksDevice_getName", "const ANeuralNetworksDevice* device, const char** name"), # noqa: B950
("int", "ANeuralNetworksDevice_getVersion", "const ANeuralNetworksDevice* device, const char** version"), # noqa: B950
("int", "ANeuralNetworksDevice_getFeatureLevel", "const ANeuralNetworksDevice* device, int64_t* featureLevel"), # noqa: B950
("int", "ANeuralNetworksModel_getSupportedOperationsForDevices", " const ANeuralNetworksModel* model, const ANeuralNetworksDevice* const* devices, uint32_t numDevices, bool* supportedOps"), # noqa: B950
("int", "ANeuralNetworksCompilation_createForDevices", "ANeuralNetworksModel* model, const ANeuralNetworksDevice* const* devices, uint32_t numDevices, ANeuralNetworksCompilation** compilation"), # noqa: B950
("int", "ANeuralNetworksExecution_compute", "ANeuralNetworksExecution* execution"), # noqa: B950
("int", "ANeuralNetworksMemory_createFromFd", "size_t size, int protect, int fd, size_t offset, ANeuralNetworksMemory** memory"), # noqa: B950
("void", "ANeuralNetworksMemory_free", "ANeuralNetworksMemory* memory"), # noqa: B950
("int", "ANeuralNetworksModel_create", "ANeuralNetworksModel** model"), # noqa: B950
("void", "ANeuralNetworksModel_free", "ANeuralNetworksModel* model"), # noqa: B950
("int", "ANeuralNetworksModel_finish", "ANeuralNetworksModel* model"), # noqa: B950
("int", "ANeuralNetworksModel_addOperand", "ANeuralNetworksModel* model, const ANeuralNetworksOperandType* type"), # noqa: B950
("int", "ANeuralNetworksModel_setOperandValue", "ANeuralNetworksModel* model, int32_t index, const void* buffer, size_t length"), # noqa: B950
("int", "ANeuralNetworksModel_setOperandValueFromMemory", "ANeuralNetworksModel* model, int32_t index, const ANeuralNetworksMemory* memory, size_t offset, size_t length"), # noqa: B950
("int", "ANeuralNetworksModel_addOperation", "ANeuralNetworksModel* model, ANeuralNetworksOperationType type, uint32_t inputCount, const uint32_t* inputs, uint32_t outputCount, const uint32_t* outputs"), # noqa: B950
("int", "ANeuralNetworksModel_identifyInputsAndOutputs", "ANeuralNetworksModel* model, uint32_t inputCount, const uint32_t* inputs, uint32_t outputCount, const uint32_t* outputs"), # noqa: B950
("int", "ANeuralNetworksModel_relaxComputationFloat32toFloat16", "ANeuralNetworksModel* model, bool allow"), # noqa: B950
("int", "ANeuralNetworksCompilation_create", "ANeuralNetworksModel* model, ANeuralNetworksCompilation** compilation"), # noqa: B950
("void", "ANeuralNetworksCompilation_free", "ANeuralNetworksCompilation* compilation"), # noqa: B950
("int", "ANeuralNetworksCompilation_setPreference", "ANeuralNetworksCompilation* compilation, int32_t preference"), # noqa: B950
("int", "ANeuralNetworksCompilation_finish", "ANeuralNetworksCompilation* compilation"), # noqa: B950
("int", "ANeuralNetworksExecution_create", "ANeuralNetworksCompilation* compilation, ANeuralNetworksExecution** execution"), # noqa: B950
("void", "ANeuralNetworksExecution_free", "ANeuralNetworksExecution* execution"), # noqa: B950
("int", "ANeuralNetworksExecution_setInput", "ANeuralNetworksExecution* execution, int32_t index, const ANeuralNetworksOperandType* type, const void* buffer, size_t length"), # noqa: B950
("int", "ANeuralNetworksExecution_setInputFromMemory", "ANeuralNetworksExecution* execution, int32_t index, const ANeuralNetworksOperandType* type, const ANeuralNetworksMemory* memory, size_t offset, size_t length"), # noqa: B950
("int", "ANeuralNetworksExecution_setOutput", "ANeuralNetworksExecution* execution, int32_t index, const ANeuralNetworksOperandType* type, void* buffer, size_t length"), # noqa: B950
("int", "ANeuralNetworksExecution_setOutputFromMemory", "ANeuralNetworksExecution* execution, int32_t index, const ANeuralNetworksOperandType* type, const ANeuralNetworksMemory* memory, size_t offset, size_t length"), # noqa: B950
("int", "ANeuralNetworksExecution_startCompute", "ANeuralNetworksExecution* execution, ANeuralNetworksEvent** event"), # noqa: B950
("int", "ANeuralNetworksEvent_wait", "ANeuralNetworksEvent* event"), # noqa: B950
("void", "ANeuralNetworksEvent_free", "ANeuralNetworksEvent* event"), # noqa: B950
("int", "ANeuralNetworksExecution_getOutputOperandRank", "ANeuralNetworksExecution* execution, int32_t index, uint32_t* rank"), # noqa: B950
("int", "ANeuralNetworksExecution_getOutputOperandDimensions", "ANeuralNetworksExecution* execution, int32_t index, uint32_t* dimensions"), # noqa: B950
]


def main(argv):
    """Generate ``nnapi_wrapper.h`` and ``nnapi_wrapper.cpp`` next to this script.

    For every ``(return type, name, parameter list)`` entry in
    ``NNAPI_FUNCTIONS`` three pieces of C/C++ are produced:

    * a function-pointer member of ``struct nnapi_wrapper``;
    * the loader lines that fill the raw table via ``dlsym`` and the checked
      table with the matching ``check_*`` function;
    * the ``check_*`` function itself, which enforces that the symbol was
      resolved and, for ``int`` functions, that the call returned
      ``ANEURALNETWORKS_NO_ERROR``.

    Args:
        argv: Command-line arguments. Accepted so the script can be invoked as
            ``main(sys.argv)``; the value is not used.

    Returns:
        None. The two output files are written as a side effect.

    Raises:
        ValueError: If an entry declares a return type other than ``"int"`` or
            ``"void"``. Without this check no ``check_*`` function would be
            emitted for the entry while the loader would still reference it,
            and the mistake would only surface as a C++ compile error in the
            generated file. The error is raised before anything is written.
    """
    struct_members = []
    load_functions = []
    define_checks = []

    for ret, name, args in NNAPI_FUNCTIONS:
        # Fail fast on table entries this generator cannot wrap.
        if ret not in ("int", "void"):
            raise ValueError(
                f"Unsupported return type {ret!r} for {name}; "
                "only 'int' and 'void' are handled."
            )

        # Struct members drop the common prefix: ANeuralNetworksModel_create
        # becomes Model_create, ANeuralNetworks_getDevice becomes _getDevice.
        short_name = name.replace("ANeuralNetworks", "", 1)

        struct_members.append(f" {ret}(*{short_name})({args});")

        load_functions.append(f' *(void**)&nnapi_.{short_name} = dlsym(handle, "{name}");')
        load_functions.append(f' check_nnapi_.{short_name} = check_{short_name};')

        # Recover the bare argument names: the last identifier before each
        # comma (or the end of the parameter list), joined as "a,b,c".
        call_args = "".join(re.findall(r"\w+(?:,|$)", args))
        if ret == "void":
            define_checks.append(textwrap.dedent(f"""\
                {ret} check_{short_name}({args}) {{
                CAFFE_ENFORCE(nnapi_.{short_name});
                nnapi_.{short_name}({call_args});
                }}"""))
        else:
            define_checks.append(textwrap.dedent(f"""\
                {ret} check_{short_name}({args}) {{
                CAFFE_ENFORCE(nnapi_.{short_name});
                int ret = nnapi_.{short_name}({call_args});
                // TODO: Maybe add better logging here.
                CAFFE_ENFORCE(ret == ANEURALNETWORKS_NO_ERROR);
                return ret;
                }}"""))

    # Outputs live alongside this script, not in the current working directory.
    out_dir = pathlib.Path(__file__).parent

    # Placeholders are substituted after dedent so the inserted lines keep
    # their own leading whitespace.
    (out_dir / "nnapi_wrapper.h").write_text(
        PREFIX +
        textwrap.dedent("""\
            #ifndef NNAPI_WRAPPER_H_
            #define NNAPI_WRAPPER_H_
            #include <stddef.h>
            #include <stdint.h>
            #include <ATen/nnapi/NeuralNetworks.h>
            struct nnapi_wrapper {
            __STRUCT_MEMBERS__
            };
            #ifdef __cplusplus
            void nnapi_wrapper_load(struct nnapi_wrapper** nnapi, struct nnapi_wrapper** check_nnapi);
            #endif
            #endif
            """)
        .replace("__STRUCT_MEMBERS__", "\n".join(struct_members))
    )

    (out_dir / "nnapi_wrapper.cpp").write_text(
        PREFIX +
        textwrap.dedent("""\
            #ifndef _WIN32
            #include <dlfcn.h>
            #endif
            #include <ATen/nnapi/nnapi_wrapper.h>
            #include <c10/util/Logging.h>
            static int loaded = 0;
            static struct nnapi_wrapper nnapi_;
            static struct nnapi_wrapper check_nnapi_;
            __DEFINE_CHECK_FUNCTIONS__
            void nnapi_wrapper_load(struct nnapi_wrapper** nnapi, struct nnapi_wrapper** check_nnapi) {
            #ifdef _WIN32
            TORCH_CHECK(false, "Running NNAPI models is not supported on Windows.");
            #else
            if (!loaded) {
            // Clear error flag.
            dlerror();
            void* handle = dlopen("libneuralnetworks.so", RTLD_LAZY | RTLD_LOCAL);
            CAFFE_ENFORCE(handle, "Failed to load libneuralnetworks.so ", dlerror());
            __LOAD_FUNCTIONS__
            loaded = 1;
            }
            *nnapi = &nnapi_;
            *check_nnapi = &check_nnapi_;
            #endif
            }
            """)
        .replace("__DEFINE_CHECK_FUNCTIONS__", "\n".join(define_checks))
        .replace("__LOAD_FUNCTIONS__", "\n".join(load_functions))
    )


# Script entry point: run the generator and use main()'s return value as the
# process exit status (None, which main() returns, means success).
if __name__ == "__main__":
    raise SystemExit(main(sys.argv))
Loading

0 comments on commit 9a9383e

Please sign in to comment.