From 21d65fbe795a062f2fd469335e9e2bfa4a647a23 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9D=8E=E5=AF=85?= <liyin@xiaomi.com> Date: Wed, 4 Jul 2018 14:24:26 +0800 Subject: [PATCH] sort benchmarks in python code --- aibench/benchmark/benchmark.cc | 143 ++++++++++++++---------- aibench/executors/BUILD | 1 - aibench/executors/snpe/snpe_executor.cc | 28 +++-- aibench/executors/snpe/snpe_executor.h | 2 - tools/bazel.rc | 3 + tools/benchmark.py | 1 + tools/google-format.sh | 7 ++ tools/model_list.py | 22 ++++ tools/power.sh | 111 ++++++++++++++++++ tools/sh_commands.py | 23 +++- 10 files changed, 266 insertions(+), 75 deletions(-) create mode 100755 tools/google-format.sh create mode 100644 tools/model_list.py create mode 100755 tools/power.sh diff --git a/aibench/benchmark/benchmark.cc b/aibench/benchmark/benchmark.cc index b96061c..1424444 100644 --- a/aibench/benchmark/benchmark.cc +++ b/aibench/benchmark/benchmark.cc @@ -75,38 +75,42 @@ Benchmark::Benchmark(BaseExecutor *executor, input_shapes_(input_shapes), output_names_(output_names), output_shapes_(output_shapes) { - if (input_names.size() != input_shapes.size() - || (input_files.size() != input_shapes.size() && input_files.size() > 0) - || output_names.size() != output_shapes.size()) { - printf("size of input_names(%d), input_files(%d) and input_shapes(%d) " - "should be equal. sizeof output_names(%d) and output_shapes(%d) " - "should be equal.\n", - static_cast<int>(input_names.size()), - static_cast<int>(input_files.size()), - static_cast<int>(input_shapes.size()), - static_cast<int>(output_names.size()), - static_cast<int>(output_shapes.size())); + if (input_names.size() != input_shapes.size() || + (input_files.size() != input_shapes.size() && input_files.size() > 0) || + output_names.size() != output_shapes.size()) { + printf( + "size of input_names(%d), input_files(%d) and input_shapes(%d) " + "should be equal. sizeof output_names(%d) and output_shapes(%d) " + "should be equal.\n", + static_cast<int>(input_names.size()), + static_cast<int>(input_files.size()), + static_cast<int>(input_shapes.size()), + static_cast<int>(output_names.size()), + static_cast<int>(output_shapes.size())); abort(); } Register(); } // Run all benchmarks filtered by model_name -Status Benchmark::Run(const char *model_name, const char *framework, - const char *runtime, int run_interval, int num_threads) { +Status Benchmark::Run(const char *model_name, + const char *framework, + const char *runtime, + int run_interval, + int num_threads) { if (!all_benchmarks) return SUCCESS; // sort by model name, framework and runtime // the compare function tends to shuffle benchmarks by runtime std::sort(all_benchmarks->begin(), all_benchmarks->end(), [](const Benchmark *lhs, const Benchmark *rhs) { - return lhs->model_name_ < rhs->model_name_ - || (lhs->model_name_ == rhs->model_name_ - && (lhs->executor_->GetFramework() - < rhs->executor_->GetFramework() || ( - lhs->executor_->GetFramework() - == rhs->executor_->GetFramework() - && lhs->executor_->GetRuntime() != aibench::CPU))); + return lhs->model_name_ < rhs->model_name_ || + (lhs->model_name_ == rhs->model_name_ && + (lhs->executor_->GetFramework() < + rhs->executor_->GetFramework() || + (lhs->executor_->GetFramework() == + rhs->executor_->GetFramework() && + lhs->executor_->GetRuntime() != aibench::CPU))); }); // Internal perf regression tools depends on the output formatting, @@ -122,25 +126,25 @@ Status Benchmark::Run(const char *model_name, const char *framework, if (strcmp(runtime, "all") != 0 && ParseRuntime(runtime) != b->executor_->GetRuntime()) continue; - double init_seconds, run_seconds; - printf("benchmarking:%s,%d,%d\n", - b->model_name_.c_str(), - b->executor_->GetFramework(), - b->executor_->GetRuntime()); - Status status = b->Run(&init_seconds, &run_seconds, num_threads); + + // sleep run_interval seconds to cool off the target + printf("sleep %d\n", run_interval); + sleep(static_cast<uint32_t>(run_interval)); + + double init_ms, run_ms; + printf("benchmarking: %s,%d,%d\n", b->model_name_.c_str(), + b->executor_->GetFramework(), b->executor_->GetRuntime()); + Status status = b->Run(&init_ms, &run_ms, num_threads); if (status != SUCCESS) { res = status; + printf("benchmark failed: %s,%d,%d\n", b->model_name_.c_str(), + b->executor_->GetFramework(), b->executor_->GetRuntime()); continue; } // model_name,framework,runtime,init time,inference time - printf("benchmark:%s,%d,%d,%.3f,%.3f\n", - b->model_name_.c_str(), - b->executor_->GetFramework(), - b->executor_->GetRuntime(), - init_seconds * 1000, - run_seconds * 1000); - // sleep run_interval seconds to cool off the target - sleep(static_cast<uint32_t>(run_interval)); + printf("benchmark: %s,%d,%d,%.3f,%.3f\n", b->model_name_.c_str(), + b->executor_->GetFramework(), b->executor_->GetRuntime(), init_ms, + run_ms); } return res; } @@ -150,23 +154,28 @@ void Benchmark::Register() { all_benchmarks->push_back(this); } -Status Benchmark::Run(double *init_seconds, double *run_seconds, - int num_threads) { - static const int64_t kMinIters = 10; - static const int64_t kMaxIters = 1000000000; - static const double kMinTime = 2; - int64_t iters = kMinIters; +Status Benchmark::Run(double *init_ms, double *run_ms, int num_threads) { + static const int64_t kMinIters = 5; + static const int64_t kMaxIters = 20; + static const double kMinTime = 2000000; // microseconds + static const float quantile = 0.8; int64_t start_time, end_time; Status status; // Init the target's environment status = executor_->Init(model_file_.c_str(), num_threads); - if (status != SUCCESS) return status; + if (status != SUCCESS) { + executor_->Finish(); + return status; + } // prepare start_time = NowMicros(); status = executor_->Prepare(model_file_.c_str()); end_time = NowMicros(); - *init_seconds = (end_time - start_time) * 1e-6; - if (status != SUCCESS) return status; + *init_ms = (end_time - start_time) * 1e-3; + if (status != SUCCESS) { + executor_->Finish(); + return status; + } // warm-up std::map<std::string, BaseTensor> inputs; std::map<std::string, BaseTensor> outputs; @@ -202,28 +211,46 @@ Status Benchmark::Run(double *init_seconds, double *run_seconds, std::default_delete<float[]>()); outputs[output_names_[i]] = BaseTensor(output_shapes_[i], buffer_out); } - for (int i = 0; i < 5; ++i) { + + for (int i = 0; i < 2; ++i) { status = executor_->Run(inputs, &outputs); } - if (status != SUCCESS) return status; - while (true) { + if (status != SUCCESS) { + executor_->Finish(); + return status; + } + + std::vector<int64_t> durations; + int64_t total_duration = 0; + size_t benchmark_iters = 0; + + for (int i = 0; i < kMinIters || (total_duration < kMinTime && i < kMaxIters); + ++i) { start_time = NowMicros(); - for (int i = 0; i < iters; ++i) { - executor_->Run(inputs, &outputs); - } + status = executor_->Run(inputs, &outputs); end_time = NowMicros(); - const double seconds = (end_time - start_time) * 1e-6; - if (seconds >= kMinTime || iters >= kMaxIters) { - *run_seconds = seconds / iters; + durations.push_back(end_time - start_time); + total_duration += durations.back(); + if (status != SUCCESS) { executor_->Finish(); - return SUCCESS; + return status; } - - // Update number of iterations. - // Overshoot by 100% in an attempt to succeed the next time. - double multiplier = 2.0 * kMinTime / std::max(seconds, 1e-9); - iters = std::min<int64_t>(multiplier * iters, kMaxIters); + ++benchmark_iters; } + + std::sort(durations.begin(), durations.end()); + + size_t valid_iters = std::max( + static_cast<size_t>(1), static_cast<size_t>(benchmark_iters * quantile)); + size_t start_iter = (benchmark_iters - valid_iters) / 2; + valid_iters = std::min(valid_iters, benchmark_iters - start_iter); + total_duration = + std::accumulate(durations.begin() + start_iter, + durations.begin() + (start_iter + valid_iters), 0); + + *run_ms = total_duration * 1e-3 / valid_iters; + executor_->Finish(); + return SUCCESS; } int64_t NowMicros() { diff --git a/aibench/executors/BUILD b/aibench/executors/BUILD index fc8fd29..9510250 100644 --- a/aibench/executors/BUILD +++ b/aibench/executors/BUILD @@ -106,7 +106,6 @@ cc_library( hdrs = [ "tflite/tflite_executor.h", ], - deps = [ ":base_executor", ] + if_android_armv7([ diff --git a/aibench/executors/snpe/snpe_executor.cc b/aibench/executors/snpe/snpe_executor.cc index 91b96c9..206d1bd 100644 --- a/aibench/executors/snpe/snpe_executor.cc +++ b/aibench/executors/snpe/snpe_executor.cc @@ -71,13 +71,12 @@ Status ProcessInput(zdl::SNPE::SNPE *snpe, std::cerr << "inputs size not matched" << std::endl; return Status::RUNTIME_ERROR; } - std::unique_ptr<zdl::DlSystem::ITensor> input_tensor; for (size_t i = 0; i < input_tensor_names.size(); i++) { std::string input_name(input_tensor_names.at(i)); const auto &input_shape_opt = snpe->getInputDimensions(input_tensor_names.at(i)); const auto &input_shape = *input_shape_opt; - input_tensor = + std::unique_ptr<zdl::DlSystem::ITensor> input_tensor = zdl::SNPE::SNPEFactory::getTensorFactory().createTensor(input_shape); size_t input_size = inputs.at(input_name).size(); @@ -136,25 +135,34 @@ Status SnpeExecutor::Prepare(const char *model_name) { Status SnpeExecutor::Run(const std::map<std::string, BaseTensor> &inputs, std::map<std::string, BaseTensor> *outputs) { - Status status; + Status status = SUCCESS; + + zdl::DlSystem::TensorMap input_tensor_map; + zdl::DlSystem::TensorMap output_tensor_map; + // step1: prepare inputs - input_tensor_map_.clear(); - status = ProcessInput(snpe_.get(), inputs, &input_tensor_map_); + status = ProcessInput(snpe_.get(), inputs, &input_tensor_map); if (status != Status::SUCCESS) return status; // step2: execute - output_tensor_map_.clear(); - snpe_.get()->execute(input_tensor_map_, output_tensor_map_); + snpe_.get()->execute(input_tensor_map, output_tensor_map); // step3: process output - status = ProcessOutput(output_tensor_map_, outputs); + status = ProcessOutput(output_tensor_map, outputs); + + auto tensor_names = input_tensor_map.getTensorNames(); + for (size_t i = 0; i < tensor_names.size(); ++i) { + std::string input_name(tensor_names.at(i)); + zdl::DlSystem::ITensor* input_tensor = + input_tensor_map.getTensor(input_name.c_str()); + delete input_tensor; + } + return status; } void SnpeExecutor::Finish() { if (snpe_ != nullptr) snpe_.reset(); - input_tensor_map_.clear(); - output_tensor_map_.clear(); } } // namespace aibench diff --git a/aibench/executors/snpe/snpe_executor.h b/aibench/executors/snpe/snpe_executor.h index c06bce1..94f1b4f 100644 --- a/aibench/executors/snpe/snpe_executor.h +++ b/aibench/executors/snpe/snpe_executor.h @@ -38,8 +38,6 @@ class SnpeExecutor : public BaseExecutor { virtual void Finish(); private: std::unique_ptr<zdl::SNPE::SNPE> snpe_; - zdl::DlSystem::TensorMap input_tensor_map_; - zdl::DlSystem::TensorMap output_tensor_map_; }; } // namespace aibench diff --git a/tools/bazel.rc b/tools/bazel.rc index 267345d..93a9b8f 100644 --- a/tools/bazel.rc +++ b/tools/bazel.rc @@ -1,5 +1,8 @@ build --verbose_failures build --copt=-std=c++11 +build --copt=-O3 +build --copt=-ffast-math +build --copt=-Ofast build --strategy=CppCompile=standalone # By default, we don't distinct target and host platfroms. diff --git a/tools/benchmark.py b/tools/benchmark.py index ba607bc..ada01fa 100644 --- a/tools/benchmark.py +++ b/tools/benchmark.py @@ -185,6 +185,7 @@ def main(unused_args): all_prepare = [] all_run_avg = [] for target_abi in target_abis: + print("Prepare to run models on %s" % target_abi) if target_abi not in abi_types: print("Not supported abi: %s" % target_abi) continue diff --git a/tools/google-format.sh b/tools/google-format.sh new file mode 100755 index 0000000..6d8bf9d --- /dev/null +++ b/tools/google-format.sh @@ -0,0 +1,7 @@ +#!/bin/bash + +clang-format-3.9 \ + -style="{BasedOnStyle: google, \ + DerivePointerAlignment: false, \ + PointerAlignment: Right, \ + BinPackParameters: false}" -i $1 diff --git a/tools/model_list.py b/tools/model_list.py new file mode 100644 index 0000000..e87e2c4 --- /dev/null +++ b/tools/model_list.py @@ -0,0 +1,22 @@ +# Copyright 2018 Xiaomi, Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +BENCHMARK_MODELS = ( + "MobileNetV1", + "MobileNetV2", + "SqueezeNetV11", + "InceptionV3", + "VGG16" +) diff --git a/tools/power.sh b/tools/power.sh new file mode 100755 index 0000000..e9b34c5 --- /dev/null +++ b/tools/power.sh @@ -0,0 +1,111 @@ +SERIALNO=$1 +PLATFORM=$2 +ADB="adb -s $SERIALNO" + +echo "Adjust power to performance mode on $SERIALNO, $PLATFORM" + +$ADB root || exit 1 +$ADB wait-for-device +$ADB remount +$ADB wait-for-device + +$ADB shell "stop thermald" +$ADB shell "stop mpdecision" +# disable thermal +$ADB shell "stop thermal-engine && stop thermal-hal-1-0" +# stop perflock HAL +$ADB shell "stop perf-hal-1-0" + +# boost cpu freq +$ADB shell "echo 1 > /sys/devices/system/cpu/cpu0/online" +$ADB shell "echo 1 > /sys/devices/system/cpu/cpu1/online" +$ADB shell "echo 1 > /sys/devices/system/cpu/cpu2/online" +$ADB shell "echo 1 > /sys/devices/system/cpu/cpu3/online" +$ADB shell "echo 1 > /sys/devices/system/cpu/cpu4/online" +$ADB shell "echo 1 > /sys/devices/system/cpu/cpu5/online" +$ADB shell "echo 1 > /sys/devices/system/cpu/cpu6/online" +$ADB shell "echo 1 > /sys/devices/system/cpu/cpu7/online" + +$ADB shell "echo performance > /sys/devices/system/cpu/cpu0/cpufreq/scaling_governor" +$ADB shell "echo performance > /sys/devices/system/cpu/cpu1/cpufreq/scaling_governor" +$ADB shell "echo performance > /sys/devices/system/cpu/cpu2/cpufreq/scaling_governor" +$ADB shell "echo performance > /sys/devices/system/cpu/cpu3/cpufreq/scaling_governor" +$ADB shell "echo performance > /sys/devices/system/cpu/cpu4/cpufreq/scaling_governor" +$ADB shell "echo performance > /sys/devices/system/cpu/cpu5/cpufreq/scaling_governor" +$ADB shell "echo performance > /sys/devices/system/cpu/cpu6/cpufreq/scaling_governor" +$ADB shell "echo performance > /sys/devices/system/cpu/cpu7/cpufreq/scaling_governor" + +# bw vote max +$ADB shell "echo performance > /sys/class/devfreq/1d84000.ufshc/governor" +$ADB shell "echo performance > /sys/class/devfreq/5000000.qcom,kgsl-3d0/governor" +$ADB shell "echo performance > /sys/class/devfreq/aa00000.qcom,vidc:arm9_bus_ddr/governor" +$ADB shell "echo performance > /sys/class/devfreq/aa00000.qcom,vidc:bus_cnoc/governor" +$ADB shell "echo performance > /sys/class/devfreq/aa00000.qcom,vidc:venus_bus_ddr/governor" +$ADB shell "echo performance > /sys/class/devfreq/aa00000.qcom,vidc:venus_bus_llcc/governor" +$ADB shell "echo performance > /sys/class/devfreq/soc:qcom,cpubw/governor" +$ADB shell "echo performance > /sys/class/devfreq/soc:qcom,gpubw/governor" +$ADB shell "echo performance > /sys/class/devfreq/soc:qcom,kgsl-busmon/governor" +$ADB shell "echo performance > /sys/class/devfreq/soc:qcom,l3-cdsp/governor" +$ADB shell "echo performance > /sys/class/devfreq/soc:qcom,l3-cpu0/governor" +$ADB shell "echo performance > /sys/class/devfreq/soc:qcom,l3-cpu4/governor" +$ADB shell "echo performance > /sys/class/devfreq/soc:qcom,llccbw/governor" +$ADB shell "echo performance > /sys/class/devfreq/soc:qcom,memlat-cpu0/governor" +$ADB shell "echo performance > /sys/class/devfreq/soc:qcom,memlat-cpu4/governor" +$ADB shell "echo performance > /sys/class/devfreq/soc:qcom,mincpubw/governor" +$ADB shell "echo performance > /sys/class/devfreq/soc:qcom,snoc_cnoc_keepalive/governor" + +# boost gpu freq +$ADB shell "echo 0 > /sys/class/kgsl/kgsl-3d0/min_pwrlevel" +$ADB shell "echo 0 > /sys/class/kgsl/kgsl-3d0/max_pwrlevel" +$ADB shell "echo performance > /sys/class/kgsl/kgsl-3d0/devfreq/governor" +$ADB shell "cat /sys/class/kgsl/kgsl-3d0/gpuclk" +$ADB shell "echo 1000000 > /sys/class/kgsl/kgsl-3d0/idle_timer" +$ADB shell "echo 1 > /d/dri/0/debug/core_perf/perf_mode" + + +$ADB shell "echo 4 > /sys/devices/system/cpu/cpu0/core_ctl/min_cpus" +$ADB shell "echo 4 > /sys/devices/system/cpu/cpu4/core_ctl/min_cpus" +$ADB shell "echo 35 > /proc/sys/kernel/sched_downmigrate && echo 55 > /proc/sys/kernel/sched_upmigrate" +$ADB shell "echo 512 > /sys/block/sda/queue/nr_requests && echo 1024 > /sys/block/sda/queue/read_ahead_kb" + +#$ADB shell "echo 100 > /proc/sys/kernel/sched_cfs_boost" +$ADB shell "echo 100 > /dev/stune/top-app/schedtune.boost" +$ADB shell "echo 1 > /dev/stune/top-app/schedtune.prefer_idle" + +# disable all level LPM by sysfs node +$ADB shell "echo Y > /sys/module/lpm_levels/parameters/sleep_disabled" +$ADB shell "echo N > /sys/module/lpm_levels/L3/cpu0/pc/idle_enabled" +$ADB shell "echo N > /sys/module/lpm_levels/L3/cpu0/rail-pc/idle_enabled" +$ADB shell "echo N > /sys/module/lpm_levels/L3/cpu1/pc/idle_enabled" +$ADB shell "echo N > /sys/module/lpm_levels/L3/cpu1/rail-pc/idle_enabled" +$ADB shell "echo N > /sys/module/lpm_levels/L3/cpu2/pc/idle_enabled" +$ADB shell "echo N > /sys/module/lpm_levels/L3/cpu2/rail-pc/idle_enabled" +$ADB shell "echo N > /sys/module/lpm_levels/L3/cpu3/pc/idle_enabled" +$ADB shell "echo N > /sys/module/lpm_levels/L3/cpu3/rail-pc/idle_enabled" +$ADB shell "echo N > /sys/module/lpm_levels/L3/cpu4/pc/idle_enabled" +$ADB shell "echo N > /sys/module/lpm_levels/L3/cpu4/rail-pc/idle_enabled" +$ADB shell "echo N > /sys/module/lpm_levels/L3/cpu5/pc/idle_enabled" +$ADB shell "echo N > /sys/module/lpm_levels/L3/cpu5/rail-pc/idle_enabled" +$ADB shell "echo N > /sys/module/lpm_levels/L3/cpu6/pc/idle_enabled" +$ADB shell "echo N > /sys/module/lpm_levels/L3/cpu6/rail-pc/idle_enabled" +$ADB shell "echo N > /sys/module/lpm_levels/L3/cpu7/pc/idle_enabled" +$ADB shell "echo N > /sys/module/lpm_levels/L3/cpu7/rail-pc/idle_enabled" +$ADB shell "echo N > /sys/module/lpm_levels/L3/l3-wfi/idle_enabled" +$ADB shell "echo N > /sys/module/lpm_levels/L3/llcc-off/idle_enabled" +if [ "$PLATFORM" == "sdm660" ]; then + $ADB shell "echo N > /sys/module/lpm_levels/system/perf/cpu4/pc/idle_enabled" + $ADB shell "echo N > /sys/module/lpm_levels/system/perf/cpu5/pc/idle_enabled" + $ADB shell "echo N > /sys/module/lpm_levels/system/perf/cpu6/pc/idle_enabled" + $ADB shell "echo N > /sys/module/lpm_levels/system/perf/cpu7/pc/idle_enabled" +fi + +# set ddr config. +$ADB shell "echo 100 > /proc/sys/kernel/sched_initial_task_util" +if [ "$PLATFORM" == "sdm660" ]; then + $ADB shell "echo 100 > /proc/sys/kernel/sched_init_task_load" # for 660 + $ADB shell "echo 1 >/sys/kernel/debug/msm-bus-dbg/shell-client/mas" + $ADB shell "echo 512 > /sys/kernel/debug/msm-bus-dbg/shell-client/slv" + $ADB shell "echo 28864000000 > /sys/kernel/debug/msm-bus-dbg/shell-client/ab" + $ADB shell "echo 28864000000 > /sys/kernel/debug/msm-bus-dbg/shell-client/ib" + $ADB shell "echo 1 > /sys/kernel/debug/msm-bus-dbg/shell-client/update_request" +fi \ No newline at end of file diff --git a/tools/sh_commands.py b/tools/sh_commands.py index 237257f..425a8c8 100644 --- a/tools/sh_commands.py +++ b/tools/sh_commands.py @@ -19,6 +19,8 @@ import sh import urllib +from model_list import BENCHMARK_MODELS + FRAMEWORKS = ( "MACE", @@ -309,6 +311,13 @@ def adb_run(abi, print("Run on device: %s, %s, %s" % (serialno, props["ro.board.platform"], props["ro.product.model"])) + try: + sh.bash("tools/power.sh", + serialno, props["ro.board.platform"], + _fg=True) + except Exception, e: + print("Config power exception %s" % str(e)) + sh.adb("-s", serialno, "shell", "mkdir -p %s" % device_bin_path) sh.adb("-s", serialno, "shell", "rm -rf %s" % os.path.join(device_bin_path, "interior")) @@ -326,11 +335,17 @@ def adb_run(abi, cmd = "cd %s; ADSP_LIBRARY_PATH='.;/system/lib/rfsa/adsp;/system" \ "/vendor/lib/rfsa/adsp;/dsp'; LD_LIBRARY_PATH=. " \ "./model_benchmark" % device_bin_path - if set(frameworks) == set(FRAMEWORKS): - frameworks = ["all"] - for framework in frameworks: - for runtime in runtimes: + if frameworks == ['all']: + frameworks = FRAMEWORKS + if runtimes == ['all']: + runtimes = RUNTIMES + if model_names == ['all']: + model_names = BENCHMARK_MODELS + + for runtime in runtimes: + for framework in frameworks: for model_name in model_names: + print(framework, runtime, model_name) args = "--run_interval=%d --num_threads=%d " \ "--framework=%s --runtime=%s --model_name=%s " \ "--product_soc=%s.%s" % \