Skip to content

Commit

Permalink
VITIS-12131 and VITIS-11466 add command chaining benchmarks (Xilinx#8238)
Browse files Browse the repository at this point in the history

* cmd-chain benchmarks

Signed-off-by: AShivangi <[email protected]>

* build fix

Signed-off-by: AShivangi <[email protected]>

* build fix

Signed-off-by: AShivangi <[email protected]>

---------

Signed-off-by: AShivangi <[email protected]>
  • Loading branch information
AShivangi authored Jun 13, 2024
1 parent 420a82e commit 8770b19
Show file tree
Hide file tree
Showing 6 changed files with 346 additions and 2 deletions.
152 changes: 152 additions & 0 deletions src/runtime_src/core/tools/common/tests/TestCmdChainLatency.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,152 @@
// SPDX-License-Identifier: Apache-2.0
// Copyright (C) 2024 Advanced Micro Devices, Inc. All rights reserved.

// ------ I N C L U D E F I L E S -------------------------------------------
// Local - Include Files
#include "TestCmdChainLatency.h"
#include "tools/common/XBUtilities.h"
#include "xrt/xrt_bo.h"
#include "xrt/xrt_device.h"
#include "xrt/xrt_hw_context.h"
#include "xrt/xrt_kernel.h"
#include <experimental/xrt_kernel.h>
namespace XBU = XBUtilities;

#include <filesystem>

// Value reported as the instruction size (in bytes) in the verbose log.
static constexpr size_t buffer_size{20};
// Number of times the complete runlist is executed and waited on.
static constexpr int itr_count{10};
// Number of runs created and added to the runlist.
static constexpr int run_count{1000};

// ----- C L A S S M E T H O D S -------------------------------------------
// Passes the test name ("cmd-chain-latency") and its human readable
// description on to the TestRunner base class.
TestCmdChainLatency::TestCmdChainLatency()
  : TestRunner("cmd-chain-latency",
               "Run end-to-end latency test using command chaining")
{
}

/**
 * Measure the average end-to-end latency of one command when commands are
 * submitted in bulk through an xrt::runlist.
 *
 * The validate xclbin is located and loaded, the first kernel whose name
 * starts with "DPU" is opened in a hardware context, `run_count` runs are
 * created and added to a runlist, and the runlist is executed and waited on
 * `itr_count` times while being timed.
 *
 * @param dev  Device to run the benchmark on.
 * @return     Property tree holding the test header, log entries and a
 *             "status" entry. The status is test_token_failed when the xclbin
 *             cannot be loaded, no DPU kernel exists, or executing/waiting on
 *             the runlist throws; otherwise it is test_token_passed.
 */
boost::property_tree::ptree
TestCmdChainLatency::run(std::shared_ptr<xrt_core::device> dev)
{
  boost::property_tree::ptree ptree = get_test_header();

  const auto xclbin_name = xrt_core::device_query<xrt_core::query::xclbin_name>(dev, xrt_core::query::xclbin_name::type::validate);
  auto xclbin_path = findPlatformFile(xclbin_name, ptree);
  // NOTE(review): no status is set on this path; presumably findPlatformFile
  // records the failure in ptree itself -- confirm.
  if (!std::filesystem::exists(xclbin_path))
    return ptree;

  logger(ptree, "Xclbin", xclbin_path);

  xrt::xclbin xclbin;
  try {
    xclbin = xrt::xclbin(xclbin_path);
  }
  catch (const std::runtime_error& ex) {
    logger(ptree, "Error", ex.what());
    ptree.put("status", test_token_failed);
    return ptree;
  }

  // Determine The DPU Kernel Name
  auto xkernels = xclbin.get_kernels();

  auto itr = std::find_if(xkernels.begin(), xkernels.end(), [](xrt::xclbin::kernel& k) {
    auto name = k.get_name();
    return name.rfind("DPU", 0) == 0; // Starts with "DPU"
  });

  if (itr == xkernels.end()) {
    logger(ptree, "Error", "No kernel with `DPU` found in the xclbin");
    ptree.put("status", test_token_failed);
    return ptree;
  }

  xrt::xclbin::kernel xkernel = *itr;
  auto kernelName = xkernel.get_name();
  if (XBU::getVerbose())
    logger(ptree, "Details", boost::str(boost::format("Kernel name is '%s'") % kernelName));

  auto working_dev = xrt::device(dev);
  working_dev.register_xclbin(xclbin);
  xrt::hw_context hwctx{working_dev, xclbin.get_uuid()};
  xrt::kernel testker{hwctx, kernelName};

  // Pick the first IP of type `ps`; its argument list drives how the
  // arguments of every run are populated below.
  // NOTE(review): if the xclbin has no `ps` IP, `cu` stays default
  // constructed -- confirm that get_args() is well defined in that case.
  xrt::xclbin::ip cu;
  for (const auto& ip : xclbin.get_ips()) {
    if (ip.get_type() != xrt::xclbin::ip::ip_type::ps)
      continue;

    cu = ip;
    break;
  }

  // The argument description does not change between runs, query it once.
  const auto cu_args = cu.get_args();

  // create specified number of runs and populate with arguments
  std::vector<xrt::bo> global_args; // keeps the buffer objects referenced for the duration of the test
  std::vector<xrt::run> runs;
  runs.reserve(run_count);

  for (int i = 0; i < run_count; ++i) {
    auto run = xrt::run(testker);
    for (const auto& arg : cu_args) {
      auto arg_idx = static_cast<int>(arg.get_index());
      if (arg.get_host_type() == "uint64_t")
        run.set_arg(arg_idx, static_cast<uint64_t>(1));
      else if (arg.get_host_type() == "uint32_t")
        run.set_arg(arg_idx, static_cast<uint32_t>(1));
      else if (arg.get_host_type().find('*') != std::string::npos) {
        // Pointer argument: back it with a buffer object of the declared size
        xrt::bo bo;

        if (arg.get_name() == "instruct")
          bo = xrt::bo(hwctx, arg.get_size(), xrt::bo::flags::cacheable, testker.group_id(arg_idx));
        else
          bo = xrt::bo(working_dev, arg.get_size(), xrt::bo::flags::host_only, testker.group_id(arg_idx));

        bo.sync(XCL_BO_SYNC_BO_TO_DEVICE);
        global_args.push_back(bo);
        run.set_arg(arg_idx, bo);
      }
    }
    runs.push_back(std::move(run));
  }

  // Log (both values are integers, hence %d)
  if (XBU::getVerbose()) {
    logger(ptree, "Details", boost::str(boost::format("Instruction size: '%d' bytes") % buffer_size));
    logger(ptree, "Details", boost::str(boost::format("No. of commands: '%d'") % (itr_count*run_count)));
  }

  // Start via runlist
  xrt::runlist runlist{hwctx};
  for (auto& run : runs)
    runlist.add(run);

  const auto start = std::chrono::high_resolution_clock::now();
  for (int i = 0; i < itr_count; ++i) {
    try {
      runlist.execute();
      runlist.wait();
    }
    catch (const std::exception& ex) {
      // Stop right away: continuing would let the final "passed" status
      // overwrite the failure and report a latency for a broken run.
      logger(ptree, "Error", ex.what());
      ptree.put("status", test_token_failed);
      return ptree;
    }
  }
  const auto end = std::chrono::high_resolution_clock::now();
  const auto elapsedSecs = std::chrono::duration_cast<std::chrono::duration<float>>(end-start).count();

  // Calculate end-to-end latency of one job execution
  const float latency = (elapsedSecs / (itr_count*run_count)) * 1000000; //convert s to us
  logger(ptree, "Details", boost::str(boost::format("Average latency: '%.1f' us") % latency));
  ptree.put("status", test_token_passed);
  return ptree;
}
18 changes: 18 additions & 0 deletions src/runtime_src/core/tools/common/tests/TestCmdChainLatency.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
// SPDX-License-Identifier: Apache-2.0
// Copyright (C) 2024 Advanced Micro Devices, Inc. All rights reserved.

#ifndef __TestCmdChainLatency_h_
#define __TestCmdChainLatency_h_

#include "tools/common/TestRunner.h"
#include "xrt/xrt_device.h"

/**
 * Test runner for the "cmd-chain-latency" validate test, which reports the
 * average end-to-end latency of commands submitted through a runlist.
 */
class TestCmdChainLatency : public TestRunner {
public:
  TestCmdChainLatency();

  // Runs the benchmark on `dev` and returns the populated result tree.
  boost::property_tree::ptree run(std::shared_ptr<xrt_core::device> dev);
};

#endif
152 changes: 152 additions & 0 deletions src/runtime_src/core/tools/common/tests/TestCmdChainThroughput.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,152 @@
// SPDX-License-Identifier: Apache-2.0
// Copyright (C) 2024 Advanced Micro Devices, Inc. All rights reserved.

// ------ I N C L U D E F I L E S -------------------------------------------
// Local - Include Files
#include "TestCmdChainThroughput.h"
#include "tools/common/XBUtilities.h"
#include "xrt/xrt_bo.h"
#include "xrt/xrt_device.h"
#include "xrt/xrt_hw_context.h"
#include "xrt/xrt_kernel.h"
#include <experimental/xrt_kernel.h>
namespace XBU = XBUtilities;

#include <filesystem>

// Value reported as the instruction size (in bytes) in the verbose log.
static constexpr size_t buffer_size{20};
// Number of times the complete runlist is executed and waited on.
static constexpr int itr_count{10};
// Number of runs created and added to the runlist.
static constexpr int run_count{1000};

// ----- C L A S S M E T H O D S -------------------------------------------
// Passes the test name ("cmd-chain-throughput") and its human readable
// description on to the TestRunner base class.
TestCmdChainThroughput::TestCmdChainThroughput()
  : TestRunner("cmd-chain-throughput",
               "Run end-to-end throughput test using command chaining")
{
}

/**
 * Measure the average command throughput when commands are submitted in
 * bulk through an xrt::runlist.
 *
 * The validate xclbin is located and loaded, the first kernel whose name
 * starts with "DPU" is opened in a hardware context, `run_count` runs are
 * created and added to a runlist, and the runlist is executed and waited on
 * `itr_count` times while being timed.
 *
 * @param dev  Device to run the benchmark on.
 * @return     Property tree holding the test header, log entries and a
 *             "status" entry. The status is test_token_failed when the xclbin
 *             cannot be loaded, no DPU kernel exists, or executing/waiting on
 *             the runlist throws; otherwise it is test_token_passed.
 */
boost::property_tree::ptree
TestCmdChainThroughput::run(std::shared_ptr<xrt_core::device> dev)
{
  boost::property_tree::ptree ptree = get_test_header();

  const auto xclbin_name = xrt_core::device_query<xrt_core::query::xclbin_name>(dev, xrt_core::query::xclbin_name::type::validate);
  auto xclbin_path = findPlatformFile(xclbin_name, ptree);
  // NOTE(review): no status is set on this path; presumably findPlatformFile
  // records the failure in ptree itself -- confirm.
  if (!std::filesystem::exists(xclbin_path))
    return ptree;

  logger(ptree, "Xclbin", xclbin_path);

  xrt::xclbin xclbin;
  try {
    xclbin = xrt::xclbin(xclbin_path);
  }
  catch (const std::runtime_error& ex) {
    logger(ptree, "Error", ex.what());
    ptree.put("status", test_token_failed);
    return ptree;
  }

  // Determine The DPU Kernel Name
  auto xkernels = xclbin.get_kernels();

  auto itr = std::find_if(xkernels.begin(), xkernels.end(), [](xrt::xclbin::kernel& k) {
    auto name = k.get_name();
    return name.rfind("DPU", 0) == 0; // Starts with "DPU"
  });

  if (itr == xkernels.end()) {
    logger(ptree, "Error", "No kernel with `DPU` found in the xclbin");
    ptree.put("status", test_token_failed);
    return ptree;
  }

  xrt::xclbin::kernel xkernel = *itr;
  auto kernelName = xkernel.get_name();
  if (XBU::getVerbose())
    logger(ptree, "Details", boost::str(boost::format("Kernel name is '%s'") % kernelName));

  auto working_dev = xrt::device(dev);
  working_dev.register_xclbin(xclbin);
  xrt::hw_context hwctx{working_dev, xclbin.get_uuid()};
  xrt::kernel testker{hwctx, kernelName};

  // Pick the first IP of type `ps`; its argument list drives how the
  // arguments of every run are populated below.
  // NOTE(review): if the xclbin has no `ps` IP, `cu` stays default
  // constructed -- confirm that get_args() is well defined in that case.
  xrt::xclbin::ip cu;
  for (const auto& ip : xclbin.get_ips()) {
    if (ip.get_type() != xrt::xclbin::ip::ip_type::ps)
      continue;

    cu = ip;
    break;
  }

  // The argument description does not change between runs, query it once.
  const auto cu_args = cu.get_args();

  // create specified number of runs and populate with arguments
  std::vector<xrt::bo> global_args; // keeps the buffer objects referenced for the duration of the test
  std::vector<xrt::run> runs;
  runs.reserve(run_count);

  for (int i = 0; i < run_count; ++i) {
    auto run = xrt::run(testker);
    for (const auto& arg : cu_args) {
      auto arg_idx = static_cast<int>(arg.get_index());
      if (arg.get_host_type() == "uint64_t")
        run.set_arg(arg_idx, static_cast<uint64_t>(1));
      else if (arg.get_host_type() == "uint32_t")
        run.set_arg(arg_idx, static_cast<uint32_t>(1));
      else if (arg.get_host_type().find('*') != std::string::npos) {
        // Pointer argument: back it with a buffer object of the declared size
        xrt::bo bo;

        if (arg.get_name() == "instruct")
          bo = xrt::bo(hwctx, arg.get_size(), xrt::bo::flags::cacheable, testker.group_id(arg_idx));
        else
          bo = xrt::bo(working_dev, arg.get_size(), xrt::bo::flags::host_only, testker.group_id(arg_idx));

        bo.sync(XCL_BO_SYNC_BO_TO_DEVICE);
        global_args.push_back(bo);
        run.set_arg(arg_idx, bo);
      }
    }
    runs.push_back(std::move(run));
  }

  // Log (both values are integers, hence %d)
  if (XBU::getVerbose()) {
    logger(ptree, "Details", boost::str(boost::format("Instruction size: '%d' bytes") % buffer_size));
    logger(ptree, "Details", boost::str(boost::format("No. of commands: '%d'") % (itr_count*run_count)));
  }

  // Start via runlist
  xrt::runlist runlist{hwctx};
  for (auto& run : runs)
    runlist.add(run);

  const auto start = std::chrono::high_resolution_clock::now();
  for (int i = 0; i < itr_count; ++i) {
    try {
      runlist.execute();
      runlist.wait();
    }
    catch (const std::exception& ex) {
      // Stop right away: continuing would let the final "passed" status
      // overwrite the failure and report a throughput for a broken run.
      logger(ptree, "Error", ex.what());
      ptree.put("status", test_token_failed);
      return ptree;
    }
  }
  const auto end = std::chrono::high_resolution_clock::now();
  const auto elapsedSecs = std::chrono::duration_cast<std::chrono::duration<float>>(end-start).count();

  // Compute the throughput
  const double throughput = ((itr_count*run_count) / elapsedSecs);
  logger(ptree, "Details", boost::str(boost::format("Average throughput: '%.1f' ops") % throughput));
  ptree.put("status", test_token_passed);
  return ptree;
}
18 changes: 18 additions & 0 deletions src/runtime_src/core/tools/common/tests/TestCmdChainThroughput.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
// SPDX-License-Identifier: Apache-2.0
// Copyright (C) 2024 Advanced Micro Devices, Inc. All rights reserved.

#ifndef __TestCmdChainThroughput_h_
#define __TestCmdChainThroughput_h_

#include "tools/common/TestRunner.h"
#include "xrt/xrt_device.h"

/**
 * Test runner for the "cmd-chain-throughput" validate test, which reports
 * the average throughput of commands submitted through a runlist.
 */
class TestCmdChainThroughput : public TestRunner {
public:
  TestCmdChainThroughput();

  // Runs the benchmark on `dev` and returns the populated result tree.
  boost::property_tree::ptree run(std::shared_ptr<xrt_core::device> dev);
};

#endif
6 changes: 5 additions & 1 deletion src/runtime_src/core/tools/xbutil2/SubCmdValidate.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,8 @@
#include "tools/common/tests/TestGemm.h"
#include "tools/common/tests/TestNPUThroughput.h"
#include "tools/common/tests/TestNPULatency.h"
#include "tools/common/tests/TestCmdChainLatency.h"
#include "tools/common/tests/TestCmdChainThroughput.h"
namespace XBU = XBUtilities;

// 3rd Party Library - Include Files
Expand Down Expand Up @@ -108,7 +110,9 @@ std::vector<std::shared_ptr<TestRunner>> testSuite = {
std::make_shared<TestTCTAllColumn>(),
std::make_shared<TestGemm>(),
std::make_shared<TestNPUThroughput>(),
std::make_shared<TestNPULatency>()
std::make_shared<TestNPULatency>(),
std::make_shared<TestCmdChainLatency>(),
std::make_shared<TestCmdChainThroughput>()
};

/*
Expand Down
2 changes: 1 addition & 1 deletion src/runtime_src/core/tools/xbutil2/xbutil.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ R"(
}]
},{
"validate": [{
"test": ["latency", "throughput", "df-bw", "tct-one-col", "tct-all-col", "gemm"]
"test": ["latency", "throughput", "cmd-chain-latency", "cmd-chain-throughput", "df-bw", "tct-one-col", "tct-all-col", "gemm"]
}]
}]
}]
Expand Down

0 comments on commit 8770b19

Please sign in to comment.