Skip to content

Commit

Permalink
Support ERT_EXEC_WRITE with ERT (Xilinx#1672)
Browse files Browse the repository at this point in the history
  • Loading branch information
stsoe authored Jul 17, 2019
1 parent cae7435 commit 311ef77
Show file tree
Hide file tree
Showing 8 changed files with 162 additions and 51 deletions.
11 changes: 8 additions & 3 deletions src/include/1_2/CL/cl_ext_xilinx.h
Original file line number Diff line number Diff line change
Expand Up @@ -412,6 +412,10 @@ xclGetXrtDevice(cl_device_id device,
* @type: cl_uint
* @return: XRT scheduler index of compute unit
*
* @XCL_COMPUTE_UNIT_BASE_ADDRESS:
* @type: size_t
* @return: Base address of compute unit
*
* @XCL_COMPUTE_UNIT_CONNECTIONS:
* @type: cl_ulong
* @return: Memory connection for each compute unit argument.
Expand All @@ -427,9 +431,10 @@ xclGetComputeUnitInfo(cl_kernel kernel,
void * param_value,
size_t * param_value_size_ret );

#define XCL_COMPUTE_UNIT_NAME 0x1320 // name of CU
#define XCL_COMPUTE_UNIT_INDEX 0x1321 // scheduler index of CU
#define XCL_COMPUTE_UNIT_CONNECTIONS 0x1322 // connectivity
#define XCL_COMPUTE_UNIT_NAME 0x1320 // name of CU
#define XCL_COMPUTE_UNIT_INDEX 0x1321 // scheduler index of CU
#define XCL_COMPUTE_UNIT_CONNECTIONS 0x1322 // connectivity
#define XCL_COMPUTE_UNIT_BASE_ADDRESS 0x1323 // base address

/*
Host Accessible Program Scope Globals
Expand Down
63 changes: 48 additions & 15 deletions src/runtime_src/ert/scheduler/scheduler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -303,6 +303,9 @@ struct slot_info
// free [0x4]: the command slot is free
value_type header_value = 0;

// Cache opcode
value_type opcode = 0;

// Bitset of CUs that can be used by current command in slot
bitset_type cus;

Expand Down Expand Up @@ -357,6 +360,15 @@ opcode(value_type header_value)
{
return (header_value >> 23) & 0x1F;
}

/**
 * Extract the command type field from a command header word.
 *
 * The type occupies header bits [31:28]; the result is that field
 * shifted down to bits [3:0].
 */
inline value_type
cmd_type(value_type header_value)
{
  constexpr unsigned type_shift = 28;   // lowest bit of the type field
  constexpr value_type type_mask = 0xF; // field is 4 bits wide
  return (header_value >> type_shift) & type_mask;
}

/**
* Command header [22:12] is payload size
Expand Down Expand Up @@ -642,13 +654,30 @@ notify_host(size_type cmd_idx)
inline void
configure_cu(addr_type cu_addr, addr_type regmap_addr, size_type regmap_size)
{
  // Copies the register map word by word from regmap_addr to the CU at
  // cu_addr and then starts the CU.  regmap_size is the number of
  // 32 bit words in the register map; word index idx lives at byte
  // offset (idx << 2) in both source and destination.

  // write register map, starting at base + 0x10
  // 0x4, 0x8, 0xc used for interrupt, which is initialized in setup,
  // so the copy must begin at word 4 and never touch 0xc
  for (size_type idx = 4; idx < regmap_size; ++idx)
    write_reg(cu_addr + (idx << 2), read_reg(regmap_addr + (idx << 2)));

  // start kernel at base + 0x0
  write_reg(cu_addr, 0x1);
}

/**
 * Configure CU with address value pairs (out-of-order)
 *
 * Starting at word 4 (byte offset 0x10) the register map is read as a
 * sequence of {address, value} word pairs.  Each value is written to
 * the address taken from the pair; the address is used as-is and is
 * not added to cu_addr.  After all pairs are written the CU is
 * started by writing 0x1 to cu_addr.
 *
 * @cu_addr:     base address of the CU to start
 * @regmap_addr: address of the first word of the register map
 * @regmap_size: number of 32 bit words in the register map
 */
inline void
configure_cu_ooo(addr_type cu_addr, addr_type regmap_addr, size_type regmap_size)
{
  // write register map addr, value pairs starting at 0x10
  // Both words of a pair must be inside the register map; an
  // incomplete trailing pair is ignored rather than read past the end.
  // NOTE(review): target addresses come straight from the command
  // payload and are not range checked here - confirm the producer of
  // the command validates them.
  for (size_type idx = 4; idx + 1 < regmap_size; idx += 2) {
    addr_type offset = read_reg(regmap_addr + (idx << 2));
    value_type value = read_reg(regmap_addr + ((idx + 1) << 2));
    write_reg(offset, value);
  }

  // start kernel at base + 0x0 (exactly once)
  write_reg(cu_addr, 0x1);
}

/**
Expand Down Expand Up @@ -711,16 +740,18 @@ start_cu(size_type slot_idx)
if (cus.test(cu_idx) && !cu_status.test(cu_idx)) {
ERT_DEBUGF("start_cu cu(%d) for slot_idx(%d)\n",cu_idx,slot_idx);
ERT_ASSERT(read_reg(cu_idx_to_addr(cu_idx))==AP_IDLE,"cu not ready");
// cudma in 5.1 DSAs has a bug and supports at most 127 word copy
// excluding the 4 control words
if (cu_dma_enabled && (cu_dma_52 || regmap_size(slot.header_value)<(127+4))) {
// hardware transfer and start

if (slot.opcode==ERT_EXEC_WRITE)
// Out of order configuration
configure_cu_ooo(cu_idx_to_addr(cu_idx),slot.regmap_addr,slot.regmap_size);
else if (cu_dma_enabled && (cu_dma_52 || regmap_size(slot.header_value)<(127+4)))
// Use CUDMA, accounting for 5.1 DSAs that have a bug and support
// at most a 127 word copy, excluding the 4 control words
configure_cu_dma(cu_idx,slot_idx,slot.slot_addr);
}
else {
else
// manually configure and start cu
configure_cu(cu_idx_to_addr(cu_idx),slot.regmap_addr,slot.regmap_size);
}

cu_status.toggle(cu_idx); // toggle cu status bit, it is now busy
set_cu_info(cu_idx,slot_idx); // record which slot cu associated with
return cu_idx;
Expand Down Expand Up @@ -993,9 +1024,11 @@ new_to_queued(size_type slot_idx)
auto& slot = command_slots[slot_idx];
ERT_ASSERT((slot.header_value & 0xF)==0x1,"slot is not new\n");

auto opc = opcode(slot.header_value);
ERT_DEBUGF("slot_idx(%d) opcode = %d\n",slot_idx,opc);
if (opc!=ERT_START_KERNEL) { // Non performance critical command
auto cmt = cmd_type(slot.header_value);
auto opc = slot.opcode = opcode(slot.header_value);
ERT_DEBUGF("slot_idx(%d) type(%d) opcode(%d)\n",slot_idx,cmt,opc);

if (cmt != ERT_CU) {
process_special_command(opc,slot_idx);
return false;
}
Expand Down
3 changes: 3 additions & 0 deletions src/runtime_src/xocl/api/xlnx/xclGetComputeUnitInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,9 @@ xclGetComputeUnitInfo(cl_kernel kernel,
case XCL_COMPUTE_UNIT_INDEX:
buffer.as<cl_uint>() = cu->get_index();
break;
case XCL_COMPUTE_UNIT_BASE_ADDRESS:
buffer.as<size_t>() = cu->get_base_addr();
break;
case XCL_COMPUTE_UNIT_CONNECTIONS: {
int argidx = 0;
for (auto& arg : symbol->arguments) {
Expand Down
5 changes: 5 additions & 0 deletions src/runtime_src/xrt/scheduler/command.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,11 @@ void
command::
execute()
{
// command objects can be reused outside constructor
// reset state
auto epacket = get_ert_cmd<ert_packet*>();
epacket->state = ERT_CMD_STATE_NEW;

m_done=false;
xrt::scheduler::schedule(get_ptr());
}
Expand Down
7 changes: 7 additions & 0 deletions src/runtime_src/xrt/xrt++/xrtexec.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,13 @@ completed() const
return m_impl->completed();
}

// Return the state currently recorded in the underlying ert command
// packet, converted to ert_cmd_state.
ert_cmd_state
command::
state() const
{
  const auto raw_state = m_impl->ecmd->state;
  return static_cast<ert_cmd_state>(raw_state);
}

exec_write_command::
exec_write_command(xrt_device* device)
: command(device,ERT_EXEC_WRITE)
Expand Down
3 changes: 3 additions & 0 deletions src/runtime_src/xrt/xrt++/xrtexec.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,9 @@ class command

bool
completed() const;

ert_cmd_state
state() const;
};

/**
Expand Down
119 changes: 88 additions & 31 deletions tests/unit_test/experimental/exec_write/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,10 @@
#include <iostream>
#include <thread>
#include <chrono>
#include <mutex>
#include <atomic>
#include <cstdarg>
#include <cstdio>
#include <cassert>

#define LENGTH (20)

Expand All @@ -78,44 +82,98 @@ throw_if_error(cl_int errcode, const std::string& msg)
throw_if_error(errcode,msg.c_str());
}

namespace debug {

// Single mutex shared by every debug::printf call
static std::mutex s_debug_mutex;

// Scoped guard; holds s_debug_mutex from construction to destruction
struct lock
{
  std::lock_guard<std::mutex> m_lk;

  lock()
    : m_lk(s_debug_mutex)
  {}
};

// printf style output to stdout.  The mutex is held for the duration
// of the call so that output from concurrent callers is not interleaved.
static void
printf(const char* format,...)
{
  lock guard;

  va_list ap;
  va_start(ap,format);
  std::vprintf(format,ap);
  va_end(ap);
}

}

// Configure number of jobs to run
static const size_t num_jobs = 10;

// Configure how long to iterate the jobs
static const size_t seconds = 15;
static const size_t mseconds = 1000;

// Flag to stop job rescheduling. Is set to true by the main thread
// once the configured run time has elapsed, while every job thread
// polls it in its run loop.  It is atomic because an unsynchronized
// plain bool shared between threads this way is a data race.
static std::atomic<bool> stop{false};

// Print sync mutex
static std::mutex print_mutex;

// A job schedules and runs a kernel using the exec_write command.
// All jobs share the same CU, but each has a separate ddr location for its result.
// All jobs run as fast as they can; the scheduler handles CU scheduling.
struct job_type
{
size_t id = 0;
size_t runs = 0;
xrt_device* m_xdev;
uint32_t m_cuidx;
uint64_t m_bo_dev_addr;
size_t id = 0; // unique id for this job
size_t runs = 0; // how many runs this job completed
xrt_device* m_xdev; // handle to lower level xrt device
uint32_t m_cuidx; // index of cu to use
size_t m_cuaddr; // cu base address added to regmap offset

xrtcpp::exec::exec_write_command m_cmd;
cl_mem m_mem; // memory object for kernel write
uint64_t m_bo_dev_addr; // physical device ddr address of mem

job_type(xrt_device* xdev, uint32_t cuidx, uint64_t bo_dev_addr)
: m_xdev(xdev), m_cuidx(cuidx), m_bo_dev_addr(bo_dev_addr)
, m_cmd(xrtcpp::exec::exec_write_command(xdev))
cl_command_queue m_queue; // for enqueue operations

xrtcpp::exec::exec_write_command m_cmd; // exec_write command object

job_type(cl_context context, cl_device_id device, cl_command_queue queue, xrt_device* xdev, uint32_t cuidx, size_t cuaddr)
: m_xdev(xdev), m_cuidx(cuidx), m_cuaddr(cuaddr)
, m_mem(nullptr), m_bo_dev_addr(0)
, m_queue(queue)
, m_cmd(xrtcpp::exec::exec_write_command(m_xdev))
{
static size_t count = 0;
id = count++;

// Create a buffer for the verify kernel and get dbuf address
cl_int err = CL_SUCCESS;
m_mem = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(char) * LENGTH, nullptr,&err);
throw_if_error(err,"failed to create kernel output buffer");
throw_if_error(xclGetMemObjDeviceAddress(m_mem,device,sizeof(uint64_t),&m_bo_dev_addr),"failed to get dbuf address");

// No indirect migration so force it
throw_if_error(clEnqueueMigrateMemObjects(m_queue,1,&m_mem,0,0,nullptr,nullptr),"failed to migrate");
clFinish(m_queue);
}

~job_type()
{
clReleaseMemObject(m_mem);
}

void
run()
{
while (!stop) {
m_cmd.clear();
m_cmd.add(XHELLO_HELLO_CONTROL_ADDR_ACCESS1_DATA,m_bo_dev_addr); // low
m_cmd.add(XHELLO_HELLO_CONTROL_ADDR_ACCESS1_DATA+4,(m_bo_dev_addr >> 32) & 0xFFFFFFFF); // high part of a
for (uint32_t offset = 0x10; offset < XHELLO_HELLO_CONTROL_ADDR_ACCESS1_DATA; offset += 4)
m_cmd.add(m_cuaddr + offset,0);
m_cmd.add(m_cuaddr + XHELLO_HELLO_CONTROL_ADDR_ACCESS1_DATA,m_bo_dev_addr); // low
m_cmd.add(m_cuaddr + XHELLO_HELLO_CONTROL_ADDR_ACCESS1_DATA+4,(m_bo_dev_addr >> 32) & 0xFFFFFFFF); // high part of a
m_cmd.add_cu(m_cuidx);
m_cmd.execute();
m_cmd.wait();
assert(m_cmd.state() == ERT_CMD_STATE_COMPLETED);

// execute same command again demo completed() API busy wait
int count = 0;
Expand All @@ -124,19 +182,24 @@ struct job_type

runs += 2;
}

// Verify result
char hbuf[LENGTH] = {0};
throw_if_error(clEnqueueReadBuffer(m_queue,m_mem,CL_TRUE,0,sizeof(char)*LENGTH,hbuf,0,nullptr,nullptr),"failed to read");
debug::printf("job[%d] daddr(0x%p) result = %s\n",id,m_bo_dev_addr,hbuf);
}
};

static int
run_kernel(xrt_device* xdev, uint32_t cuidx, uint64_t bo_dev_addr)
run_kernel(cl_context context, cl_device_id device, cl_command_queue queue, xrt_device* xdev, uint32_t cuidx, size_t cuaddr)
{
xrtcpp::acquire_cu_context(xdev,cuidx);

// create jobs
std::vector<job_type> jobs;
jobs.reserve(num_jobs);
for (size_t j=0; j<num_jobs; ++j)
jobs.emplace_back(xdev,cuidx,bo_dev_addr);
jobs.emplace_back(context,device,queue,xdev,cuidx,cuaddr);

// each job runs on its own thread
auto launch = [](job_type& j) {
Expand All @@ -148,7 +211,7 @@ run_kernel(xrt_device* xdev, uint32_t cuidx, uint64_t bo_dev_addr)
for (auto& j : jobs)
workers.emplace_back(std::thread(launch,std::ref(j)));

std::this_thread::sleep_for(std::chrono::seconds(seconds));
std::this_thread::sleep_for(std::chrono::milliseconds(mseconds));
stop=true;

for (auto& t : workers)
Expand All @@ -167,13 +230,6 @@ run_test(cl_device_id device, cl_program program, cl_context context, cl_command
{
cl_int err = 0;

// Create a buffer for the verify kernel and get dbuf address
auto mem = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(char) * LENGTH, nullptr,&err);
throw_if_error(err,"failed to create kernel output buffer");
uint64_t dbuf = 0;
throw_if_error(xclGetMemObjDeviceAddress(mem,device,sizeof(uint64_t),&dbuf),"failed to get dbuf address");
throw_if_error(clEnqueueMigrateMemObjects(queue,1,&mem,0,0,nullptr,nullptr),"failed to migrate");

// Create kernel to get cu index to use with exec_write
auto kernel = clCreateKernel(program, "hello", &err);
throw_if_error(err,"failed to create hello kernel");
Expand All @@ -184,17 +240,15 @@ run_test(cl_device_id device, cl_program program, cl_context context, cl_command
cl_uint cuidx; // retrieve index of first cu in kernel
throw_if_error(xclGetComputeUnitInfo(kernel,0,XCL_COMPUTE_UNIT_INDEX,sizeof(cuidx),&cuidx,nullptr),"info index failed");

size_t cuaddr;
throw_if_error(xclGetComputeUnitInfo(kernel,0,XCL_COMPUTE_UNIT_BASE_ADDRESS,sizeof(cuaddr),&cuaddr,nullptr),"info addr failed");

// Get handle to underlying xrt_device
auto xdev = xclGetXrtDevice(device,&err);
throw_if_error(err,"failed to get xrt_device");

// Now run the kernel using the low level exec write command interface
auto ret = run_kernel(xdev,cuidx,dbuf);

// Verify the result
char hbuf[LENGTH] = {0};
throw_if_error(clEnqueueReadBuffer(queue,mem,CL_TRUE,0,sizeof(char)*LENGTH,hbuf,0,nullptr,nullptr),"failed to read");
std::cout << "kernel result: " << hbuf << "\n";
auto ret = run_kernel(context,device,queue,xdev,cuidx,cuaddr);

////////////////////////////////////////////////////////////////
// Unrelated code demoing xclGetComputeUnitInfo
Expand All @@ -211,17 +265,20 @@ run_test(cl_device_id device, cl_program program, cl_context context, cl_command
cl_uint cuidx;
throw_if_error
(xclGetComputeUnitInfo(kernel,cuid,XCL_COMPUTE_UNIT_INDEX,sizeof(cuidx),&cuidx,nullptr),"info index failed");
size_t cuaddr;
throw_if_error
(xclGetComputeUnitInfo(kernel,cuid,XCL_COMPUTE_UNIT_BASE_ADDRESS,sizeof(cuaddr),&cuaddr,nullptr),"info addr failed");
std::vector<cl_ulong> cumem(numargs);
throw_if_error
(xclGetComputeUnitInfo(kernel,cuid,XCL_COMPUTE_UNIT_CONNECTIONS,sizeof(cl_ulong)*numargs,cumem.data(),nullptr),"info conn failed");
std::cout << " cu[" << cuid << "].name = " << cunm << "\n";
std::cout << " cu[" << cuid << "].idx = " << cuidx << "\n";
std::cout << " cu[" << cuid << "].idx = " << cuidx << "\n";
std::cout << " cu[" << cuid << "].addr = 0x" << std::hex << cuaddr << std::dec << "\n";
for (auto memidx : cumem)
std::cout << " cu[" << cuid << "].mem = 0x" << std::hex << memidx << std::dec << "\n";
std::cout << " cu[" << cuid << "].mem = 0x" << std::hex << memidx << std::dec << "\n";
}

clReleaseKernel(kernel);
clReleaseMemObject(mem);

return ret;
}
Expand Down
2 changes: 0 additions & 2 deletions tests/unit_test/experimental/exec_write/sdaccel.ini

This file was deleted.

0 comments on commit 311ef77

Please sign in to comment.