diff --git a/sycl/doc/design/SYCLInstrumentationUsingXPTI.md b/sycl/doc/design/SYCLInstrumentationUsingXPTI.md
index 581aa9b9e25ea..d7022d11ac5a6 100644
--- a/sycl/doc/design/SYCLInstrumentationUsingXPTI.md
+++ b/sycl/doc/design/SYCLInstrumentationUsingXPTI.md
@@ -279,13 +279,14 @@ All trace point types in bold provide semantic information about the graph, node
| `sym_source_file_name` | C-style string | Source file name |
| `sym_line_no` | `int32_t` | File line number |
| `sym_column_no` | `int32_t` | File column number |
-
+| `enqueue_kernel_data` | `xpti::offload_kernel_enqueue_data_t` | Kernel execution parameters: global size, local size, global offset, and the number of kernel arguments |
+| `argN` | `xpti::offload_kernel_arg_data_t` | Description of the Nth kernel argument: the argument kind (`sycl::detail::kernel_param_kind_t`), a pointer to the value, its size, and its index in the argument list |
## Buffer management stream `"sycl.experimental.buffer"` Notification Signatures
| Trace Point Type | Parameter Description | Metadata |
| :------------------------: | :-------------------- | :------- |
-| `offload_alloc_construct` | **trace_type**: `xpti::trace_point_type_t::offload_buffer_data_t` that marks offload buffer creation point **parent**: Event ID created for all functions in the `oneapi.experimental.buffer` layer. **event**: `nullptr` - since the stream of data just captures functions being called. **instance**: `nullptr` since no begin-end event alignment is needed. **user_data**: A pointer to `offload_buffer_data_t` object, that includes buffer object ID, host pointer used to create/initialize buffer, buffer element information (type name, size), number of buffer dimensions and buffer size for each dimension. | None |
+| `offload_alloc_construct` | **trace_type**: `xpti::trace_point_type_t::offload_buffer_data_t` that marks offload buffer creation point **parent**: Event ID created for all functions in the `oneapi.experimental.buffer` layer. **event**: `xpti::trace_event_data_t` - contains information about source location. **instance**: `nullptr` since no begin-end event alignment is needed. **user_data**: A pointer to `offload_buffer_data_t` object, that includes buffer object ID, host pointer used to create/initialize buffer, buffer element information (type name, size), number of buffer dimensions and buffer size for each dimension. | None |
| `offload_alloc_associate` | **trace_type**: `xpti::trace_point_type_t::offload_buffer_association_data_t` that provides association between user level buffer object and platform specific memory object **parent**: Event ID created for all functions in the `oneapi.experimental.buffer` layer. **event**: `nullptr` - since the stream of data just captures functions being called. **instance**: `nullptr` since no begin-end event alignment is needed. **user_data**: A pointer to `offload_buffer_association_data_t` object, that includes user object ID and platform-specific representation for offload buffer. | None |
| `offload_alloc_destruct` | **trace_type**: `xpti::trace_point_type_t::offload_buffer_data_t` that marks offload buffer destruction point **parent**: Event ID created for all functions in the `oneapi.experimental.buffer` layer. **event**: `nullptr` - since the stream of data just captures functions being called. **instance**: `nullptr` since no begin-end event alignment is needed. **user_data**: A pointer to `offload_buffer_data_t` object, that includes buffer object ID. | None |
| `offload_alloc_release` | **trace_type**: `xpti::trace_point_type_t::offload_buffer_release_data_t` that provides information about release of platform specific memory object **parent**: `nullptr` - since the stream of data just captures functions being called. **event**: `nullptr` - since the stream of data just captures functions being called. **instance**: `nullptr` since no begin-end event alignment is needed. **user_data**: A pointer to `offload_buffer_association_data_t` object, that includes user object ID and platform-specific representation for offload buffer. | None |
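
The two metadata rows added above are the keys a tool can look for on kernel-launch node events. As a rough illustration (not part of this change), a subscriber could register for `xpti::trace_node_create` notifications and enumerate those keys as sketched below; the entry points and registration calls follow the usual XPTI subscriber pattern, while decoding the value payloads back into `offload_kernel_enqueue_data_t`/`offload_kernel_arg_data_t` is tool-specific and omitted here.

```cpp
// Illustrative XPTI subscriber sketch: list the metadata keys attached to
// node-creation events. Assumes the scheduler publishes these events on the
// "sycl" stream, as described elsewhere in this document.
#include "xpti/xpti_trace_framework.h"

#include <cstdio>
#include <cstring>

static void nodeCreateCallback(uint16_t /*TraceType*/,
                               xpti::trace_event_data_t * /*Parent*/,
                               xpti::trace_event_data_t *Event,
                               uint64_t /*Instance*/, const void * /*UserData*/) {
  if (!Event)
    return;
  // Walk the metadata table and print the key names; for kernel launches this
  // includes "enqueue_kernel_data" and "arg0", "arg1", ...
  xpti::metadata_t *Metadata = xptiQueryMetadata(Event);
  if (!Metadata)
    return;
  for (const auto &Item : *Metadata)
    std::printf("  key: %s\n", xptiLookupString(Item.first));
}

XPTI_CALLBACK_API void xptiTraceInit(unsigned int /*MajorVersion*/,
                                     unsigned int /*MinorVersion*/,
                                     const char * /*VersionStr*/,
                                     const char *StreamName) {
  if (std::strcmp(StreamName, "sycl") != 0)
    return; // Only interested in the scheduler's node events here.
  uint8_t StreamID = xptiRegisterStream(StreamName);
  xptiRegisterCallback(StreamID, xpti::trace_node_create, nodeCreateCallback);
}

XPTI_CALLBACK_API void xptiTraceFinish(const char * /*StreamName*/) {}
```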
diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp
index f9b36419f6294..6138b21723a57 100644
--- a/sycl/source/detail/scheduler/commands.cpp
+++ b/sycl/source/detail/scheduler/commands.cpp
@@ -86,6 +86,41 @@ static std::string deviceToString(device Device) {
return "UNKNOWN";
}
+static void applyFuncOnFilteredArgs(
+ const ProgramManager::KernelArgMask &EliminatedArgMask,
+    std::vector<ArgDesc> &Args,
+    std::function<void(detail::ArgDesc &Arg, int NextTrueIndex)> Func) {
+ if (EliminatedArgMask.empty()) {
+ for (ArgDesc &Arg : Args) {
+ Func(Arg, Arg.MIndex);
+ }
+ } else {
+ // TODO this is not necessary as long as we can guarantee that the
+ // arguments are already sorted (e. g. handle the sorting in handler
+ // if necessary due to set_arg(...) usage).
+ std::sort(Args.begin(), Args.end(), [](const ArgDesc &A, const ArgDesc &B) {
+ return A.MIndex < B.MIndex;
+ });
+ int LastIndex = -1;
+ size_t NextTrueIndex = 0;
+
+ for (ArgDesc &Arg : Args) {
+ // Handle potential gaps in set arguments (e. g. if some of them are
+ // set on the user side).
+ for (int Idx = LastIndex + 1; Idx < Arg.MIndex; ++Idx)
+ if (!EliminatedArgMask[Idx])
+ ++NextTrueIndex;
+ LastIndex = Arg.MIndex;
+
+ if (EliminatedArgMask[Arg.MIndex])
+ continue;
+
+ Func(Arg, NextTrueIndex);
+ ++NextTrueIndex;
+ }
+ }
+}
+
#ifdef XPTI_ENABLE_INSTRUMENTATION
static size_t deviceToID(const device &Device) {
if (Device.is_host())
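
The `applyFuncOnFilteredArgs` helper added above remaps each argument's original index (`MIndex`) to its position after dead-argument elimination, stepping over gaps left by arguments that are not present in the list. The standalone sketch below (stand-in types, not the real `ArgDesc`/`KernelArgMask`) walks the same logic for a hypothetical mask so the remapping is easier to follow.

```cpp
// Standalone illustration of the index remapping done by
// applyFuncOnFilteredArgs: given the original argument indices and a mask of
// eliminated arguments, compute the index each surviving argument gets in the
// kernel actually launched. Types and data here are made up for the example.
#include <cstdio>
#include <vector>

struct Arg {
  int Index; // original index in the command group (MIndex)
};

int main() {
  // true == the argument was eliminated by dead-argument elimination.
  std::vector<bool> EliminatedMask = {false, true, false, false};
  // Original index 1 is absent from the list (a gap) and is also eliminated.
  std::vector<Arg> Args = {{0}, {2}, {3}};

  int LastIndex = -1;
  size_t NextTrueIndex = 0;
  for (const Arg &A : Args) {
    // Account for original indices we skipped over: each skipped index that
    // was not eliminated still occupies a slot in the launched kernel.
    for (int Idx = LastIndex + 1; Idx < A.Index; ++Idx)
      if (!EliminatedMask[Idx])
        ++NextTrueIndex;
    LastIndex = A.Index;

    if (EliminatedMask[A.Index])
      continue; // eliminated arguments are not passed to the kernel

    std::printf("original index %d -> launch index %zu\n", A.Index,
                NextTrueIndex);
    ++NextTrueIndex;
  }
  // Prints: 0 -> 0, 2 -> 1, 3 -> 2.
  return 0;
}
```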
@@ -1779,6 +1814,73 @@ void ExecCGCommand::emitInstrumentationData() {
xpti::addMetadata(CmdTraceEvent, "sym_column_no", MCommandGroup->MColumn);
}
+ if (MCommandGroup->getType() == detail::CG::Kernel) {
+ auto KernelCG =
+        reinterpret_cast<detail::CGExecKernel *>(MCommandGroup.get());
+ auto &NDRDesc = KernelCG->MNDRDesc;
+    std::vector<ArgDesc> Args;
+
+ auto FilterArgs = [&Args](detail::ArgDesc &Arg, int NextTrueIndex) {
+ Args.push_back({Arg.MType, Arg.MPtr, Arg.MSize, NextTrueIndex});
+ };
+ RT::PiProgram Program = nullptr;
+ RT::PiKernel Kernel = nullptr;
+ std::mutex *KernelMutex = nullptr;
+
+    std::shared_ptr<detail::kernel_impl> SyclKernelImpl;
+    std::shared_ptr<detail::device_image_impl> DeviceImageImpl;
+ auto KernelBundleImplPtr = KernelCG->getKernelBundle();
+
+ // Use kernel_bundle if available unless it is interop.
+ // Interop bundles can't be used in the first branch, because the kernels
+ // in interop kernel bundles (if any) do not have kernel_id
+ // and can therefore not be looked up, but since they are self-contained
+ // they can simply be launched directly.
+ if (KernelBundleImplPtr && !KernelBundleImplPtr->isInterop()) {
+ kernel_id KernelID =
+ detail::ProgramManager::getInstance().getSYCLKernelID(
+ KernelCG->MKernelName);
+ kernel SyclKernel =
+ KernelBundleImplPtr->get_kernel(KernelID, KernelBundleImplPtr);
+ Program = detail::getSyclObjImpl(SyclKernel)
+ ->getDeviceImage()
+ ->get_program_ref();
+ } else if (nullptr != KernelCG->MSyclKernel) {
+ auto SyclProg = detail::getSyclObjImpl(
+          KernelCG->MSyclKernel->get_info<info::kernel::program>());
+ Program = SyclProg->getHandleRef();
+ } else {
+ std::tie(Kernel, KernelMutex, Program) =
+ detail::ProgramManager::getInstance().getOrCreateKernel(
+ KernelCG->MOSModuleHandle, MQueue->getContextImplPtr(),
+ MQueue->getDeviceImplPtr(), KernelCG->MKernelName, nullptr);
+ }
+
+ ProgramManager::KernelArgMask EliminatedArgMask;
+ if (nullptr == KernelCG->MSyclKernel ||
+ !KernelCG->MSyclKernel->isCreatedFromSource()) {
+ EliminatedArgMask =
+ detail::ProgramManager::getInstance().getEliminatedKernelArgMask(
+ KernelCG->MOSModuleHandle, Program, KernelCG->MKernelName);
+ }
+
+ applyFuncOnFilteredArgs(EliminatedArgMask, KernelCG->MArgs, FilterArgs);
+
+ xpti::offload_kernel_enqueue_data_t KernelData{
+ {NDRDesc.GlobalSize[0], NDRDesc.GlobalSize[1], NDRDesc.GlobalSize[2]},
+ {NDRDesc.LocalSize[0], NDRDesc.LocalSize[1], NDRDesc.LocalSize[2]},
+ {NDRDesc.GlobalOffset[0], NDRDesc.GlobalOffset[1],
+ NDRDesc.GlobalOffset[2]},
+ Args.size()};
+ xpti::addMetadata(CmdTraceEvent, "enqueue_kernel_data", KernelData);
+ for (size_t i = 0; i < Args.size(); i++) {
+ std::string Prefix("arg");
+ xpti::offload_kernel_arg_data_t arg{(int)Args[i].MType, Args[i].MPtr,
+ Args[i].MSize, Args[i].MIndex};
+ xpti::addMetadata(CmdTraceEvent, Prefix + std::to_string(i), arg);
+ }
+ }
+
xptiNotifySubscribers(MStreamID, xpti::trace_node_create,
detail::GSYCLGraphEvent, CmdTraceEvent,
CGKernelInstanceNo,
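
The loop at the end of the instrumentation block above emits one metadata entry per surviving argument, keyed `arg0`, `arg1`, ..., where N is the position in the filtered list and the `index` field is the slot the argument occupies in the launched kernel; the two can differ when the list has gaps. A small standalone sketch of that naming scheme (invented data, not the real argument list):

```cpp
// Standalone sketch of the "argN" key naming used above: N is the position in
// the filtered argument list, while 'Index' is the argument's slot in the
// launched kernel. They can differ, e.g. when an original argument is missing
// from the runtime's list and leaves a gap in the kernel-side indices.
#include <cstdio>
#include <string>
#include <vector>

struct FilteredArg {
  int Type;  // argument kind (kernel_param_kind_t stored as an int)
  void *Ptr; // pointer to the value
  int Size;  // size of the argument
  int Index; // index in the launched kernel
};

int main() {
  // Hypothetical filtered list: the argument at original index 1 is not in
  // this list, so the kernel-side index jumps from 0 to 2.
  std::vector<FilteredArg> Args = {{1, nullptr, 4, 0}, {1, nullptr, 8, 2}};
  for (size_t I = 0; I < Args.size(); ++I) {
    std::string Key = "arg" + std::to_string(I);
    std::printf("%s: type=%d size=%d index=%d\n", Key.c_str(), Args[I].Type,
                Args[I].Size, Args[I].Index);
  }
  return 0;
}
```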
@@ -1937,35 +2039,7 @@ static pi_result SetKernelParamsAndLaunch(
}
};
- if (EliminatedArgMask.empty()) {
- for (ArgDesc &Arg : Args) {
- setFunc(Arg, Arg.MIndex);
- }
- } else {
- // TODO this is not necessary as long as we can guarantee that the arguments
- // are already sorted (e. g. handle the sorting in handler if necessary due
- // to set_arg(...) usage).
- std::sort(Args.begin(), Args.end(), [](const ArgDesc &A, const ArgDesc &B) {
- return A.MIndex < B.MIndex;
- });
- int LastIndex = -1;
- size_t NextTrueIndex = 0;
-
- for (ArgDesc &Arg : Args) {
- // Handle potential gaps in set arguments (e. g. if some of them are set
- // on the user side).
- for (int Idx = LastIndex + 1; Idx < Arg.MIndex; ++Idx)
- if (!EliminatedArgMask[Idx])
- ++NextTrueIndex;
- LastIndex = Arg.MIndex;
-
- if (EliminatedArgMask[Arg.MIndex])
- continue;
-
- setFunc(Arg, NextTrueIndex);
- ++NextTrueIndex;
- }
- }
+ applyFuncOnFilteredArgs(EliminatedArgMask, Args, setFunc);
adjustNDRangePerKernel(NDRDesc, Kernel, *(Queue->getDeviceImplPtr()));
@@ -2380,15 +2454,16 @@ cl_int ExecCGCommand::enqueueImp() {
      Plugin.call<PiApiKind::piEventsWait>(RawEvents.size(), &RawEvents[0]);
}
    std::vector<interop_handler::ReqToMem> ReqMemObjs;
- // Extract the Mem Objects for all Requirements, to ensure they are available if
- // a user ask for them inside the interop task scope
- const auto& HandlerReq = ExecInterop->MRequirements;
- std::for_each(std::begin(HandlerReq), std::end(HandlerReq), [&](Requirement* Req) {
- AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
-    auto MemArg = reinterpret_cast<pi_mem>(AllocaCmd->getMemAllocation());
- interop_handler::ReqToMem ReqToMem = std::make_pair(Req, MemArg);
- ReqMemObjs.emplace_back(ReqToMem);
- });
+      // Extract the Mem Objects for all Requirements, to ensure they are
+      // available if a user asks for them inside the interop task scope
+ const auto &HandlerReq = ExecInterop->MRequirements;
+ std::for_each(
+ std::begin(HandlerReq), std::end(HandlerReq), [&](Requirement *Req) {
+ AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
+            auto MemArg = reinterpret_cast<pi_mem>(AllocaCmd->getMemAllocation());
+ interop_handler::ReqToMem ReqToMem = std::make_pair(Req, MemArg);
+ ReqMemObjs.emplace_back(ReqToMem);
+ });
std::sort(std::begin(ReqMemObjs), std::end(ReqMemObjs));
interop_handler InteropHandler(std::move(ReqMemObjs), MQueue);
diff --git a/xpti/include/xpti/xpti_data_types.h b/xpti/include/xpti/xpti_data_types.h
index 40348aa10ed00..5b2b3f3906c0c 100644
--- a/xpti/include/xpti/xpti_data_types.h
+++ b/xpti/include/xpti/xpti_data_types.h
@@ -394,7 +394,7 @@ enum class trace_point_type_t : uint16_t {
offload_alloc_destruct = XPTI_TRACE_POINT_BEGIN(22),
/// Used to notify about releasing internal handle for offload buffer
offload_alloc_release = XPTI_TRACE_POINT_BEGIN(23),
- /// Used to notify about creation accessor for ofload buffer
+  /// Used to notify about creation of an accessor for an offload buffer
offload_alloc_accessor = XPTI_TRACE_POINT_BEGIN(24),
/// Indicates that the trace point is user defined and only the tool defined
/// for a stream will be able to handle it
@@ -569,6 +569,31 @@ struct offload_buffer_association_data_t {
/// A pointer to platform specific handler for the offload object
uintptr_t mem_object_handle = 0;
};
+
+/// Describes enqueued kernel object
+struct offload_kernel_enqueue_data_t {
+ /// Global size
+ size_t global_size[3] = {0, 0, 0};
+ /// Local size
+ size_t local_size[3] = {0, 0, 0};
+ /// Offset
+ size_t offset[3] = {0, 0, 0};
+ /// Number of kernel arguments
+ size_t args_num = 0;
+};
+
+/// Describes enqueued kernel argument
+struct offload_kernel_arg_data_t {
+ /// Argument type as set in kernel_param_kind_t
+ int type = -1;
+ /// Pointer to the data
+ void *pointer = nullptr;
+ /// Size of the argument
+ int size = 0;
+ /// Index of the argument in the kernel
+ int index = 0;
+};
+
/// Describes memory allocation
struct mem_alloc_data_t {
/// A platform-specific memory object handle. Some heterogeneous programming