diff --git a/sycl/doc/design/SYCLInstrumentationUsingXPTI.md b/sycl/doc/design/SYCLInstrumentationUsingXPTI.md index 581aa9b9e25ea..d7022d11ac5a6 100644 --- a/sycl/doc/design/SYCLInstrumentationUsingXPTI.md +++ b/sycl/doc/design/SYCLInstrumentationUsingXPTI.md @@ -279,13 +279,14 @@ All trace point types in bold provide semantic information about the graph, node | `sym_source_file_name` | C-style string | Source file name | | `sym_line_no` | `int32_t` | File line number | | `sym_column_no` | `int32_t` | File column number | - +| `enqueue_kernel_data` | `xpti::offload_kernel_arg_data_t` | Includes kernel execution parameters (global size, local size, offset) and number of kernel arguments | +| `argN` | `xpti::offload_kernel_arg_data_t` | Description for the Nth kernel argument. It includes argument kind (sycl::detail::kernel_param_kind_t), pointer to the value, size and index in the argument list. | ## Buffer management stream `"sycl.experimental.buffer"` Notification Signatures | Trace Point Type | Parameter Description | Metadata | | :------------------------: | :-------------------- | :------- | -| `offload_alloc_construct` |
  • **trace_type**: `xpti::trace_point_type_t::offload_buffer_data_t` that marks offload buffer creation point
  • **parent**: Event ID created for all functions in the `oneapi.experimental.buffer` layer.
  • **event**: `nullptr` - since the stream of data just captures functions being called.
  • **instance**: `nullptr` since no begin-end event alignment is needed.
  • **user_data**: A pointer to `offload_buffer_data_t` object, that includes buffer object ID, host pointer used to create/initialize buffer, buffer element information (type name, size), number of buffer dimensions and buffer size for each dimension.
  • | None | +| `offload_alloc_construct` |
  • **trace_type**: `xpti::trace_point_type_t::offload_buffer_data_t` that marks offload buffer creation point
  • **parent**: Event ID created for all functions in the `oneapi.experimental.buffer` layer.
  • **event**: `xpti::trace_event_data_t` - contains information about source location.
  • **instance**: `nullptr` since no begin-end event alignment is needed.
  • **user_data**: A pointer to `offload_buffer_data_t` object, that includes buffer object ID, host pointer used to create/initialize buffer, buffer element information (type name, size), number of buffer dimensions and buffer size for each dimension.
  • | None | | `offload_alloc_associate` |
  • **trace_type**: `xpti::trace_point_type_t::offload_buffer_association_data_t` that provides association between user level buffer object and platform specific memory object
  • **parent**: Event ID created for all functions in the `oneapi.experimental.buffer` layer.
  • **event**: `nullptr` - since the stream of data just captures functions being called.
  • **instance**: `nullptr` since no begin-end event alignment is needed.
  • **user_data**: A pointer to `offload_buffer_association_data_t` object, that includes user object ID and platform-specific representation for offload buffer.
  • | None | | `offload_alloc_destruct` |
  • **trace_type**: `xpti::trace_point_type_t::offload_buffer_data_t` that marks offload buffer destruction point
  • **parent**: Event ID created for all functions in the `oneapi.experimental.buffer` layer.
  • **event**: `nullptr` - since the stream of data just captures functions being called.
  • **instance**: `nullptr` since no begin-end event alignment is needed.
  • **user_data**: A pointer to `offload_buffer_data_t` object, that includes buffer object ID.
  • | None | | `offload_alloc_release` |
  • **trace_type**: `xpti::trace_point_type_t::offload_buffer_release_data_t` that provides information about release of platform specific memory object
  • **parent**: `nullptr` - since the stream of data just captures functions being called.
  • **event**: `nullptr` - since the stream of data just captures functions being called.
  • **instance**: `nullptr` since no begin-end event alignment is needed.
  • **user_data**: A pointer to `offload_buffer_association_data_t` object, that includes user object ID and platform-specific representation for offload buffer.
  • | None | diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index f9b36419f6294..6138b21723a57 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -86,6 +86,41 @@ static std::string deviceToString(device Device) { return "UNKNOWN"; } +static void applyFuncOnFilteredArgs( + const ProgramManager::KernelArgMask &EliminatedArgMask, + std::vector &Args, + std::function Func) { + if (EliminatedArgMask.empty()) { + for (ArgDesc &Arg : Args) { + Func(Arg, Arg.MIndex); + } + } else { + // TODO this is not necessary as long as we can guarantee that the + // arguments are already sorted (e. g. handle the sorting in handler + // if necessary due to set_arg(...) usage). + std::sort(Args.begin(), Args.end(), [](const ArgDesc &A, const ArgDesc &B) { + return A.MIndex < B.MIndex; + }); + int LastIndex = -1; + size_t NextTrueIndex = 0; + + for (ArgDesc &Arg : Args) { + // Handle potential gaps in set arguments (e. g. if some of them are + // set on the user side). + for (int Idx = LastIndex + 1; Idx < Arg.MIndex; ++Idx) + if (!EliminatedArgMask[Idx]) + ++NextTrueIndex; + LastIndex = Arg.MIndex; + + if (EliminatedArgMask[Arg.MIndex]) + continue; + + Func(Arg, NextTrueIndex); + ++NextTrueIndex; + } + } +} + #ifdef XPTI_ENABLE_INSTRUMENTATION static size_t deviceToID(const device &Device) { if (Device.is_host()) @@ -1779,6 +1814,73 @@ void ExecCGCommand::emitInstrumentationData() { xpti::addMetadata(CmdTraceEvent, "sym_column_no", MCommandGroup->MColumn); } + if (MCommandGroup->getType() == detail::CG::Kernel) { + auto KernelCG = + reinterpret_cast(MCommandGroup.get()); + auto &NDRDesc = KernelCG->MNDRDesc; + std::vector Args; + + auto FilterArgs = [&Args](detail::ArgDesc &Arg, int NextTrueIndex) { + Args.push_back({Arg.MType, Arg.MPtr, Arg.MSize, NextTrueIndex}); + }; + RT::PiProgram Program = nullptr; + RT::PiKernel Kernel = nullptr; + std::mutex *KernelMutex = nullptr; + + std::shared_ptr SyclKernelImpl; + std::shared_ptr DeviceImageImpl; + auto KernelBundleImplPtr = KernelCG->getKernelBundle(); + + // Use kernel_bundle if available unless it is interop. + // Interop bundles can't be used in the first branch, because the kernels + // in interop kernel bundles (if any) do not have kernel_id + // and can therefore not be looked up, but since they are self-contained + // they can simply be launched directly. + if (KernelBundleImplPtr && !KernelBundleImplPtr->isInterop()) { + kernel_id KernelID = + detail::ProgramManager::getInstance().getSYCLKernelID( + KernelCG->MKernelName); + kernel SyclKernel = + KernelBundleImplPtr->get_kernel(KernelID, KernelBundleImplPtr); + Program = detail::getSyclObjImpl(SyclKernel) + ->getDeviceImage() + ->get_program_ref(); + } else if (nullptr != KernelCG->MSyclKernel) { + auto SyclProg = detail::getSyclObjImpl( + KernelCG->MSyclKernel->get_info()); + Program = SyclProg->getHandleRef(); + } else { + std::tie(Kernel, KernelMutex, Program) = + detail::ProgramManager::getInstance().getOrCreateKernel( + KernelCG->MOSModuleHandle, MQueue->getContextImplPtr(), + MQueue->getDeviceImplPtr(), KernelCG->MKernelName, nullptr); + } + + ProgramManager::KernelArgMask EliminatedArgMask; + if (nullptr == KernelCG->MSyclKernel || + !KernelCG->MSyclKernel->isCreatedFromSource()) { + EliminatedArgMask = + detail::ProgramManager::getInstance().getEliminatedKernelArgMask( + KernelCG->MOSModuleHandle, Program, KernelCG->MKernelName); + } + + applyFuncOnFilteredArgs(EliminatedArgMask, KernelCG->MArgs, FilterArgs); + + xpti::offload_kernel_enqueue_data_t KernelData{ + {NDRDesc.GlobalSize[0], NDRDesc.GlobalSize[1], NDRDesc.GlobalSize[2]}, + {NDRDesc.LocalSize[0], NDRDesc.LocalSize[1], NDRDesc.LocalSize[2]}, + {NDRDesc.GlobalOffset[0], NDRDesc.GlobalOffset[1], + NDRDesc.GlobalOffset[2]}, + Args.size()}; + xpti::addMetadata(CmdTraceEvent, "enqueue_kernel_data", KernelData); + for (size_t i = 0; i < Args.size(); i++) { + std::string Prefix("arg"); + xpti::offload_kernel_arg_data_t arg{(int)Args[i].MType, Args[i].MPtr, + Args[i].MSize, Args[i].MIndex}; + xpti::addMetadata(CmdTraceEvent, Prefix + std::to_string(i), arg); + } + } + xptiNotifySubscribers(MStreamID, xpti::trace_node_create, detail::GSYCLGraphEvent, CmdTraceEvent, CGKernelInstanceNo, @@ -1937,35 +2039,7 @@ static pi_result SetKernelParamsAndLaunch( } }; - if (EliminatedArgMask.empty()) { - for (ArgDesc &Arg : Args) { - setFunc(Arg, Arg.MIndex); - } - } else { - // TODO this is not necessary as long as we can guarantee that the arguments - // are already sorted (e. g. handle the sorting in handler if necessary due - // to set_arg(...) usage). - std::sort(Args.begin(), Args.end(), [](const ArgDesc &A, const ArgDesc &B) { - return A.MIndex < B.MIndex; - }); - int LastIndex = -1; - size_t NextTrueIndex = 0; - - for (ArgDesc &Arg : Args) { - // Handle potential gaps in set arguments (e. g. if some of them are set - // on the user side). - for (int Idx = LastIndex + 1; Idx < Arg.MIndex; ++Idx) - if (!EliminatedArgMask[Idx]) - ++NextTrueIndex; - LastIndex = Arg.MIndex; - - if (EliminatedArgMask[Arg.MIndex]) - continue; - - setFunc(Arg, NextTrueIndex); - ++NextTrueIndex; - } - } + applyFuncOnFilteredArgs(EliminatedArgMask, Args, setFunc); adjustNDRangePerKernel(NDRDesc, Kernel, *(Queue->getDeviceImplPtr())); @@ -2380,15 +2454,16 @@ cl_int ExecCGCommand::enqueueImp() { Plugin.call(RawEvents.size(), &RawEvents[0]); } std::vector ReqMemObjs; - // Extract the Mem Objects for all Requirements, to ensure they are available if - // a user ask for them inside the interop task scope - const auto& HandlerReq = ExecInterop->MRequirements; - std::for_each(std::begin(HandlerReq), std::end(HandlerReq), [&](Requirement* Req) { - AllocaCommandBase *AllocaCmd = getAllocaForReq(Req); - auto MemArg = reinterpret_cast(AllocaCmd->getMemAllocation()); - interop_handler::ReqToMem ReqToMem = std::make_pair(Req, MemArg); - ReqMemObjs.emplace_back(ReqToMem); - }); + // Extract the Mem Objects for all Requirements, to ensure they are + // available if a user ask for them inside the interop task scope + const auto &HandlerReq = ExecInterop->MRequirements; + std::for_each( + std::begin(HandlerReq), std::end(HandlerReq), [&](Requirement *Req) { + AllocaCommandBase *AllocaCmd = getAllocaForReq(Req); + auto MemArg = reinterpret_cast(AllocaCmd->getMemAllocation()); + interop_handler::ReqToMem ReqToMem = std::make_pair(Req, MemArg); + ReqMemObjs.emplace_back(ReqToMem); + }); std::sort(std::begin(ReqMemObjs), std::end(ReqMemObjs)); interop_handler InteropHandler(std::move(ReqMemObjs), MQueue); diff --git a/xpti/include/xpti/xpti_data_types.h b/xpti/include/xpti/xpti_data_types.h index 40348aa10ed00..5b2b3f3906c0c 100644 --- a/xpti/include/xpti/xpti_data_types.h +++ b/xpti/include/xpti/xpti_data_types.h @@ -394,7 +394,7 @@ enum class trace_point_type_t : uint16_t { offload_alloc_destruct = XPTI_TRACE_POINT_BEGIN(22), /// Used to notify about releasing internal handle for offload buffer offload_alloc_release = XPTI_TRACE_POINT_BEGIN(23), - /// Used to notify about creation accessor for ofload buffer + /// Used to notify about creation accessor for offload buffer offload_alloc_accessor = XPTI_TRACE_POINT_BEGIN(24), /// Indicates that the trace point is user defined and only the tool defined /// for a stream will be able to handle it @@ -569,6 +569,31 @@ struct offload_buffer_association_data_t { /// A pointer to platform specific handler for the offload object uintptr_t mem_object_handle = 0; }; + +/// Describes enqueued kernel object +struct offload_kernel_enqueue_data_t { + /// Global size + size_t global_size[3] = {0, 0, 0}; + /// Local size + size_t local_size[3] = {0, 0, 0}; + /// Offset + size_t offset[3] = {0, 0, 0}; + /// Number of kernel arguments + size_t args_num = 0; +}; + +/// Describes enqueued kernel argument +struct offload_kernel_arg_data_t { + /// Argument type as set in kernel_param_kind_t + int type = -1; + /// Pointer to the data + void *pointer = nullptr; + /// Size of the argument + int size = 0; + /// Index of the argument in the kernel + int index = 0; +}; + /// Describes memory allocation struct mem_alloc_data_t { /// A platform-specific memory object handle. Some heterogeneous programming