Skip to content

Commit

Permalink
drm/xe/pf: Track adverse events notifications from GuC
Browse files Browse the repository at this point in the history
When thresholds used to monitor VF activity are configured,
the GuC may send GUC2PF_ADVERSE_EVENT messages to inform the
PF driver that a threshold was exceeded. Start handling such messages.

Reviewed-by: Piotr Piórkowski <[email protected]>
Signed-off-by: Michal Wajdeczko <[email protected]>
Link: https://patchwork.freedesktop.org/patch/msgid/[email protected]
  • Loading branch information
mwajdecz committed May 16, 2024
1 parent d5e12ff commit 335d62a
Show file tree
Hide file tree
Showing 6 changed files with 206 additions and 0 deletions.
1 change: 1 addition & 0 deletions drivers/gpu/drm/xe/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,7 @@ xe-$(CONFIG_PCI_IOV) += \
xe_gt_sriov_pf_config.o \
xe_gt_sriov_pf_control.o \
xe_gt_sriov_pf_debugfs.o \
xe_gt_sriov_pf_monitor.o \
xe_gt_sriov_pf_policy.o \
xe_gt_sriov_pf_service.o \
xe_lmtt.o \
Expand Down
147 changes: 147 additions & 0 deletions drivers/gpu/drm/xe/xe_gt_sriov_pf_monitor.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,147 @@
// SPDX-License-Identifier: MIT
/*
* Copyright © 2023-2024 Intel Corporation
*/

#include "abi/guc_actions_sriov_abi.h"
#include "abi/guc_messages_abi.h"

#include "xe_gt_sriov_pf_config.h"
#include "xe_gt_sriov_pf_helpers.h"
#include "xe_gt_sriov_pf_monitor.h"
#include "xe_gt_sriov_printk.h"
#include "xe_guc_klv_helpers.h"
#include "xe_guc_klv_thresholds_set.h"

/**
 * xe_gt_sriov_pf_monitor_flr - Forget adverse events recorded for a VF.
 * @gt: the &xe_gt
 * @vfid: the VF identifier
 *
 * To be used on VF FLR: every per-threshold adverse event counter kept
 * for the given VF on this GT is set back to zero.
 *
 * This function is for PF only.
 */
void xe_gt_sriov_pf_monitor_flr(struct xe_gt *gt, u32 vfid)
{
	unsigned int *counter;
	unsigned int left;

	xe_gt_assert(gt, IS_SRIOV_PF(gt_to_xe(gt)));
	xe_gt_sriov_pf_assert_vfid(gt, vfid);

	/* walk the VF's counters front to back, one slot per threshold */
	counter = gt->sriov.pf.vfs[vfid].monitor.guc.events;
	for (left = XE_GUC_KLV_NUM_THRESHOLDS; left; left--)
		*counter++ = 0;
}

/*
 * Record one more adverse event of kind @e for VF @vfid on this GT.
 * The caller is expected to pass an already validated @vfid; only the PF mode
 * and the range of @e are asserted here.
 */
static void pf_update_event_counter(struct xe_gt *gt, u32 vfid,
				    enum xe_guc_klv_threshold_index e)
{
	unsigned int *counter;

	xe_gt_assert(gt, IS_SRIOV_PF(gt_to_xe(gt)));
	xe_gt_assert(gt, e < XE_GUC_KLV_NUM_THRESHOLDS);

	counter = &gt->sriov.pf.vfs[vfid].monitor.guc.events[e];
	*counter += 1;
}

/*
 * Account a single "threshold exceeded" report against VF @vfid.
 *
 * @threshold is the threshold key as carried in the GuC message. It is
 * translated to a threshold index; if that translation fails, the event is
 * only logged and -ENOTCONN is returned. Otherwise the matching per-VF event
 * counter is bumped and 0 is returned.
 */
static int pf_handle_vf_threshold_event(struct xe_gt *gt, u32 vfid, u32 threshold)
{
	int index = xe_guc_klv_threshold_key_to_index(threshold);
	char who[8];

	xe_sriov_function_name(vfid, who, sizeof(who));

	/* a key we cannot map - perhaps a new KEY was added that we missed? */
	if (unlikely(index < 0)) {
		xe_gt_sriov_notice(gt, "unknown threshold key %#x reported for %s\n",
				   threshold, who);
		return -ENOTCONN;
	}

	xe_gt_sriov_dbg(gt, "%s exceeded threshold %u %s\n",
			who, xe_gt_sriov_pf_config_get_threshold(gt, vfid, index),
			xe_guc_klv_key_to_string(threshold));

	pf_update_event_counter(gt, vfid, index);
	return 0;
}

/**
 * xe_gt_sriov_pf_monitor_process_guc2pf - Handle adverse event notification from the GuC.
 * @gt: the &xe_gt
 * @msg: G2H event message
 * @len: length of the message
 *
 * Validates a GUC2PF_ADVERSE_EVENT message and, if it is well formed,
 * accounts the reported threshold event against the VF named in it.
 *
 * This function is intended for PF only.
 *
 * Return: 0 on success or a negative error code on failure:
 * -EPROTO if the device is not a PF or the message is too short,
 * -EPFNOSUPPORT if must-be-zero bits of the first dword are set,
 * -EINVAL if the VF identifier is larger than the total number of VFs,
 * or whatever error the threshold event handling reports.
 */
int xe_gt_sriov_pf_monitor_process_guc2pf(struct xe_gt *gt, const u32 *msg, u32 len)
{
	u32 vfid, threshold;

	xe_gt_assert(gt, len >= GUC_HXG_MSG_MIN_LEN);
	xe_gt_assert(gt, FIELD_GET(GUC_HXG_MSG_0_ORIGIN, msg[0]) == GUC_HXG_ORIGIN_GUC);
	xe_gt_assert(gt, FIELD_GET(GUC_HXG_MSG_0_TYPE, msg[0]) == GUC_HXG_TYPE_EVENT);
	xe_gt_assert(gt, FIELD_GET(GUC_HXG_EVENT_MSG_0_ACTION, msg[0]) ==
		     GUC_ACTION_GUC2PF_ADVERSE_EVENT);

	if (unlikely(!IS_SRIOV_PF(gt_to_xe(gt))))
		return -EPROTO;

	if (unlikely(FIELD_GET(GUC2PF_ADVERSE_EVENT_EVENT_MSG_0_MBZ, msg[0])))
		return -EPFNOSUPPORT;

	/* msg[1] and msg[2] must not be touched before the length is known */
	if (unlikely(len < GUC2PF_ADVERSE_EVENT_EVENT_MSG_LEN))
		return -EPROTO;

	vfid = FIELD_GET(GUC2PF_ADVERSE_EVENT_EVENT_MSG_1_VFID, msg[1]);
	if (unlikely(vfid > xe_gt_sriov_pf_get_totalvfs(gt)))
		return -EINVAL;

	threshold = FIELD_GET(GUC2PF_ADVERSE_EVENT_EVENT_MSG_2_THRESHOLD, msg[2]);

	return pf_handle_vf_threshold_event(gt, vfid, threshold);
}

/**
 * xe_gt_sriov_pf_monitor_print_events - Print adverse events counters.
 * @gt: the &xe_gt to print events from
 * @p: the &drm_printer
 *
 * Emit one line per VF (VF1 up to the total number of VFs) with the
 * adverse event counters of that VF.
 * VFs without any recorded event are left out, unless
 * CONFIG_DRM_XE_DEBUG_SRIOV is enabled.
 *
 * This function can only be called on PF.
 */
void xe_gt_sriov_pf_monitor_print_events(struct xe_gt *gt, struct drm_printer *p)
{
	unsigned int total_vfs = xe_gt_sriov_pf_get_totalvfs(gt);
	const struct xe_gt_sriov_monitor *mon;
	unsigned int vfid;
	int e;

	xe_gt_assert(gt, IS_SRIOV_PF(gt_to_xe(gt)));

/*
 * Building blocks for the per-VF line: a "name:count " format chunk and the
 * matching pair of arguments. MAKE_XE_GUC_KLV_THRESHOLDS_SET() presumably
 * instantiates them once per threshold - see its definition to confirm.
 */
#define __format(...) "%s:%u "
#define __value(TAG, NAME, ...) , #NAME, mon->guc.events[MAKE_XE_GUC_KLV_THRESHOLD_INDEX(TAG)]

	for (vfid = 1; vfid <= total_vfs; vfid++) {
		mon = &gt->sriov.pf.vfs[vfid].monitor;

		/* find the first threshold with a non-zero counter, if any */
		e = 0;
		while (e < XE_GUC_KLV_NUM_THRESHOLDS && !mon->guc.events[e])
			e++;

		/* skip empty unless in debug mode */
		if (e >= XE_GUC_KLV_NUM_THRESHOLDS &&
		    !IS_ENABLED(CONFIG_DRM_XE_DEBUG_SRIOV))
			continue;

		drm_printf(p, "VF%u:\t" MAKE_XE_GUC_KLV_THRESHOLDS_SET(__format) "\n",
			   vfid MAKE_XE_GUC_KLV_THRESHOLDS_SET(__value));
	}

#undef __format
#undef __value
}
27 changes: 27 additions & 0 deletions drivers/gpu/drm/xe/xe_gt_sriov_pf_monitor.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
/* SPDX-License-Identifier: MIT */
/*
* Copyright © 2023-2024 Intel Corporation
*/

#ifndef _XE_GT_SRIOV_PF_MONITOR_H_
#define _XE_GT_SRIOV_PF_MONITOR_H_

#include <linux/errno.h>
#include <linux/types.h>

struct drm_printer;
struct xe_gt;

/*
 * PF-only helpers, defined (with their kernel-doc) in xe_gt_sriov_pf_monitor.c:
 * the first clears the adverse event counters of a VF, the second prints the
 * counters of all VFs to a &drm_printer.
 */
void xe_gt_sriov_pf_monitor_flr(struct xe_gt *gt, u32 vfid);
void xe_gt_sriov_pf_monitor_print_events(struct xe_gt *gt, struct drm_printer *p);

/*
 * Handler for GUC2PF_ADVERSE_EVENT messages, called from process_g2h_msg() in
 * xe_guc_ct.c. Without CONFIG_PCI_IOV a stub is provided instead, so the
 * symbol exists in both configurations; the stub fails every message with
 * -EPROTO.
 */
#ifdef CONFIG_PCI_IOV
int xe_gt_sriov_pf_monitor_process_guc2pf(struct xe_gt *gt, const u32 *msg, u32 len);
#else
static inline int xe_gt_sriov_pf_monitor_process_guc2pf(struct xe_gt *gt, const u32 *msg, u32 len)
{
/* no PF support built in, so this message is not expected */
return -EPROTO;
}
#endif

#endif
22 changes: 22 additions & 0 deletions drivers/gpu/drm/xe/xe_gt_sriov_pf_monitor_types.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
/* SPDX-License-Identifier: MIT */
/*
* Copyright © 2023-2024 Intel Corporation
*/

#ifndef _XE_GT_SRIOV_PF_MONITOR_TYPES_H_
#define _XE_GT_SRIOV_PF_MONITOR_TYPES_H_

#include "xe_guc_klv_thresholds_set_types.h"

/**
 * struct xe_gt_sriov_monitor - GT level per-VF monitoring data.
 *
 * One instance is embedded in the per-VF &struct xe_gt_sriov_metadata.
 */
struct xe_gt_sriov_monitor {
/** @guc: monitoring data related to the GuC. */
struct {
/**
 * @guc.events: number of adverse events reported by the GuC,
 * one counter per threshold, indexed by the threshold index
 * (0 .. XE_GUC_KLV_NUM_THRESHOLDS - 1).
 */
unsigned int events[XE_GUC_KLV_NUM_THRESHOLDS];
} guc;
};

#endif
5 changes: 5 additions & 0 deletions drivers/gpu/drm/xe/xe_gt_sriov_pf_types.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
#include <linux/types.h>

#include "xe_gt_sriov_pf_config_types.h"
#include "xe_gt_sriov_pf_monitor_types.h"
#include "xe_gt_sriov_pf_policy_types.h"
#include "xe_gt_sriov_pf_service_types.h"

Expand All @@ -18,6 +19,10 @@
struct xe_gt_sriov_metadata {
/** @config: per-VF provisioning data. */
struct xe_gt_sriov_config config;

/** @monitor: per-VF monitoring data (adverse event counters reported by the GuC). */
struct xe_gt_sriov_monitor monitor;

/** @version: negotiated VF/PF ABI version */
struct xe_gt_sriov_pf_service_version version;
};
Expand Down
4 changes: 4 additions & 0 deletions drivers/gpu/drm/xe/xe_guc_ct.c
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
#include "xe_gt_pagefault.h"
#include "xe_gt_printk.h"
#include "xe_gt_sriov_pf_control.h"
#include "xe_gt_sriov_pf_monitor.h"
#include "xe_gt_tlb_invalidation.h"
#include "xe_guc.h"
#include "xe_guc_relay.h"
Expand Down Expand Up @@ -1071,6 +1072,9 @@ static int process_g2h_msg(struct xe_guc_ct *ct, u32 *msg, u32 len)
case GUC_ACTION_GUC2PF_VF_STATE_NOTIFY:
ret = xe_gt_sriov_pf_control_process_guc2pf(gt, hxg, hxg_len);
break;
case GUC_ACTION_GUC2PF_ADVERSE_EVENT:
ret = xe_gt_sriov_pf_monitor_process_guc2pf(gt, hxg, hxg_len);
break;
default:
xe_gt_err(gt, "unexpected G2H action 0x%04x\n", action);
}
Expand Down

0 comments on commit 335d62a

Please sign in to comment.