drm/xe: Force wedged state and block GT reset upon any GPU hang
In many validation scenarios when debugging GPU hangs, it is useful
to preserve the GT state from the moment the timeout occurred.

This patch introduces a module parameter that can be used in
situations like this.

If the xe.wedged_mode module parameter is set to 2, Xe will be
declared wedged on every single execution timeout (a.k.a. GPU hang),
right after the devcoredump snapshot capture, without attempting any
kind of GT reset and while blocking any further execution.
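
A rough usage sketch for such a validation run (assuming the parameter
is exposed as xe.wedged_mode and that the captured snapshot shows up
under /sys/class/devcoredump, as is usual for dev_coredump users):

    modprobe xe wedged_mode=2
    # reproduce the hang, then save the snapshot for analysis
    cat /sys/class/devcoredump/devcd*/data > xe_hang_snapshot.bin

After that the device stays wedged and only an unbind + bind (or a
module reload) brings it back.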

v2: Really block gt_reset from guc side. (Lucas)
    s/wedged/busted (Lucas)

v3: - s/busted/wedged
    - Really use global_flags (Dafna)
    - More robust timeout handling when wedging it.

v4: A really robust clean exit done by Matt Brost.
    No more kernel warns on unbind.

v5: Simplify error message (Lucas)

Cc: Matthew Brost <[email protected]>
Cc: Dafna Hirschfeld <[email protected]>
Cc: Lucas De Marchi <[email protected]>
Cc: Alan Previn <[email protected]>
Cc: Himanshu Somaiya <[email protected]>
Reviewed-by: Lucas De Marchi <[email protected]>
Link: https://patchwork.freedesktop.org/patch/msgid/[email protected]
Signed-off-by: Rodrigo Vivi <[email protected]>
rodrigovivi committed Apr 24, 2024
1 parent 6928186 · commit 8ed9aaa
Showing 8 changed files with 129 additions and 31 deletions.
29 changes: 29 additions & 0 deletions drivers/gpu/drm/xe/xe_device.c
@@ -764,3 +764,32 @@ u64 xe_device_uncanonicalize_addr(struct xe_device *xe, u64 address)
{
return address & GENMASK_ULL(xe->info.va_bits - 1, 0);
}

/**
* xe_device_declare_wedged - Declare device wedged
* @xe: xe device instance
*
* This is a final state that can only be cleared with a re-probe of the
* device (unbind + bind).
* In this state every IOCTL will be blocked so the GT cannot be used.
* In general it will be called upon any critical error such as a GT reset
* failure or a GuC loading failure.
* If the xe.wedged_mode module parameter is set to 2, this function will be
* called on every single execution timeout (a.k.a. GPU hang) right after the
* devcoredump snapshot capture. In this mode, GT reset won't be attempted, so
* the state of the issue is preserved for further debugging.
*/
void xe_device_declare_wedged(struct xe_device *xe)
{
if (xe_modparam.wedged_mode == 0)
return;

if (!atomic_xchg(&xe->wedged, 1)) {
xe->needs_flr_on_fini = true;
drm_err(&xe->drm,
"CRITICAL: Xe has declared device %s as wedged.\n"
"IOCTLs and executions are blocked. Only a rebind may clear the failure\n"
"Please file a _new_ bug report at https://gitlab.freedesktop.org/drm/xe/kernel/issues/new\n",
dev_name(xe->drm.dev));
}
}
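
For context, here is a minimal sketch of how the wedged state is meant to
be consumed elsewhere in the driver. The function names below and the
-ECANCELED return value are illustrative assumptions, not code from this
patch:

static int xe_gt_reset_example(struct xe_gt *gt)
{
	/* Placeholder for the real reset sequence */
	int err = do_example_gt_reset(gt);

	/* A failed reset is a critical error: wedge the whole device */
	if (err)
		xe_device_declare_wedged(gt_to_xe(gt));

	return err;
}

static long xe_ioctl_example(struct xe_device *xe)
{
	/* Once wedged, refuse every IOCTL so the GT state stays untouched */
	if (xe_device_wedged(xe))
		return -ECANCELED;

	return 0; /* dispatch to the real handler here */
}
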
15 changes: 1 addition & 14 deletions drivers/gpu/drm/xe/xe_device.h
@@ -172,19 +172,6 @@ static inline bool xe_device_wedged(struct xe_device *xe)
return atomic_read(&xe->wedged);
}

static inline void xe_device_declare_wedged(struct xe_device *xe)
{
if (!atomic_xchg(&xe->wedged, 1)) {
xe->needs_flr_on_fini = true;
drm_err(&xe->drm,
"CRITICAL: Xe has declared device %s as wedged.\n"
"IOCTLs and executions are blocked until device is probed again with unbind and bind operations:\n"
"echo '%s' > /sys/bus/pci/drivers/xe/unbind\n"
"echo '%s' > /sys/bus/pci/drivers/xe/bind\n"
"Please file a _new_ bug report at https://gitlab.freedesktop.org/drm/xe/kernel/issues/new\n",
dev_name(xe->drm.dev), dev_name(xe->drm.dev),
dev_name(xe->drm.dev));
}
}
void xe_device_declare_wedged(struct xe_device *xe);

#endif
9 changes: 9 additions & 0 deletions drivers/gpu/drm/xe/xe_exec_queue.h
@@ -26,6 +26,15 @@ void xe_exec_queue_fini(struct xe_exec_queue *q);
void xe_exec_queue_destroy(struct kref *ref);
void xe_exec_queue_assign_name(struct xe_exec_queue *q, u32 instance);

static inline struct xe_exec_queue *
xe_exec_queue_get_unless_zero(struct xe_exec_queue *q)
{
if (kref_get_unless_zero(&q->refcount))
return q;

return NULL;
}

struct xe_exec_queue *xe_exec_queue_lookup(struct xe_file *xef, u32 id);

static inline struct xe_exec_queue *xe_exec_queue_get(struct xe_exec_queue *q)
2 changes: 1 addition & 1 deletion drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c
@@ -245,7 +245,7 @@ int xe_gt_tlb_invalidation_ggtt(struct xe_gt *gt)
return seqno;

xe_gt_tlb_invalidation_wait(gt, seqno);
} else if (xe_device_uc_enabled(xe)) {
} else if (xe_device_uc_enabled(xe) && !xe_device_wedged(xe)) {
xe_gt_WARN_ON(gt, xe_force_wake_get(gt_to_fw(gt), XE_FW_GT));
if (xe->info.platform == XE_PVC || GRAPHICS_VER(xe) >= 20) {
xe_mmio_write32(gt, PVC_GUC_TLB_INV_DESC1,
9 changes: 8 additions & 1 deletion drivers/gpu/drm/xe/xe_guc_ads.c
@@ -20,6 +20,7 @@
#include "xe_lrc.h"
#include "xe_map.h"
#include "xe_mmio.h"
#include "xe_module.h"
#include "xe_platform_types.h"
#include "xe_wa.h"

@@ -440,11 +441,17 @@ int xe_guc_ads_init_post_hwconfig(struct xe_guc_ads *ads)

static void guc_policies_init(struct xe_guc_ads *ads)
{
u32 global_flags = 0;

ads_blob_write(ads, policies.dpc_promote_time,
GLOBAL_POLICY_DEFAULT_DPC_PROMOTE_TIME_US);
ads_blob_write(ads, policies.max_num_work_items,
GLOBAL_POLICY_MAX_NUM_WI);
ads_blob_write(ads, policies.global_flags, 0);

if (xe_modparam.wedged_mode == 2)
global_flags |= GLOBAL_POLICY_DISABLE_ENGINE_RESET;

ads_blob_write(ads, policies.global_flags, global_flags);
ads_blob_write(ads, policies.is_valid, 1);
}

90 changes: 75 additions & 15 deletions drivers/gpu/drm/xe/xe_guc_submit.c
@@ -35,6 +35,7 @@
#include "xe_macros.h"
#include "xe_map.h"
#include "xe_mocs.h"
#include "xe_module.h"
#include "xe_ring_ops_types.h"
#include "xe_sched_job.h"
#include "xe_trace.h"
@@ -59,6 +60,7 @@ exec_queue_to_guc(struct xe_exec_queue *q)
#define ENGINE_STATE_SUSPENDED (1 << 5)
#define EXEC_QUEUE_STATE_RESET (1 << 6)
#define ENGINE_STATE_KILLED (1 << 7)
#define EXEC_QUEUE_STATE_WEDGED (1 << 8)

static bool exec_queue_registered(struct xe_exec_queue *q)
{
@@ -175,9 +177,20 @@ static void set_exec_queue_killed(struct xe_exec_queue *q)
atomic_or(ENGINE_STATE_KILLED, &q->guc->state);
}

static bool exec_queue_killed_or_banned(struct xe_exec_queue *q)
static bool exec_queue_wedged(struct xe_exec_queue *q)
{
return exec_queue_killed(q) || exec_queue_banned(q);
return atomic_read(&q->guc->state) & EXEC_QUEUE_STATE_WEDGED;
}

static void set_exec_queue_wedged(struct xe_exec_queue *q)
{
atomic_or(EXEC_QUEUE_STATE_WEDGED, &q->guc->state);
}

static bool exec_queue_killed_or_banned_or_wedged(struct xe_exec_queue *q)
{
return exec_queue_banned(q) || (atomic_read(&q->guc->state) &
(EXEC_QUEUE_STATE_WEDGED | ENGINE_STATE_KILLED));
}

#ifdef CONFIG_PROVE_LOCKING
@@ -240,6 +253,17 @@ static void guc_submit_fini(struct drm_device *drm, void *arg)
free_submit_wq(guc);
}

static void guc_submit_wedged_fini(struct drm_device *drm, void *arg)
{
struct xe_guc *guc = arg;
struct xe_exec_queue *q;
unsigned long index;

xa_for_each(&guc->submission_state.exec_queue_lookup, index, q)
if (exec_queue_wedged(q))
xe_exec_queue_put(q);
}

static const struct xe_exec_queue_ops guc_exec_queue_ops;

static void primelockdep(struct xe_guc *guc)
@@ -708,7 +732,7 @@ guc_exec_queue_run_job(struct drm_sched_job *drm_job)

trace_xe_sched_job_run(job);

if (!exec_queue_killed_or_banned(q) && !xe_sched_job_is_error(job)) {
if (!exec_queue_killed_or_banned_or_wedged(q) && !xe_sched_job_is_error(job)) {
if (!exec_queue_registered(q))
register_engine(q);
if (!lr) /* LR jobs are emitted in the exec IOCTL */
@@ -844,6 +868,28 @@ static void xe_guc_exec_queue_trigger_cleanup(struct xe_exec_queue *q)
xe_sched_tdr_queue_imm(&q->guc->sched);
}

static void guc_submit_wedged(struct xe_guc *guc)
{
struct xe_exec_queue *q;
unsigned long index;
int err;

xe_device_declare_wedged(guc_to_xe(guc));
xe_guc_submit_reset_prepare(guc);
xe_guc_ct_stop(&guc->ct);

err = drmm_add_action_or_reset(&guc_to_xe(guc)->drm,
guc_submit_wedged_fini, guc);
if (err)
return;

mutex_lock(&guc->submission_state.lock);
xa_for_each(&guc->submission_state.exec_queue_lookup, index, q)
if (xe_exec_queue_get_unless_zero(q))
set_exec_queue_wedged(q);
mutex_unlock(&guc->submission_state.lock);
}

static void xe_guc_exec_queue_lr_cleanup(struct work_struct *w)
{
struct xe_guc_exec_queue *ge =
@@ -852,10 +898,16 @@ static void xe_guc_exec_queue_lr_cleanup(struct work_struct *w)
struct xe_guc *guc = exec_queue_to_guc(q);
struct xe_device *xe = guc_to_xe(guc);
struct xe_gpu_scheduler *sched = &ge->sched;
bool wedged = xe_device_wedged(xe);

xe_assert(xe, xe_exec_queue_is_lr(q));
trace_xe_exec_queue_lr_cleanup(q);

if (!wedged && xe_modparam.wedged_mode == 2) {
guc_submit_wedged(exec_queue_to_guc(q));
wedged = true;
}

/* Kill the run_job / process_msg entry points */
xe_sched_submission_stop(sched);

@@ -870,7 +922,7 @@ static void xe_guc_exec_queue_lr_cleanup(struct work_struct *w)
* xe_guc_deregister_done_handler() which treats it as an unexpected
* state.
*/
if (exec_queue_registered(q) && !exec_queue_destroyed(q)) {
if (!wedged && exec_queue_registered(q) && !exec_queue_destroyed(q)) {
struct xe_guc *guc = exec_queue_to_guc(q);
int ret;

@@ -905,6 +957,7 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job)
struct xe_device *xe = guc_to_xe(exec_queue_to_guc(q));
int err = -ETIME;
int i = 0;
bool wedged = xe_device_wedged(xe);

/*
* TDR has fired before free job worker. Common if exec queue
@@ -928,15 +981,20 @@

trace_xe_sched_job_timedout(job);

if (!wedged && xe_modparam.wedged_mode == 2) {
guc_submit_wedged(exec_queue_to_guc(q));
wedged = true;
}

/* Kill the run_job entry point */
xe_sched_submission_stop(sched);

/*
* Kernel jobs should never fail, nor should VM jobs; if they do,
* something has gone wrong and the GT needs a reset
*/
if (q->flags & EXEC_QUEUE_FLAG_KERNEL ||
(q->flags & EXEC_QUEUE_FLAG_VM && !exec_queue_killed(q))) {
if (!wedged && (q->flags & EXEC_QUEUE_FLAG_KERNEL ||
(q->flags & EXEC_QUEUE_FLAG_VM && !exec_queue_killed(q)))) {
if (!xe_sched_invalidate_job(job, 2)) {
xe_sched_add_pending_job(sched, job);
xe_sched_submission_start(sched);
@@ -946,7 +1004,7 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job)
}

/* Engine state now stable, disable scheduling if needed */
if (exec_queue_registered(q)) {
if (!wedged && exec_queue_registered(q)) {
struct xe_guc *guc = exec_queue_to_guc(q);
int ret;

@@ -989,6 +1047,7 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job)
*/
xe_sched_add_pending_job(sched, job);
xe_sched_submission_start(sched);

xe_guc_exec_queue_trigger_cleanup(q);

/* Mark all outstanding jobs as bad, thus completing them */
@@ -1028,7 +1087,7 @@ static void guc_exec_queue_fini_async(struct xe_exec_queue *q)
INIT_WORK(&q->guc->fini_async, __guc_exec_queue_fini_async);

/* We must block on kernel engines so slabs are empty on driver unload */
if (q->flags & EXEC_QUEUE_FLAG_PERMANENT)
if (q->flags & EXEC_QUEUE_FLAG_PERMANENT || exec_queue_wedged(q))
__guc_exec_queue_fini_async(&q->guc->fini_async);
else
queue_work(system_wq, &q->guc->fini_async);
@@ -1063,7 +1122,7 @@ static void __guc_exec_queue_process_msg_cleanup(struct xe_sched_msg *msg)

static bool guc_exec_queue_allowed_to_change_state(struct xe_exec_queue *q)
{
return !exec_queue_killed_or_banned(q) && exec_queue_registered(q);
return !exec_queue_killed_or_banned_or_wedged(q) && exec_queue_registered(q);
}

static void __guc_exec_queue_process_msg_set_sched_props(struct xe_sched_msg *msg)
@@ -1274,7 +1333,7 @@ static void guc_exec_queue_fini(struct xe_exec_queue *q)
{
struct xe_sched_msg *msg = q->guc->static_msgs + STATIC_MSG_CLEANUP;

if (!(q->flags & EXEC_QUEUE_FLAG_PERMANENT))
if (!(q->flags & EXEC_QUEUE_FLAG_PERMANENT) && !exec_queue_wedged(q))
guc_exec_queue_add_msg(q, msg, CLEANUP);
else
__guc_exec_queue_fini(exec_queue_to_guc(q), q);
@@ -1285,7 +1344,8 @@ static int guc_exec_queue_set_priority(struct xe_exec_queue *q,
{
struct xe_sched_msg *msg;

if (q->sched_props.priority == priority || exec_queue_killed_or_banned(q))
if (q->sched_props.priority == priority ||
exec_queue_killed_or_banned_or_wedged(q))
return 0;

msg = kmalloc(sizeof(*msg), GFP_KERNEL);
@@ -1303,7 +1363,7 @@ static int guc_exec_queue_set_timeslice(struct xe_exec_queue *q, u32 timeslice_us)
struct xe_sched_msg *msg;

if (q->sched_props.timeslice_us == timeslice_us ||
exec_queue_killed_or_banned(q))
exec_queue_killed_or_banned_or_wedged(q))
return 0;

msg = kmalloc(sizeof(*msg), GFP_KERNEL);
@@ -1322,7 +1382,7 @@ static int guc_exec_queue_set_preempt_timeout(struct xe_exec_queue *q,
struct xe_sched_msg *msg;

if (q->sched_props.preempt_timeout_us == preempt_timeout_us ||
exec_queue_killed_or_banned(q))
exec_queue_killed_or_banned_or_wedged(q))
return 0;

msg = kmalloc(sizeof(*msg), GFP_KERNEL);
@@ -1339,7 +1399,7 @@ static int guc_exec_queue_suspend(struct xe_exec_queue *q)
{
struct xe_sched_msg *msg = q->guc->static_msgs + STATIC_MSG_SUSPEND;

if (exec_queue_killed_or_banned(q) || q->guc->suspend_pending)
if (exec_queue_killed_or_banned_or_wedged(q) || q->guc->suspend_pending)
return -EINVAL;

q->guc->suspend_pending = true;
@@ -1485,7 +1545,7 @@ static void guc_exec_queue_start(struct xe_exec_queue *q)
{
struct xe_gpu_scheduler *sched = &q->guc->sched;

if (!exec_queue_killed_or_banned(q)) {
if (!exec_queue_killed_or_banned_or_wedged(q)) {
int i;

trace_xe_exec_queue_resubmit(q);
5 changes: 5 additions & 0 deletions drivers/gpu/drm/xe/xe_module.c
@@ -17,6 +17,7 @@ struct xe_modparam xe_modparam = {
.enable_display = true,
.guc_log_level = 5,
.force_probe = CONFIG_DRM_XE_FORCE_PROBE,
.wedged_mode = 1,
/* the rest are 0 by default */
};

@@ -55,6 +56,10 @@ MODULE_PARM_DESC(max_vfs,
"(0 = no VFs [default]; N = allow up to N VFs)");
#endif

module_param_named_unsafe(wedged_mode, xe_modparam.wedged_mode, int, 0600);
MODULE_PARM_DESC(wedged_mode,
"Module's default policy for the wedged mode - 0=never, 1=upon-critical-errors[default], 2=upon-any-hang");

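Since wedged_mode is registered with 0600 permissions, it should also be
switchable at runtime through sysfs (an assumption based on standard
module_param behaviour), e.g.:

    echo 2 > /sys/module/xe/parameters/wedged_mode
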
struct init_funcs {
int (*init)(void);
void (*exit)(void);
1 change: 1 addition & 0 deletions drivers/gpu/drm/xe/xe_module.h
@@ -21,6 +21,7 @@ struct xe_modparam {
#ifdef CONFIG_PCI_IOV
unsigned int max_vfs;
#endif
int wedged_mode;
};

extern struct xe_modparam xe_modparam;
