Skip to content

Commit

Permalink
Merge tag 'amd-drm-next-6.12-2024-09-06' of https://gitlab.freedesktop.org/agd5f/linux into drm-next

Browse files Browse the repository at this point in the history

amd-drm-next-6.12-2024-09-06:

amdgpu:
- IPS updates
- Post divider fix
- DML2 updates
- Misc static checker fixes
- DCN 3.5 fixes
- Replay fixes
- DMCUB updates
- SWSMU fixes
- DP MST fixes
- Add debug flag for per queue resets
- devcoredump updates
- SR-IOV fixes
- MES fixes
- Always allocate cleared VRAM for GEM
- Pipe reset for GC 9.4.3
- ODM policy fixes
- Per queue reset support for GC 10
- Per queue reset support for GC 11
- Per queue reset support for GC 12
- Display flickering fixes
- MPO fixes
- Display sharpening updates

amdkfd:
- SVM fix for IH for APUs

Signed-off-by: Dave Airlie <[email protected]>

From: Alex Deucher <[email protected]>
Link: https://patchwork.freedesktop.org/patch/msgid/[email protected]
  • Loading branch information
airlied committed Sep 11, 2024
2 parents 32bd3eb + 7a19955 commit 741d73f
Show file tree
Hide file tree
Showing 75 changed files with 6,024 additions and 1,359 deletions.
2 changes: 1 addition & 1 deletion Documentation/gpu/amdgpu/driver-core.rst
Original file line number Diff line number Diff line change
Expand Up @@ -179,4 +179,4 @@ IP Blocks
:doc: IP Blocks

.. kernel-doc:: drivers/gpu/drm/amd/include/amd_shared.h
:identifiers: amd_ip_block_type amd_ip_funcs
:identifiers: amd_ip_block_type amd_ip_funcs DC_DEBUG_MASK
1 change: 1 addition & 0 deletions drivers/gpu/drm/amd/amdgpu/amdgpu.h
Original file line number Diff line number Diff line change
Expand Up @@ -1164,6 +1164,7 @@ struct amdgpu_device {
bool debug_disable_soft_recovery;
bool debug_use_vram_fw_buf;
bool debug_enable_ras_aca;
bool debug_exp_resets;

bool enforce_isolation[MAX_XCP];
/* Added this mutex for cleaner shader isolation between GFX and compute processes */
Expand Down
4 changes: 4 additions & 0 deletions drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
Original file line number Diff line number Diff line change
Expand Up @@ -1151,6 +1151,10 @@ uint64_t kgd_gfx_v9_hqd_get_pq_addr(struct amdgpu_device *adev,
uint32_t low, high;
uint64_t queue_addr = 0;

if (!adev->debug_exp_resets &&
!adev->gfx.num_gfx_rings)
return 0;

kgd_gfx_v9_acquire_queue(adev, pipe_id, queue_id, inst);
amdgpu_gfx_rlc_enter_safe_mode(adev, inst);

Expand Down
20 changes: 11 additions & 9 deletions drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,8 @@
#include "atom.h"

#ifndef CONFIG_DEV_COREDUMP
void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost,
struct amdgpu_reset_context *reset_context)
void amdgpu_coredump(struct amdgpu_device *adev, bool skip_vram_check,
bool vram_lost, struct amdgpu_job *job)
{
}
#else
Expand Down Expand Up @@ -315,7 +315,9 @@ amdgpu_devcoredump_read(char *buffer, loff_t offset, size_t count,
}
}

if (coredump->reset_vram_lost)
if (coredump->skip_vram_check)
drm_printf(&p, "VRAM lost check is skipped!\n");
else if (coredump->reset_vram_lost)
drm_printf(&p, "VRAM is lost due to GPU reset!\n");

return count - iter.remain;
Expand All @@ -326,12 +328,11 @@ static void amdgpu_devcoredump_free(void *data)
kfree(data);
}

void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost,
struct amdgpu_reset_context *reset_context)
void amdgpu_coredump(struct amdgpu_device *adev, bool skip_vram_check,
bool vram_lost, struct amdgpu_job *job)
{
struct amdgpu_coredump_info *coredump;
struct drm_device *dev = adev_to_drm(adev);
struct amdgpu_job *job = reset_context->job;
struct amdgpu_coredump_info *coredump;
struct drm_sched_job *s_job;

coredump = kzalloc(sizeof(*coredump), GFP_NOWAIT);
Expand All @@ -341,11 +342,12 @@ void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost,
return;
}

coredump->skip_vram_check = skip_vram_check;
coredump->reset_vram_lost = vram_lost;

if (reset_context->job && reset_context->job->vm) {
if (job && job->vm) {
struct amdgpu_vm *vm = job->vm;
struct amdgpu_task_info *ti;
struct amdgpu_vm *vm = reset_context->job->vm;

ti = amdgpu_vm_get_task_info_vm(vm);
if (ti) {
Expand Down
7 changes: 3 additions & 4 deletions drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@
#define __AMDGPU_DEV_COREDUMP_H__

#include "amdgpu.h"
#include "amdgpu_reset.h"

#ifdef CONFIG_DEV_COREDUMP

Expand All @@ -36,12 +35,12 @@ struct amdgpu_coredump_info {
struct amdgpu_device *adev;
struct amdgpu_task_info reset_task_info;
struct timespec64 reset_time;
bool skip_vram_check;
bool reset_vram_lost;
struct amdgpu_ring *ring;
};
#endif

void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost,
struct amdgpu_reset_context *reset_context);

void amdgpu_coredump(struct amdgpu_device *adev, bool skip_vram_check,
bool vram_lost, struct amdgpu_job *job);
#endif
8 changes: 4 additions & 4 deletions drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
Original file line number Diff line number Diff line change
Expand Up @@ -4531,6 +4531,9 @@ void amdgpu_device_fini_hw(struct amdgpu_device *adev)
{
dev_info(adev->dev, "amdgpu: finishing device.\n");
flush_delayed_work(&adev->delayed_init_work);

if (adev->mman.initialized)
drain_workqueue(adev->mman.bdev.wq);
adev->shutdown = true;

/* make sure IB test finished before entering exclusive mode
Expand All @@ -4551,9 +4554,6 @@ void amdgpu_device_fini_hw(struct amdgpu_device *adev)
}
amdgpu_fence_driver_hw_fini(adev);

if (adev->mman.initialized)
drain_workqueue(adev->mman.bdev.wq);

if (adev->pm.sysfs_initialized)
amdgpu_pm_sysfs_fini(adev);
if (adev->ucode_sysfs_en)
Expand Down Expand Up @@ -5489,7 +5489,7 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle,
vram_lost = amdgpu_device_check_vram_lost(tmp_adev);

if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags))
amdgpu_coredump(tmp_adev, vram_lost, reset_context);
amdgpu_coredump(tmp_adev, false, vram_lost, reset_context->job);

if (vram_lost) {
DRM_INFO("VRAM is lost due to GPU reset!\n");
Expand Down
6 changes: 6 additions & 0 deletions drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,7 @@ enum AMDGPU_DEBUG_MASK {
AMDGPU_DEBUG_DISABLE_GPU_SOFT_RECOVERY = BIT(2),
AMDGPU_DEBUG_USE_VRAM_FW_BUF = BIT(3),
AMDGPU_DEBUG_ENABLE_RAS_ACA = BIT(4),
AMDGPU_DEBUG_ENABLE_EXP_RESETS = BIT(5),
};

unsigned int amdgpu_vram_limit = UINT_MAX;
Expand Down Expand Up @@ -2199,6 +2200,11 @@ static void amdgpu_init_debug_options(struct amdgpu_device *adev)
pr_info("debug: enable RAS ACA\n");
adev->debug_enable_ras_aca = true;
}

if (amdgpu_debug_mask & AMDGPU_DEBUG_ENABLE_EXP_RESETS) {
pr_info("debug: enable experimental reset features\n");
adev->debug_exp_resets = true;
}
}

static unsigned long amdgpu_fix_asic_type(struct pci_dev *pdev, unsigned long flags)
Expand Down
3 changes: 3 additions & 0 deletions drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
Original file line number Diff line number Diff line change
Expand Up @@ -348,6 +348,9 @@ int amdgpu_gem_create_ioctl(struct drm_device *dev, void *data,
return -EINVAL;
}

/* always clear VRAM */
flags |= AMDGPU_GEM_CREATE_VRAM_CLEARED;

/* create a gem object to contain this object in */
if (args->in.domains & (AMDGPU_GEM_DOMAIN_GDS |
AMDGPU_GEM_DOMAIN_GWS | AMDGPU_GEM_DOMAIN_OA)) {
Expand Down
4 changes: 2 additions & 2 deletions drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
Original file line number Diff line number Diff line change
Expand Up @@ -660,7 +660,7 @@ int amdgpu_gfx_enable_kcq(struct amdgpu_device *adev, int xcc_id)
uint64_t queue_mask = 0;
int r, i, j;

if (adev->enable_mes)
if (adev->mes.enable_legacy_queue_map)
return amdgpu_gfx_mes_enable_kcq(adev, xcc_id);

if (!kiq->pmf || !kiq->pmf->kiq_map_queues || !kiq->pmf->kiq_set_resources)
Expand Down Expand Up @@ -722,7 +722,7 @@ int amdgpu_gfx_enable_kgq(struct amdgpu_device *adev, int xcc_id)

amdgpu_device_flush_hdp(adev, NULL);

if (adev->enable_mes) {
if (adev->mes.enable_legacy_queue_map) {
for (i = 0; i < adev->gfx.num_gfx_rings; i++) {
j = i + xcc_id * adev->gfx.num_gfx_rings;
r = amdgpu_mes_map_legacy_queue(adev,
Expand Down
67 changes: 66 additions & 1 deletion drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,60 @@
#include "amdgpu.h"
#include "amdgpu_trace.h"
#include "amdgpu_reset.h"
#include "amdgpu_dev_coredump.h"
#include "amdgpu_xgmi.h"

static void amdgpu_job_do_core_dump(struct amdgpu_device *adev,
struct amdgpu_job *job)
{
int i;

dev_info(adev->dev, "Dumping IP State\n");
for (i = 0; i < adev->num_ip_blocks; i++)
if (adev->ip_blocks[i].version->funcs->dump_ip_state)
adev->ip_blocks[i].version->funcs
->dump_ip_state((void *)adev);
dev_info(adev->dev, "Dumping IP State Completed\n");

amdgpu_coredump(adev, true, false, job);
}

/*
 * Trigger a devcoredump for every device affected by a job timeout.
 * For XGMI hives (bare metal only) this covers all nodes in the hive,
 * with the triggering device dumped first; otherwise just @adev.
 */
static void amdgpu_job_core_dump(struct amdgpu_device *adev,
				 struct amdgpu_job *job)
{
	struct list_head device_list, *device_list_handle = NULL;
	struct amdgpu_device *tmp_adev = NULL;
	struct amdgpu_hive_info *hive = NULL;

	/* SR-IOV VFs do not participate in hive-wide dumps */
	if (!amdgpu_sriov_vf(adev))
		hive = amdgpu_get_xgmi_hive(adev);
	if (hive)
		mutex_lock(&hive->hive_lock);
	/*
	 * Reuse the logic in amdgpu_device_gpu_recover() to build list of
	 * devices for core dump
	 */
	INIT_LIST_HEAD(&device_list);
	if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1) && hive) {
		list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head)
			list_add_tail(&tmp_adev->reset_list, &device_list);
		/* dump the device that hit the timeout first */
		if (!list_is_first(&adev->reset_list, &device_list))
			list_rotate_to_front(&adev->reset_list, &device_list);
		device_list_handle = &device_list;
	} else {
		list_add_tail(&adev->reset_list, &device_list);
		device_list_handle = &device_list;
	}

	/* Do the coredump for each device */
	list_for_each_entry(tmp_adev, device_list_handle, reset_list)
		amdgpu_job_do_core_dump(tmp_adev, job);

	if (hive) {
		mutex_unlock(&hive->hive_lock);
		amdgpu_put_xgmi_hive(hive);
	}
}

static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
{
Expand All @@ -48,9 +102,14 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
return DRM_GPU_SCHED_STAT_ENODEV;
}


adev->job_hang = true;

/*
* Do the coredump immediately after a job timeout to get a very
* close dump/snapshot/representation of GPU's current error status
*/
amdgpu_job_core_dump(adev, job);

if (amdgpu_gpu_recovery &&
amdgpu_ring_soft_recovery(ring, job->vmid, s_job->s_fence->parent)) {
dev_err(adev->dev, "ring %s timeout, but soft recovered\n",
Expand Down Expand Up @@ -101,6 +160,12 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
reset_context.src = AMDGPU_RESET_SRC_JOB;
clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);

/*
* To avoid an unnecessary extra coredump, as we have already
* got the very close representation of GPU's error status
*/
set_bit(AMDGPU_SKIP_COREDUMP, &reset_context.flags);

r = amdgpu_device_gpu_recover(ring->adev, job, &reset_context);
if (r)
dev_err(adev->dev, "GPU Recovery Failed: %d\n", r);
Expand Down
23 changes: 22 additions & 1 deletion drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
Original file line number Diff line number Diff line change
Expand Up @@ -826,6 +826,24 @@ int amdgpu_mes_reset_hw_queue(struct amdgpu_device *adev, int queue_id)
return 0;
}

int amdgpu_mes_reset_hw_queue_mmio(struct amdgpu_device *adev, int queue_type,
int me_id, int pipe_id, int queue_id, int vmid)
{
struct mes_reset_queue_input queue_input;
int r;

queue_input.use_mmio = true;
queue_input.me_id = me_id;
queue_input.pipe_id = pipe_id;
queue_input.queue_id = queue_id;
queue_input.vmid = vmid;
r = adev->mes.funcs->reset_hw_queue(&adev->mes, &queue_input);
if (r)
DRM_ERROR("failed to reset hardware queue by mmio, queue id = %d\n",
queue_id);
return r;
}

int amdgpu_mes_map_legacy_queue(struct amdgpu_device *adev,
struct amdgpu_ring *ring)
{
Expand Down Expand Up @@ -873,7 +891,8 @@ int amdgpu_mes_unmap_legacy_queue(struct amdgpu_device *adev,

int amdgpu_mes_reset_legacy_queue(struct amdgpu_device *adev,
struct amdgpu_ring *ring,
unsigned int vmid)
unsigned int vmid,
bool use_mmio)
{
struct mes_reset_legacy_queue_input queue_input;
int r;
Expand All @@ -882,11 +901,13 @@ int amdgpu_mes_reset_legacy_queue(struct amdgpu_device *adev,

queue_input.queue_type = ring->funcs->type;
queue_input.doorbell_offset = ring->doorbell_index;
queue_input.me_id = ring->me;
queue_input.pipe_id = ring->pipe;
queue_input.queue_id = ring->queue;
queue_input.mqd_addr = amdgpu_bo_gpu_offset(ring->mqd_obj);
queue_input.wptr_addr = ring->wptr_gpu_addr;
queue_input.vmid = vmid;
queue_input.use_mmio = use_mmio;

r = adev->mes.funcs->reset_legacy_queue(&adev->mes, &queue_input);
if (r)
Expand Down
15 changes: 14 additions & 1 deletion drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@ struct amdgpu_mes {

uint32_t sched_version;
uint32_t kiq_version;
bool enable_legacy_queue_map;

uint32_t total_max_queue;
uint32_t max_doorbell_slices;
Expand Down Expand Up @@ -251,6 +252,13 @@ struct mes_remove_queue_input {
/* Parameters for a MES hardware queue reset request. */
struct mes_reset_queue_input {
	uint32_t doorbell_offset;      /* doorbell of the queue to reset */
	uint64_t gang_context_addr;    /* GPU VA of the gang context */
	bool use_mmio;                 /* reset via MMIO instead of MES packet */
	uint32_t queue_type;           /* ring/queue type being reset */
	uint32_t me_id;                /* micro-engine index */
	uint32_t pipe_id;              /* pipe within the ME */
	uint32_t queue_id;             /* queue within the pipe */
	uint32_t xcc_id;               /* accelerator core (XCC) instance */
	uint32_t vmid;                 /* VMID owning the queue */
};

struct mes_map_legacy_queue_input {
Expand Down Expand Up @@ -287,6 +295,8 @@ struct mes_resume_gang_input {
struct mes_reset_legacy_queue_input {
uint32_t queue_type;
uint32_t doorbell_offset;
bool use_mmio;
uint32_t me_id;
uint32_t pipe_id;
uint32_t queue_id;
uint64_t mqd_addr;
Expand Down Expand Up @@ -396,6 +406,8 @@ int amdgpu_mes_add_hw_queue(struct amdgpu_device *adev, int gang_id,
int *queue_id);
int amdgpu_mes_remove_hw_queue(struct amdgpu_device *adev, int queue_id);
int amdgpu_mes_reset_hw_queue(struct amdgpu_device *adev, int queue_id);
int amdgpu_mes_reset_hw_queue_mmio(struct amdgpu_device *adev, int queue_type,
int me_id, int pipe_id, int queue_id, int vmid);

int amdgpu_mes_map_legacy_queue(struct amdgpu_device *adev,
struct amdgpu_ring *ring);
Expand All @@ -405,7 +417,8 @@ int amdgpu_mes_unmap_legacy_queue(struct amdgpu_device *adev,
u64 gpu_addr, u64 seq);
int amdgpu_mes_reset_legacy_queue(struct amdgpu_device *adev,
struct amdgpu_ring *ring,
unsigned int vmid);
unsigned int vmid,
bool use_mmio);

uint32_t amdgpu_mes_rreg(struct amdgpu_device *adev, uint32_t reg);
int amdgpu_mes_wreg(struct amdgpu_device *adev,
Expand Down
Loading

0 comments on commit 741d73f

Please sign in to comment.