Skip to content

Commit

Permalink
Merge tag 'amd-drm-next-5.18-2022-03-18' of https://gitlab.freedeskto…
Browse files Browse the repository at this point in the history
…p.org/agd5f/linux into drm-next

amd-drm-next-5.18-2022-03-18:

amdgpu:
- Aldebaran fixes
- SMU 13.0.5 fixes
- DCN 3.1.5 fixes
- DCN 3.1.6 fixes
- Pipe split fixes
- More display FP cleanup
- DP 2.0 UHBR fix
- DC GPU reset fix
- DC deep color ratio fix
- SMU robustness fixes
- Runtime PM fix for APUs
- IGT reload fixes
- SR-IOV fix
- Misc fixes and cleanups

amdkfd:
- CRIU fixes
- SVM fixes

UAPI:
- Properly handle SDMA transfers with CRIU
  Proposed user mode change: checkpoint-restore/criu#1709

Signed-off-by: Dave Airlie <[email protected]>
From: Alex Deucher <[email protected]>
Link: https://patchwork.freedesktop.org/patch/msgid/[email protected]
  • Loading branch information
airlied committed Mar 21, 2022
2 parents f11de86 + 426c89a commit c6e90a1
Show file tree
Hide file tree
Showing 117 changed files with 3,944 additions and 2,944 deletions.
4 changes: 2 additions & 2 deletions drivers/gpu/drm/amd/amdgpu/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -53,11 +53,11 @@ amdgpu-y += amdgpu_device.o amdgpu_kms.o \
amdgpu_ucode.o amdgpu_bo_list.o amdgpu_ctx.o amdgpu_sync.o \
amdgpu_gtt_mgr.o amdgpu_preempt_mgr.o amdgpu_vram_mgr.o amdgpu_virt.o \
amdgpu_atomfirmware.o amdgpu_vf_error.o amdgpu_sched.o \
amdgpu_debugfs.o amdgpu_ids.o amdgpu_gmc.o amdgpu_mmhub.o \
amdgpu_debugfs.o amdgpu_ids.o amdgpu_gmc.o \
amdgpu_xgmi.o amdgpu_csa.o amdgpu_ras.o amdgpu_vm_cpu.o \
amdgpu_vm_sdma.o amdgpu_discovery.o amdgpu_ras_eeprom.o amdgpu_nbio.o \
amdgpu_umc.o smu_v11_0_i2c.o amdgpu_fru_eeprom.o amdgpu_rap.o \
amdgpu_fw_attestation.o amdgpu_securedisplay.o amdgpu_hdp.o \
amdgpu_fw_attestation.o amdgpu_securedisplay.o \
amdgpu_eeprom.o amdgpu_mca.o

amdgpu-$(CONFIG_PROC_FS) += amdgpu_fdinfo.o
Expand Down
4 changes: 2 additions & 2 deletions drivers/gpu/drm/amd/amdgpu/amdgpu_connectors.c
Original file line number Diff line number Diff line change
Expand Up @@ -626,7 +626,7 @@ amdgpu_connector_fixup_lcd_native_mode(struct drm_encoder *encoder,
if (mode->type & DRM_MODE_TYPE_PREFERRED) {
if (mode->hdisplay != native_mode->hdisplay ||
mode->vdisplay != native_mode->vdisplay)
memcpy(native_mode, mode, sizeof(*mode));
drm_mode_copy(native_mode, mode);
}
}

Expand All @@ -635,7 +635,7 @@ amdgpu_connector_fixup_lcd_native_mode(struct drm_encoder *encoder,
list_for_each_entry_safe(mode, t, &connector->probed_modes, head) {
if (mode->hdisplay == native_mode->hdisplay &&
mode->vdisplay == native_mode->vdisplay) {
*native_mode = *mode;
drm_mode_copy(native_mode, mode);
drm_mode_set_crtcinfo(native_mode, CRTC_INTERLACE_HALVE_V);
DRM_DEBUG_KMS("Determined LVDS native mode details from EDID\n");
break;
Expand Down
4 changes: 2 additions & 2 deletions drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
Original file line number Diff line number Diff line change
Expand Up @@ -340,7 +340,7 @@ static void amdgpu_cs_get_threshold_for_moves(struct amdgpu_device *adev,
if (free_vram >= 128 * 1024 * 1024 || free_vram >= total_vram / 8) {
s64 min_us;

/* Be more aggresive on dGPUs. Try to fill a portion of free
/* Be more aggressive on dGPUs. Try to fill a portion of free
* VRAM now.
*/
if (!(adev->flags & AMD_IS_APU))
Expand Down Expand Up @@ -1280,7 +1280,7 @@ static int amdgpu_cs_submit(struct amdgpu_cs_parser *p,
continue;

/*
* Work around dma_resv shortcommings by wrapping up the
* Work around dma_resv shortcomings by wrapping up the
* submission in a dma_fence_chain and add it as exclusive
* fence.
*/
Expand Down
24 changes: 13 additions & 11 deletions drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
Original file line number Diff line number Diff line change
Expand Up @@ -2159,8 +2159,10 @@ static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
!pci_is_thunderbolt_attached(to_pci_dev(dev->dev)))
adev->flags |= AMD_IS_PX;

parent = pci_upstream_bridge(adev->pdev);
adev->has_pr3 = parent ? pci_pr3_present(parent) : false;
if (!(adev->flags & AMD_IS_APU)) {
parent = pci_upstream_bridge(adev->pdev);
adev->has_pr3 = parent ? pci_pr3_present(parent) : false;
}

amdgpu_amdkfd_device_probe(adev);

Expand Down Expand Up @@ -3664,6 +3666,15 @@ int amdgpu_device_init(struct amdgpu_device *adev,
if (amdgpu_mes && adev->asic_type >= CHIP_NAVI10)
adev->enable_mes = true;

/*
* Reset domain needs to be present early, before XGMI hive discovered
* (if any) and intitialized to use reset sem and in_gpu reset flag
* early on during init and before calling to RREG32.
*/
adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev");
if (!adev->reset_domain)
return -ENOMEM;

/* detect hw virtualization here */
amdgpu_detect_virtualization(adev);

Expand All @@ -3673,15 +3684,6 @@ int amdgpu_device_init(struct amdgpu_device *adev,
return r;
}

/*
* Reset domain needs to be present early, before XGMI hive discovered
* (if any) and intitialized to use reset sem and in_gpu reset flag
* early on during init.
*/
adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE ,"amdgpu-reset-dev");
if (!adev->reset_domain)
return -ENOMEM;

/* early init functions */
r = amdgpu_device_ip_early_init(adev);
if (r)
Expand Down
41 changes: 22 additions & 19 deletions drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
Original file line number Diff line number Diff line change
Expand Up @@ -622,6 +622,13 @@ void amdgpu_gmc_get_vbios_allocations(struct amdgpu_device *adev)
{
unsigned size;

/*
* Some ASICs need to reserve a region of video memory to avoid access
* from driver
*/
adev->mman.stolen_reserved_offset = 0;
adev->mman.stolen_reserved_size = 0;

/*
* TODO:
* Currently there is a bug where some memory client outside
Expand All @@ -632,10 +639,25 @@ void amdgpu_gmc_get_vbios_allocations(struct amdgpu_device *adev)
*/
switch (adev->asic_type) {
case CHIP_VEGA10:
adev->mman.keep_stolen_vga_memory = true;
/*
* VEGA10 SRIOV VF needs some firmware reserved area.
*/
if (amdgpu_sriov_vf(adev)) {
adev->mman.stolen_reserved_offset = 0x100000;
adev->mman.stolen_reserved_size = 0x600000;
}
break;
case CHIP_RAVEN:
case CHIP_RENOIR:
adev->mman.keep_stolen_vga_memory = true;
break;
case CHIP_YELLOW_CARP:
if (amdgpu_discovery == 0) {
adev->mman.stolen_reserved_offset = 0x1ffb0000;
adev->mman.stolen_reserved_size = 64 * PAGE_SIZE;
}
break;
default:
adev->mman.keep_stolen_vga_memory = false;
break;
Expand Down Expand Up @@ -756,25 +778,6 @@ uint64_t amdgpu_gmc_vram_cpu_pa(struct amdgpu_device *adev, struct amdgpu_bo *bo
return amdgpu_bo_gpu_offset(bo) - adev->gmc.vram_start + adev->gmc.aper_base;
}

void amdgpu_gmc_get_reserved_allocation(struct amdgpu_device *adev)
{
/* Some ASICs need to reserve a region of video memory to avoid access
* from driver */
adev->mman.stolen_reserved_offset = 0;
adev->mman.stolen_reserved_size = 0;

switch (adev->asic_type) {
case CHIP_YELLOW_CARP:
if (amdgpu_discovery == 0) {
adev->mman.stolen_reserved_offset = 0x1ffb0000;
adev->mman.stolen_reserved_size = 64 * PAGE_SIZE;
}
break;
default:
break;
}
}

int amdgpu_gmc_vram_checking(struct amdgpu_device *adev)
{
struct amdgpu_bo *vram_bo = NULL;
Expand Down
1 change: 0 additions & 1 deletion drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
Original file line number Diff line number Diff line change
Expand Up @@ -331,7 +331,6 @@ amdgpu_gmc_set_vm_fault_masks(struct amdgpu_device *adev, int hub_type,
bool enable);

void amdgpu_gmc_get_vbios_allocations(struct amdgpu_device *adev);
void amdgpu_gmc_get_reserved_allocation(struct amdgpu_device *adev);

void amdgpu_gmc_init_pdb0(struct amdgpu_device *adev);
uint64_t amdgpu_gmc_vram_mc2pa(struct amdgpu_device *adev, uint64_t mc_addr);
Expand Down
4 changes: 2 additions & 2 deletions drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
Original file line number Diff line number Diff line change
Expand Up @@ -166,8 +166,8 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
}

if ((ib->flags & AMDGPU_IB_FLAGS_SECURE) &&
(ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE)) {
dev_err(adev->dev, "secure submissions not supported on compute rings\n");
(!ring->funcs->secure_submission_supported)) {
dev_err(adev->dev, "secure submissions not supported on ring <%s>\n", ring->name);
return -EINVAL;
}

Expand Down
6 changes: 4 additions & 2 deletions drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
Original file line number Diff line number Diff line change
Expand Up @@ -310,6 +310,10 @@ static int psp_sw_init(void *handle)
return ret;
}

adev->psp.xgmi_context.supports_extended_data =
!adev->gmc.xgmi.connected_to_cpu &&
adev->ip_versions[MP0_HWIP][0] == IP_VERSION(13, 0, 2);

memset(&boot_cfg_entry, 0, sizeof(boot_cfg_entry));
if (psp_get_runtime_db_entry(adev,
PSP_RUNTIME_ENTRY_TYPE_BOOT_CONFIG,
Expand Down Expand Up @@ -3008,7 +3012,6 @@ static int psp_init_sos_base_fw(struct amdgpu_device *adev)
adev->psp.sos.size_bytes = le32_to_cpu(sos_hdr->sos.size_bytes);
adev->psp.sos.start_addr = ucode_array_start_addr +
le32_to_cpu(sos_hdr->sos.offset_bytes);
adev->psp.xgmi_context.supports_extended_data = false;
} else {
/* Load alternate PSP SOS FW */
sos_hdr_v1_3 = (const struct psp_firmware_header_v1_3 *)adev->psp.sos_fw->data;
Expand All @@ -3023,7 +3026,6 @@ static int psp_init_sos_base_fw(struct amdgpu_device *adev)
adev->psp.sos.size_bytes = le32_to_cpu(sos_hdr_v1_3->sos_aux.size_bytes);
adev->psp.sos.start_addr = ucode_array_start_addr +
le32_to_cpu(sos_hdr_v1_3->sos_aux.offset_bytes);
adev->psp.xgmi_context.supports_extended_data = true;
}

if ((adev->psp.sys.size_bytes == 0) || (adev->psp.sos.size_bytes == 0)) {
Expand Down
7 changes: 7 additions & 0 deletions drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
Original file line number Diff line number Diff line change
Expand Up @@ -2068,6 +2068,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
mutex_init(&con->recovery_lock);
INIT_WORK(&con->recovery_work, amdgpu_ras_do_recovery);
atomic_set(&con->in_recovery, 0);
con->eeprom_control.bad_channel_bitmap = 0;

max_eeprom_records_count = amdgpu_ras_eeprom_max_record_count();
amdgpu_ras_validate_threshold(adev, max_eeprom_records_count);
Expand All @@ -2092,6 +2093,11 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
goto free;

amdgpu_dpm_send_hbm_bad_pages_num(adev, con->eeprom_control.ras_num_recs);

if (con->update_channel_flag == true) {
amdgpu_dpm_send_hbm_bad_channel_flag(adev, con->eeprom_control.bad_channel_bitmap);
con->update_channel_flag = false;
}
}

#ifdef CONFIG_X86_MCE_AMD
Expand Down Expand Up @@ -2285,6 +2291,7 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
goto release_con;
}

con->update_channel_flag = false;
con->features = 0;
INIT_LIST_HEAD(&con->head);
/* Might need get this flag from vbios. */
Expand Down
3 changes: 3 additions & 0 deletions drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
Original file line number Diff line number Diff line change
Expand Up @@ -374,6 +374,9 @@ struct amdgpu_ras {

/* record umc error info queried from smu */
struct umc_ecc_info umc_ecc;

/* Indicates smu whether need update bad channel info */
bool update_channel_flag;
};

struct ras_fs_data {
Expand Down
25 changes: 23 additions & 2 deletions drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
Original file line number Diff line number Diff line change
Expand Up @@ -267,6 +267,7 @@ int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control *control)
{
struct amdgpu_device *adev = to_amdgpu_device(control);
struct amdgpu_ras_eeprom_table_header *hdr = &control->tbl_hdr;
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
u8 csum;
int res;

Expand All @@ -287,6 +288,10 @@ int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control *control)

amdgpu_dpm_send_hbm_bad_pages_num(adev, control->ras_num_recs);

control->bad_channel_bitmap = 0;
amdgpu_dpm_send_hbm_bad_channel_flag(adev, control->bad_channel_bitmap);
con->update_channel_flag = false;

amdgpu_ras_debugfs_set_ret_size(control);

mutex_unlock(&control->ras_tbl_mutex);
Expand Down Expand Up @@ -420,6 +425,7 @@ amdgpu_ras_eeprom_append_table(struct amdgpu_ras_eeprom_control *control,
struct eeprom_table_record *record,
const u32 num)
{
struct amdgpu_ras *con = amdgpu_ras_get_context(to_amdgpu_device(control));
u32 a, b, i;
u8 *buf, *pp;
int res;
Expand All @@ -431,9 +437,16 @@ amdgpu_ras_eeprom_append_table(struct amdgpu_ras_eeprom_control *control,
/* Encode all of them in one go.
*/
pp = buf;
for (i = 0; i < num; i++, pp += RAS_TABLE_RECORD_SIZE)
for (i = 0; i < num; i++, pp += RAS_TABLE_RECORD_SIZE) {
__encode_table_record_to_buf(control, &record[i], pp);

/* update bad channel bitmap */
if (!(control->bad_channel_bitmap & (1 << record[i].mem_channel))) {
control->bad_channel_bitmap |= 1 << record[i].mem_channel;
con->update_channel_flag = true;
}
}

/* a, first record index to write into.
* b, last record index to write into.
* a = first index to read (fri) + number of records in the table,
Expand Down Expand Up @@ -686,6 +699,7 @@ int amdgpu_ras_eeprom_read(struct amdgpu_ras_eeprom_control *control,
const u32 num)
{
struct amdgpu_device *adev = to_amdgpu_device(control);
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
int i, res;
u8 *buf, *pp;
u32 g0, g1;
Expand Down Expand Up @@ -753,8 +767,15 @@ int amdgpu_ras_eeprom_read(struct amdgpu_ras_eeprom_control *control,
/* Read up everything? Then transform.
*/
pp = buf;
for (i = 0; i < num; i++, pp += RAS_TABLE_RECORD_SIZE)
for (i = 0; i < num; i++, pp += RAS_TABLE_RECORD_SIZE) {
__decode_table_record_from_buf(control, &record[i], pp);

/* update bad channel bitmap */
if (!(control->bad_channel_bitmap & (1 << record[i].mem_channel))) {
control->bad_channel_bitmap |= 1 << record[i].mem_channel;
con->update_channel_flag = true;
}
}
Out:
kfree(buf);
mutex_unlock(&control->ras_tbl_mutex);
Expand Down
4 changes: 4 additions & 0 deletions drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,10 @@ struct amdgpu_ras_eeprom_control {
/* Protect table access via this mutex.
*/
struct mutex ras_tbl_mutex;

/* Record channel info which occurred bad pages
*/
u32 bad_channel_bitmap;
};

/*
Expand Down
1 change: 1 addition & 0 deletions drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,7 @@ struct amdgpu_ring_funcs {
u32 nop;
bool support_64bit_ptrs;
bool no_user_fence;
bool secure_submission_supported;
unsigned vmhub;
unsigned extra_dw;

Expand Down
5 changes: 5 additions & 0 deletions drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,11 @@ static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
amdgpu_ras_save_bad_pages(adev);

amdgpu_dpm_send_hbm_bad_pages_num(adev, con->eeprom_control.ras_num_recs);

if (con->update_channel_flag == true) {
amdgpu_dpm_send_hbm_bad_channel_flag(adev, con->eeprom_control.bad_channel_bitmap);
con->update_channel_flag = false;
}
}

if (reset)
Expand Down
22 changes: 0 additions & 22 deletions drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,6 @@

#include "amdgpu_reset.h"

#define smnPCS_XGMI23_PCS_ERROR_STATUS 0x11a01210
#define smnPCS_XGMI3X16_PCS_ERROR_STATUS 0x11a0020c
#define smnPCS_GOPX1_PCS_ERROR_STATUS 0x12200210

Expand Down Expand Up @@ -69,17 +68,6 @@ static const int wafl_pcs_err_status_reg_arct[] = {
smnPCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS + 0x100000,
};

static const int xgmi23_pcs_err_status_reg_aldebaran[] = {
smnPCS_XGMI23_PCS_ERROR_STATUS,
smnPCS_XGMI23_PCS_ERROR_STATUS + 0x100000,
smnPCS_XGMI23_PCS_ERROR_STATUS + 0x200000,
smnPCS_XGMI23_PCS_ERROR_STATUS + 0x300000,
smnPCS_XGMI23_PCS_ERROR_STATUS + 0x400000,
smnPCS_XGMI23_PCS_ERROR_STATUS + 0x500000,
smnPCS_XGMI23_PCS_ERROR_STATUS + 0x600000,
smnPCS_XGMI23_PCS_ERROR_STATUS + 0x700000
};

static const int xgmi3x16_pcs_err_status_reg_aldebaran[] = {
smnPCS_XGMI3X16_PCS_ERROR_STATUS,
smnPCS_XGMI3X16_PCS_ERROR_STATUS + 0x100000,
Expand Down Expand Up @@ -797,9 +785,6 @@ static void amdgpu_xgmi_reset_ras_error_count(struct amdgpu_device *adev)
xgmi_pcs_err_status_reg_vg20[i]);
break;
case CHIP_ALDEBARAN:
for (i = 0; i < ARRAY_SIZE(xgmi23_pcs_err_status_reg_aldebaran); i++)
pcs_clear_status(adev,
xgmi23_pcs_err_status_reg_aldebaran[i]);
for (i = 0; i < ARRAY_SIZE(xgmi3x16_pcs_err_status_reg_aldebaran); i++)
pcs_clear_status(adev,
xgmi3x16_pcs_err_status_reg_aldebaran[i]);
Expand Down Expand Up @@ -900,13 +885,6 @@ static void amdgpu_xgmi_query_ras_error_count(struct amdgpu_device *adev,
}
break;
case CHIP_ALDEBARAN:
/* check xgmi23 pcs error */
for (i = 0; i < ARRAY_SIZE(xgmi23_pcs_err_status_reg_aldebaran); i++) {
data = RREG32_PCIE(xgmi23_pcs_err_status_reg_aldebaran[i]);
if (data)
amdgpu_xgmi_query_pcs_error_status(adev,
data, &ue_cnt, &ce_cnt, true);
}
/* check xgmi3x16 pcs error */
for (i = 0; i < ARRAY_SIZE(xgmi3x16_pcs_err_status_reg_aldebaran); i++) {
data = RREG32_PCIE(xgmi3x16_pcs_err_status_reg_aldebaran[i]);
Expand Down
Loading

0 comments on commit c6e90a1

Please sign in to comment.