From a17f574ab4a2d3dcbd9a49e3c1710fb0cbe8a901 Mon Sep 17 00:00:00 2001 From: Yifan Zhang Date: Thu, 26 Oct 2023 21:23:06 +0800 Subject: drm/amdgpu: remove amdgpu_mes_self_test in gpu recover MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit gpu tlb flush is skipped if reset sem is held, it makes mes_self_test fail since it involves add_hw_queue/remove_hw_queue which needs tlb flush functional. Remove mes_self_test in gpu recover sequence. This patch is to fix the recover failure in gfx11. [ 1831.768292] [drm] ring sdma_32769.3.3 was added [ 1831.768313] [drm] ring gfx_32769.1.1 ib test pass [ 1831.768337] [drm] ring compute_32769.2.2 ib test pass [ 1831.768399] amdgpu 0000:c2:00.0: amdgpu: [gfxhub] page fault (src_id:0 ring:24 vmid:8 pasid:32769, for process pid 0 thread pid 0) [ 1831.768434] amdgpu 0000:c2:00.0: amdgpu: in page starting at address 0x0000aec200000000 from client 10 [ 1831.768456] amdgpu 0000:c2:00.0: amdgpu: GCVM_L2_PROTECTION_FAULT_STATUS:0x00800A30 [ 1831.768473] amdgpu 0000:c2:00.0: amdgpu: Faulty UTCL2 client ID: CPC (0x5) [ 1831.768489] amdgpu 0000:c2:00.0: amdgpu: MORE_FAULTS: 0x0 [ 1831.768501] amdgpu 0000:c2:00.0: amdgpu: WALKER_ERROR: 0x0 [ 1831.768513] amdgpu 0000:c2:00.0: amdgpu: PERMISSION_FAULTS: 0x3 [ 1831.768521] amdgpu 0000:c2:00.0: amdgpu: MAPPING_ERROR: 0x0 [ 1831.768529] amdgpu 0000:c2:00.0: amdgpu: RW: 0x0 [ 1831.931229] amdgpu 0000:c2:00.0: [drm:amdgpu_ring_test_helper [amdgpu]] *ERROR* ring sdma_32769.3.3 test failed (-110) [ 1832.062917] [drm:mes_v11_0_submit_pkt_and_poll_completion.constprop.0 [amdgpu]] *ERROR* MES failed to response msg=3 [ 1832.063107] [drm:amdgpu_mes_remove_hw_queue [amdgpu]] *ERROR* failed to remove hardware queue, queue id = 3 Fixes: e2e3788850b9 ("drm/amdgpu: rework lock handling for flush_tlb v2") Reported-by: Li Ma Reviewed-by: Christian König Signed-off-by: Yifan Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index d5f78179b2b6..bd940cb9322e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -5566,10 +5566,6 @@ skip_hw_reset: drm_sched_start(&ring->sched, true); } - if (adev->enable_mes && - amdgpu_ip_version(adev, GC_HWIP, 0) != IP_VERSION(11, 0, 3)) - amdgpu_mes_self_test(tmp_adev); - if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled) drm_helper_resume_force_mode(adev_to_drm(tmp_adev)); -- cgit v1.2.3 From 5f38ac54e60562323ea4abb1bfb37d043ee23357 Mon Sep 17 00:00:00 2001 From: Kenneth Feng Date: Wed, 25 Oct 2023 11:20:38 +0800 Subject: drm/amd/pm: fix the high voltage and temperature issue fix the high voltage and temperature issue after the driver is unloaded on smu 13.0.0, smu 13.0.7 and smu 13.0.10 v2 - fix the code format and make sure it is used on the unload case only. Signed-off-by: Kenneth Feng Reviewed-by: Yang Wang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 24 +++++++++++----- drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c | 33 ++++++++++++++++++++-- drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h | 1 + drivers/gpu/drm/amd/pm/swsmu/inc/smu_v13_0.h | 2 ++ drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c | 13 +++++++++ .../gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c | 18 ++++++++++-- .../gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c | 18 ++++++++++-- 7 files changed, 94 insertions(+), 15 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index bd940cb9322e..436bd8455573 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -3962,13 +3962,23 @@ int amdgpu_device_init(struct amdgpu_device *adev, } } } else { - tmp = amdgpu_reset_method; - /* It should do a default reset when loading or reloading the driver, - * regardless of the module parameter reset_method. - */ - amdgpu_reset_method = AMD_RESET_METHOD_NONE; - r = amdgpu_asic_reset(adev); - amdgpu_reset_method = tmp; + switch (amdgpu_ip_version(adev, MP1_HWIP, 0)) { + case IP_VERSION(13, 0, 0): + case IP_VERSION(13, 0, 7): + case IP_VERSION(13, 0, 10): + r = psp_gpu_reset(adev); + break; + default: + tmp = amdgpu_reset_method; + /* It should do a default reset when loading or reloading the driver, + * regardless of the module parameter reset_method. + */ + amdgpu_reset_method = AMD_RESET_METHOD_NONE; + r = amdgpu_asic_reset(adev); + amdgpu_reset_method = tmp; + break; + } + if (r) { dev_err(adev->dev, "asic reset on init failed\n"); goto failed; diff --git a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c index 9f86c1fecbb1..23b00eddc1af 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c +++ b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c @@ -733,7 +733,7 @@ static int smu_early_init(void *handle) smu->adev = adev; smu->pm_enabled = !!amdgpu_dpm; smu->is_apu = false; - smu->smu_baco.state = SMU_BACO_STATE_EXIT; + smu->smu_baco.state = SMU_BACO_STATE_NONE; smu->smu_baco.platform_support = false; smu->user_dpm_profile.fan_mode = -1; @@ -1742,10 +1742,31 @@ static int smu_smc_hw_cleanup(struct smu_context *smu) return 0; } +static int smu_reset_mp1_state(struct smu_context *smu) +{ + struct amdgpu_device *adev = smu->adev; + int ret = 0; + + if ((!adev->in_runpm) && (!adev->in_suspend) && + (!amdgpu_in_reset(adev))) + switch (amdgpu_ip_version(adev, MP1_HWIP, 0)) { + case IP_VERSION(13, 0, 0): + case IP_VERSION(13, 0, 7): + case IP_VERSION(13, 0, 10): + ret = smu_set_mp1_state(smu, PP_MP1_STATE_UNLOAD); + break; + default: + break; + } + + return ret; +} + static int smu_hw_fini(void *handle) { struct amdgpu_device *adev = (struct amdgpu_device *)handle; struct smu_context *smu = adev->powerplay.pp_handle; + int ret; if (amdgpu_sriov_vf(adev) && !amdgpu_sriov_is_pp_one_vf(adev)) return 0; @@ -1763,7 +1784,15 @@ static int smu_hw_fini(void *handle) adev->pm.dpm_enabled = false; - return smu_smc_hw_cleanup(smu); + ret = smu_smc_hw_cleanup(smu); + if (ret) + return ret; + + ret = smu_reset_mp1_state(smu); + if (ret) + return ret; + + return 0; } static void smu_late_fini(void *handle) diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h b/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h index 839553a86aa2..8def291b18bc 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h +++ b/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h @@ -419,6 +419,7 @@ enum smu_reset_mode { enum smu_baco_state { SMU_BACO_STATE_ENTER = 0, SMU_BACO_STATE_EXIT, + SMU_BACO_STATE_NONE, }; struct smu_baco_context { diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/smu_v13_0.h b/drivers/gpu/drm/amd/pm/swsmu/inc/smu_v13_0.h index cc02f979e9e9..95cb919718ae 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/inc/smu_v13_0.h +++ b/drivers/gpu/drm/amd/pm/swsmu/inc/smu_v13_0.h @@ -299,5 +299,7 @@ int smu_v13_0_update_pcie_parameters(struct smu_context *smu, uint8_t pcie_gen_cap, uint8_t pcie_width_cap); +int smu_v13_0_disable_pmfw_state(struct smu_context *smu); + #endif #endif diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c index a49e5adf7cc3..cf1b84060bc3 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c @@ -2477,3 +2477,16 @@ int smu_v13_0_update_pcie_parameters(struct smu_context *smu, return 0; } + +int smu_v13_0_disable_pmfw_state(struct smu_context *smu) +{ + int ret; + struct amdgpu_device *adev = smu->adev; + + WREG32_PCIE(MP1_Public | (smnMP1_FIRMWARE_FLAGS & 0xffffffff), 0); + + ret = RREG32_PCIE(MP1_Public | + (smnMP1_FIRMWARE_FLAGS & 0xffffffff)); + + return ret == 0 ? 0 : -EINVAL; +} diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c index 34bd99b0e137..997934bc12cf 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c @@ -2602,14 +2602,20 @@ static int smu_v13_0_0_baco_enter(struct smu_context *smu) static int smu_v13_0_0_baco_exit(struct smu_context *smu) { struct amdgpu_device *adev = smu->adev; + int ret; if (adev->in_runpm && smu_cmn_is_audio_func_enabled(adev)) { /* Wait for PMFW handling for the Dstate change */ usleep_range(10000, 11000); - return smu_v13_0_baco_set_armd3_sequence(smu, BACO_SEQ_ULPS); + ret = smu_v13_0_baco_set_armd3_sequence(smu, BACO_SEQ_ULPS); } else { - return smu_v13_0_baco_exit(smu); + ret = smu_v13_0_baco_exit(smu); } + + if (!ret) + adev->gfx.is_poweron = false; + + return ret; } static bool smu_v13_0_0_is_mode1_reset_supported(struct smu_context *smu) @@ -2794,7 +2800,13 @@ static int smu_v13_0_0_set_mp1_state(struct smu_context *smu, switch (mp1_state) { case PP_MP1_STATE_UNLOAD: - ret = smu_cmn_set_mp1_state(smu, mp1_state); + ret = smu_cmn_send_smc_msg_with_param(smu, + SMU_MSG_PrepareMp1ForUnload, + 0x55, NULL); + + if (!ret && smu->smu_baco.state == SMU_BACO_STATE_EXIT) + ret = smu_v13_0_disable_pmfw_state(smu); + break; default: /* Ignore others */ diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c index ac0e1cc812bd..692780ef21d0 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c @@ -2498,7 +2498,13 @@ static int smu_v13_0_7_set_mp1_state(struct smu_context *smu, switch (mp1_state) { case PP_MP1_STATE_UNLOAD: - ret = smu_cmn_set_mp1_state(smu, mp1_state); + ret = smu_cmn_send_smc_msg_with_param(smu, + SMU_MSG_PrepareMp1ForUnload, + 0x55, NULL); + + if (!ret && smu->smu_baco.state == SMU_BACO_STATE_EXIT) + ret = smu_v13_0_disable_pmfw_state(smu); + break; default: /* Ignore others */ @@ -2524,14 +2530,20 @@ static int smu_v13_0_7_baco_enter(struct smu_context *smu) static int smu_v13_0_7_baco_exit(struct smu_context *smu) { struct amdgpu_device *adev = smu->adev; + int ret; if (adev->in_runpm && smu_cmn_is_audio_func_enabled(adev)) { /* Wait for PMFW handling for the Dstate change */ usleep_range(10000, 11000); - return smu_v13_0_baco_set_armd3_sequence(smu, BACO_SEQ_ULPS); + ret = smu_v13_0_baco_set_armd3_sequence(smu, BACO_SEQ_ULPS); } else { - return smu_v13_0_baco_exit(smu); + ret = smu_v13_0_baco_exit(smu); } + + if (!ret) + adev->gfx.is_poweron = false; + + return ret; } static bool smu_v13_0_7_is_mode1_reset_supported(struct smu_context *smu) -- cgit v1.2.3 From 7b1c6263eaf4fd64ffe1cafdc504a42ee4bfbb33 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Tue, 17 Oct 2023 16:30:00 -0400 Subject: drm/amdgpu: don't use pci_is_thunderbolt_attached() It's only valid on Intel systems with the Intel VSEC. Use dev_is_removable() instead. This should do the right thing regardless of the platform. Link: https://gitlab.freedesktop.org/drm/amd/-/issues/2925 Reviewed-by: Mario Limonciello Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 8 ++++---- drivers/gpu/drm/amd/amdgpu/nbio_v2_3.c | 5 +++-- 2 files changed, 7 insertions(+), 6 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 436bd8455573..920f52fc91f0 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -41,6 +41,7 @@ #include #include #include +#include #include #include #include @@ -2223,7 +2224,6 @@ out: */ static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) { - struct drm_device *dev = adev_to_drm(adev); struct pci_dev *parent; int i, r; bool total; @@ -2294,7 +2294,7 @@ static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) (amdgpu_is_atpx_hybrid() || amdgpu_has_atpx_dgpu_power_cntl()) && ((adev->flags & AMD_IS_APU) == 0) && - !pci_is_thunderbolt_attached(to_pci_dev(dev->dev))) + !dev_is_removable(&adev->pdev->dev)) adev->flags |= AMD_IS_PX; if (!(adev->flags & AMD_IS_APU)) { @@ -4142,7 +4142,7 @@ fence_driver_init: px = amdgpu_device_supports_px(ddev); - if (px || (!pci_is_thunderbolt_attached(adev->pdev) && + if (px || (!dev_is_removable(&adev->pdev->dev) && apple_gmux_detect(NULL, NULL))) vga_switcheroo_register_client(adev->pdev, &amdgpu_switcheroo_ops, px); @@ -4292,7 +4292,7 @@ void amdgpu_device_fini_sw(struct amdgpu_device *adev) px = amdgpu_device_supports_px(adev_to_drm(adev)); - if (px || (!pci_is_thunderbolt_attached(adev->pdev) && + if (px || (!dev_is_removable(&adev->pdev->dev) && apple_gmux_detect(NULL, NULL))) vga_switcheroo_unregister_client(adev->pdev); diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v2_3.c b/drivers/gpu/drm/amd/amdgpu/nbio_v2_3.c index e523627cfe25..df218d5ca775 100644 --- a/drivers/gpu/drm/amd/amdgpu/nbio_v2_3.c +++ b/drivers/gpu/drm/amd/amdgpu/nbio_v2_3.c @@ -28,6 +28,7 @@ #include "nbio/nbio_2_3_offset.h" #include "nbio/nbio_2_3_sh_mask.h" #include +#include #include #define smnPCIE_CONFIG_CNTL 0x11180044 @@ -361,7 +362,7 @@ static void nbio_v2_3_enable_aspm(struct amdgpu_device *adev, data |= NAVI10_PCIE__LC_L0S_INACTIVITY_DEFAULT << PCIE_LC_CNTL__LC_L0S_INACTIVITY__SHIFT; - if (pci_is_thunderbolt_attached(adev->pdev)) + if (dev_is_removable(&adev->pdev->dev)) data |= NAVI10_PCIE__LC_L1_INACTIVITY_TBT_DEFAULT << PCIE_LC_CNTL__LC_L1_INACTIVITY__SHIFT; else data |= NAVI10_PCIE__LC_L1_INACTIVITY_DEFAULT << PCIE_LC_CNTL__LC_L1_INACTIVITY__SHIFT; @@ -480,7 +481,7 @@ static void nbio_v2_3_program_aspm(struct amdgpu_device *adev) def = data = RREG32_PCIE(smnPCIE_LC_CNTL); data |= NAVI10_PCIE__LC_L0S_INACTIVITY_DEFAULT << PCIE_LC_CNTL__LC_L0S_INACTIVITY__SHIFT; - if (pci_is_thunderbolt_attached(adev->pdev)) + if (dev_is_removable(&adev->pdev->dev)) data |= NAVI10_PCIE__LC_L1_INACTIVITY_TBT_DEFAULT << PCIE_LC_CNTL__LC_L1_INACTIVITY__SHIFT; else data |= NAVI10_PCIE__LC_L1_INACTIVITY_DEFAULT << PCIE_LC_CNTL__LC_L1_INACTIVITY__SHIFT; -- cgit v1.2.3 From 23618280cca543183d29ae4f286e3319066774d2 Mon Sep 17 00:00:00 2001 From: Hawking Zhang Date: Thu, 19 Oct 2023 14:47:53 +0800 Subject: drm/amdgpu: Query and report boot status Query boot status and report boot errors. A follow up change is needed to stop GPU initialization if boot fails. v2: only invoke the call for dGPU (Le/Lijo) Signed-off-by: Hawking Zhang Reviewed-by: Tao Zhou Reviewed-by: Yang Wang Reviewed-by: Le Ma Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 920f52fc91f0..fd8cd8e2d3f2 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -1074,6 +1074,8 @@ static int amdgpu_device_asic_init(struct amdgpu_device *adev) amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(11, 0, 0)) { amdgpu_psp_wait_for_bootloader(adev); ret = amdgpu_atomfirmware_asic_init(adev, true); + /* TODO: check the return val and stop device initialization if boot fails */ + amdgpu_psp_query_boot_status(adev); return ret; } else { return amdgpu_atom_asic_init(adev->mode_info.atom_context); -- cgit v1.2.3