From a80fe1a698dcc7f5a493af9a16d516f8cbb20564 Mon Sep 17 00:00:00 2001 From: Tao Zhou Date: Thu, 29 Jun 2023 17:48:38 +0800 Subject: drm/amdgpu: skip address adjustment for GFX RAS injection The address parameter of GFX RAS injection isn't related to XGMI node number, keep it unchanged. Signed-off-by: Tao Zhou Reviewed-by: Candice Li Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 8aaa427f8c0f..06aec5f2dd1a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -1159,7 +1159,8 @@ int amdgpu_ras_error_inject(struct amdgpu_device *adev, } /* Calculate XGMI relative offset */ - if (adev->gmc.xgmi.num_physical_nodes > 1) { + if (adev->gmc.xgmi.num_physical_nodes > 1 && + info->head.block != AMDGPU_RAS_BLOCK__GFX) { block_info.address = amdgpu_xgmi_get_relative_phy_addr(adev, block_info.address); -- cgit v1.2.3 From cb906ce32b468099da17ef4181d60eaa77ee89d7 Mon Sep 17 00:00:00 2001 From: "Stanley.Yang" Date: Sun, 25 Jun 2023 18:19:08 +0800 Subject: drm/amdgpu: Enable aqua vanjaram RAS Enable RAS for aqua vanjaram. Changed from V1: Split the change in amdgpu_ras_asic_supported into a separated patch. Changed from V2: Avoid to modify global variable amdgpu_ras_enable. Signed-off-by: Stanley.Yang Reviewed-by: Hawking Zhang Reviewed-by: Tao Zhou Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 06aec5f2dd1a..51731ccaa0a0 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -2415,6 +2415,7 @@ static bool amdgpu_ras_asic_supported(struct amdgpu_device *adev) if (adev->asic_type == CHIP_IP_DISCOVERY) { switch (adev->ip_versions[MP0_HWIP][0]) { case IP_VERSION(13, 0, 0): + case IP_VERSION(13, 0, 6): case IP_VERSION(13, 0, 10): return true; default: -- cgit v1.2.3 From 276f6e8cb769b0b9782d9daae307f6a182af3368 Mon Sep 17 00:00:00 2001 From: "Stanley.Yang" Date: Thu, 13 Jul 2023 21:07:05 +0800 Subject: drm/amdgpu: Disable RAS by default on APU flatform Disable RAS feature by default for aqua vanjaram on APU platform. Changed from V1: Splite Disable RAS by default on APU platform into a separated patch. Changed from V2: Avoid to modify global variable amdgpu_ras_enable. Signed-off-by: Stanley.Yang Reviewed-by: Hawking Zhang Reviewed-by: Tao Zhou Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 51731ccaa0a0..fc317b005943 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -2517,8 +2517,17 @@ static void amdgpu_ras_check_supported(struct amdgpu_device *adev) /* hw_supported needs to be aligned with RAS block mask. */ adev->ras_hw_enabled &= AMDGPU_RAS_BLOCK_MASK; - adev->ras_enabled = amdgpu_ras_enable == 0 ? 0 : - adev->ras_hw_enabled & amdgpu_ras_mask; + + /* + * Disable ras feature for aqua vanjaram + * by default on apu platform. + */ + if (adev->ip_versions[MP0_HWIP][0] == IP_VERSION(13, 0, 6)) + adev->ras_enabled = amdgpu_ras_enable != 1 ? 0 : + adev->ras_hw_enabled & amdgpu_ras_mask; + else + adev->ras_enabled = amdgpu_ras_enable == 0 ? 0 : + adev->ras_hw_enabled & amdgpu_ras_mask; } static void amdgpu_ras_counte_dw(struct work_struct *work) -- cgit v1.2.3 From adf64e214280a0230af0638c1825b3812421c958 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Tue, 18 Jul 2023 13:02:58 -0500 Subject: drm/amd: Avoid reading the VBIOS part number twice The VBIOS part number is read both in amdgpu_atom_parse() as well as in atom_get_vbios_pn() and stored twice in the `struct atom_context` structure. Remove the first unnecessary read and move the `pr_info` line from that read into the second. v2: squash in unused variable removal Signed-off-by: Mario Limonciello Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_atombios.c | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_fru_eeprom.c | 20 ++++++++++---------- drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 8 ++++---- drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c | 8 ++++---- drivers/gpu/drm/amd/amdgpu/atom.c | 14 ++------------ drivers/gpu/drm/amd/amdgpu/atom.h | 2 -- 7 files changed, 22 insertions(+), 34 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_atombios.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_atombios.c index f4e3c133a16c..dce9e7d5e4ec 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_atombios.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_atombios.c @@ -1776,7 +1776,7 @@ static ssize_t amdgpu_atombios_get_vbios_version(struct device *dev, struct amdgpu_device *adev = drm_to_adev(ddev); struct atom_context *ctx = adev->mode_info.atom_context; - return sysfs_emit(buf, "%s\n", ctx->vbios_version); + return sysfs_emit(buf, "%s\n", ctx->vbios_pn); } static DEVICE_ATTR(vbios_version, 0444, amdgpu_atombios_get_vbios_version, diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fru_eeprom.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fru_eeprom.c index 4620c4712ce3..c9f16eab0f3d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fru_eeprom.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fru_eeprom.c @@ -60,10 +60,10 @@ static bool is_fru_eeprom_supported(struct amdgpu_device *adev, u32 *fru_addr) switch (adev->asic_type) { case CHIP_VEGA20: /* D161 and D163 are the VG20 server SKUs */ - if (strnstr(atom_ctx->vbios_version, "D161", - sizeof(atom_ctx->vbios_version)) || - strnstr(atom_ctx->vbios_version, "D163", - sizeof(atom_ctx->vbios_version))) { + if (strnstr(atom_ctx->vbios_pn, "D161", + sizeof(atom_ctx->vbios_pn)) || + strnstr(atom_ctx->vbios_pn, "D163", + sizeof(atom_ctx->vbios_pn))) { if (fru_addr) *fru_addr = FRU_EEPROM_MADDR_6; return true; @@ -72,16 +72,16 @@ static bool is_fru_eeprom_supported(struct amdgpu_device *adev, u32 *fru_addr) } case CHIP_ALDEBARAN: /* All Aldebaran SKUs have an FRU */ - if (!strnstr(atom_ctx->vbios_version, "D673", - sizeof(atom_ctx->vbios_version))) + if (!strnstr(atom_ctx->vbios_pn, "D673", + sizeof(atom_ctx->vbios_pn))) if (fru_addr) *fru_addr = FRU_EEPROM_MADDR_6; return true; case CHIP_SIENNA_CICHLID: - if (strnstr(atom_ctx->vbios_version, "D603", - sizeof(atom_ctx->vbios_version))) { - if (strnstr(atom_ctx->vbios_version, "D603GLXE", - sizeof(atom_ctx->vbios_version))) { + if (strnstr(atom_ctx->vbios_pn, "D603", + sizeof(atom_ctx->vbios_pn))) { + if (strnstr(atom_ctx->vbios_pn, "D603GLXE", + sizeof(atom_ctx->vbios_pn))) { return false; } else { if (fru_addr) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c index cab2fdd5b76a..12f3c7ad7d04 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c @@ -1722,7 +1722,7 @@ static int amdgpu_debugfs_firmware_info_show(struct seq_file *m, void *unused) seq_printf(m, "MES feature version: %u, firmware version: 0x%08x\n", fw_info.feature, fw_info.ver); - seq_printf(m, "VBIOS version: %s\n", ctx->vbios_version); + seq_printf(m, "VBIOS version: %s\n", ctx->vbios_pn); return 0; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index fc317b005943..0ba86cc88481 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -2442,10 +2442,10 @@ static void amdgpu_ras_get_quirks(struct amdgpu_device *adev) if (!ctx) return; - if (strnstr(ctx->vbios_version, "D16406", - sizeof(ctx->vbios_version)) || - strnstr(ctx->vbios_version, "D36002", - sizeof(ctx->vbios_version))) + if (strnstr(ctx->vbios_pn, "D16406", + sizeof(ctx->vbios_pn)) || + strnstr(ctx->vbios_pn, "D36002", + sizeof(ctx->vbios_pn))) adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__GFX); } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c index 0648dfe559af..4287743e1212 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c @@ -194,9 +194,9 @@ static bool __get_eeprom_i2c_addr(struct amdgpu_device *adev, /* VEGA20 and ARCTURUS */ if (adev->asic_type == CHIP_VEGA20) control->i2c_address = EEPROM_I2C_MADDR_0; - else if (strnstr(atom_ctx->vbios_version, + else if (strnstr(atom_ctx->vbios_pn, "D342", - sizeof(atom_ctx->vbios_version))) + sizeof(atom_ctx->vbios_pn))) control->i2c_address = EEPROM_I2C_MADDR_0; else control->i2c_address = EEPROM_I2C_MADDR_4; @@ -205,8 +205,8 @@ static bool __get_eeprom_i2c_addr(struct amdgpu_device *adev, control->i2c_address = EEPROM_I2C_MADDR_0; return true; case IP_VERSION(13, 0, 2): - if (strnstr(atom_ctx->vbios_version, "D673", - sizeof(atom_ctx->vbios_version))) + if (strnstr(atom_ctx->vbios_pn, "D673", + sizeof(atom_ctx->vbios_pn))) control->i2c_address = EEPROM_I2C_MADDR_4; else control->i2c_address = EEPROM_I2C_MADDR_0; diff --git a/drivers/gpu/drm/amd/amdgpu/atom.c b/drivers/gpu/drm/amd/amdgpu/atom.c index 1c5d9388ad0b..9f63ddb89b75 100644 --- a/drivers/gpu/drm/amd/amdgpu/atom.c +++ b/drivers/gpu/drm/amd/amdgpu/atom.c @@ -1438,6 +1438,8 @@ static void atom_get_vbios_pn(struct atom_context *ctx) ctx->vbios_pn[count] = 0; } + + pr_info("ATOM BIOS: %s\n", ctx->vbios_pn); } static void atom_get_vbios_version(struct atom_context *ctx) @@ -1460,11 +1462,9 @@ struct atom_context *amdgpu_atom_parse(struct card_info *card, void *bios) int base; struct atom_context *ctx = kzalloc(sizeof(struct atom_context), GFP_KERNEL); - char *str; struct _ATOM_ROM_HEADER *atom_rom_header; struct _ATOM_MASTER_DATA_TABLE *master_table; struct _ATOM_FIRMWARE_INFO *atom_fw_info; - u16 idx; if (!ctx) return NULL; @@ -1502,16 +1502,6 @@ struct atom_context *amdgpu_atom_parse(struct card_info *card, void *bios) return NULL; } - idx = CU16(ATOM_ROM_PART_NUMBER_PTR); - if (idx == 0) - idx = 0x80; - - str = CSTR(idx); - if (*str != '\0') { - pr_info("ATOM BIOS: %s\n", str); - strlcpy(ctx->vbios_version, str, sizeof(ctx->vbios_version)); - } - atom_rom_header = (struct _ATOM_ROM_HEADER *)CSTR(base); if (atom_rom_header->usMasterDataTableOffset != 0) { master_table = (struct _ATOM_MASTER_DATA_TABLE *) diff --git a/drivers/gpu/drm/amd/amdgpu/atom.h b/drivers/gpu/drm/amd/amdgpu/atom.h index 0c1839824520..55bf99d5288d 100644 --- a/drivers/gpu/drm/amd/amdgpu/atom.h +++ b/drivers/gpu/drm/amd/amdgpu/atom.h @@ -33,7 +33,6 @@ struct drm_device; #define ATOM_ATI_MAGIC_PTR 0x30 #define ATOM_ATI_MAGIC " 761295520" #define ATOM_ROM_TABLE_PTR 0x48 -#define ATOM_ROM_PART_NUMBER_PTR 0x6E #define ATOM_ROM_MAGIC "ATOM" #define ATOM_ROM_MAGIC_PTR 4 @@ -143,7 +142,6 @@ struct atom_context { int io_mode; uint32_t *scratch; int scratch_size_bytes; - char vbios_version[20]; uint8_t name[STRLEN_LONG]; uint8_t vbios_pn[STRLEN_LONG]; -- cgit v1.2.3 From fcb7a1849aea4a31b7155d0816ad5070983da3d0 Mon Sep 17 00:00:00 2001 From: "Stanley.Yang" Date: Fri, 21 Jul 2023 16:38:02 +0800 Subject: drm/amdgpu: Check APU flag to disable RAS Only disable RAS by default for aqua vanjaram on APU platform. Signed-off-by: Stanley.Yang Reviewed-by: Tao Zhou Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 0ba86cc88481..62011a521833 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -2522,7 +2522,8 @@ static void amdgpu_ras_check_supported(struct amdgpu_device *adev) * Disable ras feature for aqua vanjaram * by default on apu platform. */ - if (adev->ip_versions[MP0_HWIP][0] == IP_VERSION(13, 0, 6)) + if (adev->ip_versions[MP0_HWIP][0] == IP_VERSION(13, 0, 6) && + adev->gmc.is_app_apu) adev->ras_enabled = amdgpu_ras_enable != 1 ? 0 : adev->ras_hw_enabled & amdgpu_ras_mask; else -- cgit v1.2.3 From f957138cc30ae904346f117013fc7b428700c1f7 Mon Sep 17 00:00:00 2001 From: Hawking Zhang Date: Mon, 31 Jul 2023 23:43:12 +0800 Subject: drm/amdgpu: Only create err_count sysfs when hw_op is supported Some IP blocks only support partial ras feature and don't have ras counter and/or ras error status register at all. Driver should not create err_count sysfs node for those IP blocks. Signed-off-by: Hawking Zhang Reviewed-by: Tao Zhou Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 31 ++++++++++++++++++------------- 1 file changed, 18 insertions(+), 13 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 62011a521833..fa53a51db69f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -2777,23 +2777,28 @@ int amdgpu_ras_block_late_init(struct amdgpu_device *adev, goto cleanup; } - r = amdgpu_ras_sysfs_create(adev, ras_block); - if (r) - goto interrupt; + if (ras_obj->hw_ops && + (ras_obj->hw_ops->query_ras_error_count || + ras_obj->hw_ops->query_ras_error_status)) { + r = amdgpu_ras_sysfs_create(adev, ras_block); + if (r) + goto interrupt; - /* Those are the cached values at init. - */ - query_info = kzalloc(sizeof(struct ras_query_if), GFP_KERNEL); - if (!query_info) - return -ENOMEM; - memcpy(&query_info->head, ras_block, sizeof(struct ras_common_if)); + /* Those are the cached values at init. + */ + query_info = kzalloc(sizeof(*query_info), GFP_KERNEL); + if (!query_info) + return -ENOMEM; + memcpy(&query_info->head, ras_block, sizeof(struct ras_common_if)); - if (amdgpu_ras_query_error_count(adev, &ce_count, &ue_count, query_info) == 0) { - atomic_set(&con->ras_ce_count, ce_count); - atomic_set(&con->ras_ue_count, ue_count); + if (amdgpu_ras_query_error_count(adev, &ce_count, &ue_count, query_info) == 0) { + atomic_set(&con->ras_ce_count, ce_count); + atomic_set(&con->ras_ue_count, ue_count); + } + + kfree(query_info); } - kfree(query_info); return 0; interrupt: -- cgit v1.2.3 From 62c4b772bdd91ed1745c4a55b681e94aa9e48071 Mon Sep 17 00:00:00 2001 From: Hawking Zhang Date: Mon, 3 Jul 2023 14:38:45 +0800 Subject: drm/amdgpu: Apply poison mode check to GFX IP only For GFX IP that only supports poison consumption, GFX RAS won't be marked as enabled. i.e., hardware doesn't support gfx sram ecc. But driver still needs to issue firmware to enable poison consumption mode for GFX IP. In such case, check poison mode and treat GFX IP as RAS capable IP block. Signed-off-by: Hawking Zhang Reviewed-by: Tao Zhou Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index fa53a51db69f..664b841f5cec 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -3153,6 +3153,7 @@ int amdgpu_ras_is_supported(struct amdgpu_device *adev, * that the ras block supports ras function. */ if (!ret && + block == AMDGPU_RAS_BLOCK__GFX && amdgpu_ras_is_poison_mode_supported(adev) && amdgpu_ras_get_ras_block(adev, block, 0)) ret = 1; -- cgit v1.2.3 From 6fc9d92c3d27f572eb1d884662f199c0d7f90d16 Mon Sep 17 00:00:00 2001 From: Hawking Zhang Date: Mon, 3 Jul 2023 16:17:46 +0800 Subject: drm/amdgpu: Issue ras enable_feature for gfx ip only For non-GFX IP blocks, set up ras obj if ras feature is allowed. For GFX IP blocks, force issue ras enable_feature command to firmware and only set up ras obj if ras feature is allowed Signed-off-by: Hawking Zhang Reviewed-by: Tao Zhou Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 30 ++++++++++-------------------- 1 file changed, 10 insertions(+), 20 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 664b841f5cec..bb29cb57add5 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -757,16 +757,6 @@ static int __amdgpu_ras_feature_enable(struct amdgpu_device *adev, return 0; } -static int amdgpu_ras_check_feature_allowed(struct amdgpu_device *adev, - struct ras_common_if *head) -{ - if (amdgpu_ras_is_feature_allowed(adev, head) || - amdgpu_ras_is_poison_mode_supported(adev)) - return 1; - else - return 0; -} - /* wrapper of psp_ras_enable_features */ int amdgpu_ras_feature_enable(struct amdgpu_device *adev, struct ras_common_if *head, bool enable) @@ -778,7 +768,16 @@ int amdgpu_ras_feature_enable(struct amdgpu_device *adev, if (!con) return -EINVAL; - if (head->block == AMDGPU_RAS_BLOCK__GFX) { + /* Do not enable ras feature if it is not allowed */ + if (enable && + head->block != AMDGPU_RAS_BLOCK__GFX && + !amdgpu_ras_is_feature_allowed(adev, head)) + goto out; + + /* Only enable gfx ras feature from host side */ + if (head->block == AMDGPU_RAS_BLOCK__GFX && + !amdgpu_sriov_vf(adev) && + !amdgpu_ras_intr_triggered()) { info = kzalloc(sizeof(union ta_ras_cmd_input), GFP_KERNEL); if (!info) return -ENOMEM; @@ -794,16 +793,7 @@ int amdgpu_ras_feature_enable(struct amdgpu_device *adev, .error_type = amdgpu_ras_error_to_ta(head->type), }; } - } - /* Do not enable if it is not allowed. */ - if (enable && !amdgpu_ras_check_feature_allowed(adev, head)) - goto out; - - /* Only enable ras feature operation handle on host side */ - if (head->block == AMDGPU_RAS_BLOCK__GFX && - !amdgpu_sriov_vf(adev) && - !amdgpu_ras_intr_triggered()) { ret = psp_ras_enable_features(&adev->psp, info, enable); if (ret) { dev_err(adev->dev, "ras %s %s failed poison:%d ret:%d\n", -- cgit v1.2.3 From 7692e1ee2446fd1940b5caa6e09779504a58881a Mon Sep 17 00:00:00 2001 From: Tao Zhou Date: Thu, 8 Jun 2023 16:25:42 +0800 Subject: drm/amdgpu: add RAS fatal error handler for NBIO v7.9 Register RAS fatal error interrupt and add handler. v2: only register NBIO RAS for dGPU platform. change nbio_v7_9_set_ras_controller_irq_state and nbio_v7_9_set_ras_err_event_athub_irq_state to dummy functions. Signed-off-by: Tao Zhou Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 5 + drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c | 187 ++++++++++++++++++++++++++++++++ drivers/gpu/drm/amd/amdgpu/nbio_v7_9.h | 1 + 3 files changed, 193 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index bb29cb57add5..00658c2816dc 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -35,6 +35,7 @@ #include "amdgpu_xgmi.h" #include "ivsrcid/nbio/irqsrcs_nbif_7_4.h" #include "nbio_v4_3.h" +#include "nbio_v7_9.h" #include "atom.h" #include "amdgpu_reset.h" @@ -2644,6 +2645,10 @@ int amdgpu_ras_init(struct amdgpu_device *adev) * check DF RAS */ adev->nbio.ras = &nbio_v4_3_ras; break; + case IP_VERSION(7, 9, 0): + if (!adev->gmc.is_app_apu) + adev->nbio.ras = &nbio_v7_9_ras; + break; default: /* nbio ras is not available */ break; diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c b/drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c index 1d1ab188ef15..781f98655567 100644 --- a/drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c +++ b/drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c @@ -471,3 +471,190 @@ const struct amdgpu_nbio_funcs nbio_v7_9_funcs = { .init_registers = nbio_v7_9_init_registers, .get_pcie_replay_count = nbio_v7_9_get_pcie_replay_count, }; + +static void nbio_v7_9_query_ras_error_count(struct amdgpu_device *adev, + void *ras_error_status) +{ + return; +} + +static void nbio_v7_9_handle_ras_controller_intr_no_bifring(struct amdgpu_device *adev) +{ + uint32_t bif_doorbell_intr_cntl; + struct ras_manager *obj = amdgpu_ras_find_obj(adev, adev->nbio.ras_if); + struct ras_err_data err_data = {0, 0, 0, NULL}; + struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); + + bif_doorbell_intr_cntl = RREG32_SOC15(NBIO, 0, regBIF_BX0_BIF_DOORBELL_INT_CNTL); + + if (REG_GET_FIELD(bif_doorbell_intr_cntl, + BIF_BX0_BIF_DOORBELL_INT_CNTL, RAS_CNTLR_INTERRUPT_STATUS)) { + /* driver has to clear the interrupt status when bif ring is disabled */ + bif_doorbell_intr_cntl = REG_SET_FIELD(bif_doorbell_intr_cntl, + BIF_BX0_BIF_DOORBELL_INT_CNTL, + RAS_CNTLR_INTERRUPT_CLEAR, 1); + WREG32_SOC15(NBIO, 0, regBIF_BX0_BIF_DOORBELL_INT_CNTL, bif_doorbell_intr_cntl); + + if (!ras->disable_ras_err_cnt_harvest) { + /* + * clear error status after ras_controller_intr + * according to hw team and count ue number + * for query + */ + nbio_v7_9_query_ras_error_count(adev, &err_data); + + /* logging on error cnt and printing for awareness */ + obj->err_data.ue_count += err_data.ue_count; + obj->err_data.ce_count += err_data.ce_count; + + if (err_data.ce_count) + dev_info(adev->dev, "%ld correctable hardware " + "errors detected in %s block, " + "no user action is needed.\n", + obj->err_data.ce_count, + get_ras_block_str(adev->nbio.ras_if)); + + if (err_data.ue_count) + dev_info(adev->dev, "%ld uncorrectable hardware " + "errors detected in %s block\n", + obj->err_data.ue_count, + get_ras_block_str(adev->nbio.ras_if)); + } + + dev_info(adev->dev, "RAS controller interrupt triggered " + "by NBIF error\n"); + + /* ras_controller_int is dedicated for nbif ras error, + * not the global interrupt for sync flood + */ + amdgpu_ras_reset_gpu(adev); + } +} + +static void nbio_v7_9_handle_ras_err_event_athub_intr_no_bifring(struct amdgpu_device *adev) +{ + uint32_t bif_doorbell_intr_cntl; + + bif_doorbell_intr_cntl = RREG32_SOC15(NBIO, 0, regBIF_BX0_BIF_DOORBELL_INT_CNTL); + + if (REG_GET_FIELD(bif_doorbell_intr_cntl, + BIF_BX0_BIF_DOORBELL_INT_CNTL, RAS_ATHUB_ERR_EVENT_INTERRUPT_STATUS)) { + /* driver has to clear the interrupt status when bif ring is disabled */ + bif_doorbell_intr_cntl = REG_SET_FIELD(bif_doorbell_intr_cntl, + BIF_BX0_BIF_DOORBELL_INT_CNTL, + RAS_ATHUB_ERR_EVENT_INTERRUPT_CLEAR, 1); + + WREG32_SOC15(NBIO, 0, regBIF_BX0_BIF_DOORBELL_INT_CNTL, bif_doorbell_intr_cntl); + + amdgpu_ras_global_ras_isr(adev); + } +} + +static int nbio_v7_9_set_ras_controller_irq_state(struct amdgpu_device *adev, + struct amdgpu_irq_src *src, + unsigned type, + enum amdgpu_interrupt_state state) +{ + /* Dummy function, there is no initialization operation in driver */ + + return 0; +} + +static int nbio_v7_9_process_ras_controller_irq(struct amdgpu_device *adev, + struct amdgpu_irq_src *source, + struct amdgpu_iv_entry *entry) +{ + /* By design, the ih cookie for ras_controller_irq should be written + * to BIFring instead of general iv ring. However, due to known bif ring + * hw bug, it has to be disabled. There is no chance the process function + * will be involked. Just left it as a dummy one. + */ + return 0; +} + +static int nbio_v7_9_set_ras_err_event_athub_irq_state(struct amdgpu_device *adev, + struct amdgpu_irq_src *src, + unsigned type, + enum amdgpu_interrupt_state state) +{ + /* Dummy function, there is no initialization operation in driver */ + + return 0; +} + +static int nbio_v7_9_process_err_event_athub_irq(struct amdgpu_device *adev, + struct amdgpu_irq_src *source, + struct amdgpu_iv_entry *entry) +{ + /* By design, the ih cookie for err_event_athub_irq should be written + * to BIFring instead of general iv ring. However, due to known bif ring + * hw bug, it has to be disabled. There is no chance the process function + * will be involked. Just left it as a dummy one. + */ + return 0; +} + +static const struct amdgpu_irq_src_funcs nbio_v7_9_ras_controller_irq_funcs = { + .set = nbio_v7_9_set_ras_controller_irq_state, + .process = nbio_v7_9_process_ras_controller_irq, +}; + +static const struct amdgpu_irq_src_funcs nbio_v7_9_ras_err_event_athub_irq_funcs = { + .set = nbio_v7_9_set_ras_err_event_athub_irq_state, + .process = nbio_v7_9_process_err_event_athub_irq, +}; + +static int nbio_v7_9_init_ras_controller_interrupt (struct amdgpu_device *adev) +{ + int r; + + /* init the irq funcs */ + adev->nbio.ras_controller_irq.funcs = + &nbio_v7_9_ras_controller_irq_funcs; + adev->nbio.ras_controller_irq.num_types = 1; + + /* register ras controller interrupt */ + r = amdgpu_irq_add_id(adev, SOC15_IH_CLIENTID_BIF, + NBIF_7_4__SRCID__RAS_CONTROLLER_INTERRUPT, + &adev->nbio.ras_controller_irq); + + return r; +} + +static int nbio_v7_9_init_ras_err_event_athub_interrupt (struct amdgpu_device *adev) +{ + + int r; + + /* init the irq funcs */ + adev->nbio.ras_err_event_athub_irq.funcs = + &nbio_v7_9_ras_err_event_athub_irq_funcs; + adev->nbio.ras_err_event_athub_irq.num_types = 1; + + /* register ras err event athub interrupt */ + r = amdgpu_irq_add_id(adev, SOC15_IH_CLIENTID_BIF, + NBIF_7_4__SRCID__ERREVENT_ATHUB_INTERRUPT, + &adev->nbio.ras_err_event_athub_irq); + + return r; +} + +const struct amdgpu_ras_block_hw_ops nbio_v7_9_ras_hw_ops = { + .query_ras_error_count = nbio_v7_9_query_ras_error_count, +}; + +struct amdgpu_nbio_ras nbio_v7_9_ras = { + .ras_block = { + .ras_comm = { + .name = "pcie_bif", + .block = AMDGPU_RAS_BLOCK__PCIE_BIF, + .type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE, + }, + .hw_ops = &nbio_v7_9_ras_hw_ops, + .ras_late_init = amdgpu_nbio_ras_late_init, + }, + .handle_ras_controller_intr_no_bifring = nbio_v7_9_handle_ras_controller_intr_no_bifring, + .handle_ras_err_event_athub_intr_no_bifring = nbio_v7_9_handle_ras_err_event_athub_intr_no_bifring, + .init_ras_controller_interrupt = nbio_v7_9_init_ras_controller_interrupt, + .init_ras_err_event_athub_interrupt = nbio_v7_9_init_ras_err_event_athub_interrupt, +}; diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v7_9.h b/drivers/gpu/drm/amd/amdgpu/nbio_v7_9.h index 8e04eb484328..73709771950d 100644 --- a/drivers/gpu/drm/amd/amdgpu/nbio_v7_9.h +++ b/drivers/gpu/drm/amd/amdgpu/nbio_v7_9.h @@ -28,5 +28,6 @@ extern const struct nbio_hdp_flush_reg nbio_v7_9_hdp_flush_reg; extern const struct amdgpu_nbio_funcs nbio_v7_9_funcs; +extern struct amdgpu_nbio_ras nbio_v7_9_ras; #endif -- cgit v1.2.3 From bc0f80802d735950738167e2cc0b51b0dd41e68d Mon Sep 17 00:00:00 2001 From: Candice Li Date: Tue, 8 Aug 2023 16:59:18 +0800 Subject: drm/amdgpu: Extend poison mode check to SDMA/VCN/JPEG Treat SDMA/VCN/JPEG as RAS capable IP blocks in poison mode. Signed-off-by: Candice Li Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 00658c2816dc..886c7c6f1c3b 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -3148,7 +3148,10 @@ int amdgpu_ras_is_supported(struct amdgpu_device *adev, * that the ras block supports ras function. */ if (!ret && - block == AMDGPU_RAS_BLOCK__GFX && + (block == AMDGPU_RAS_BLOCK__GFX || + block == AMDGPU_RAS_BLOCK__SDMA || + block == AMDGPU_RAS_BLOCK__VCN || + block == AMDGPU_RAS_BLOCK__JPEG) && amdgpu_ras_is_poison_mode_supported(adev) && amdgpu_ras_get_ras_block(adev, block, 0)) ret = 1; -- cgit v1.2.3 From 8b3a7a707c6c5f7ccde47cf2427a560675cc5202 Mon Sep 17 00:00:00 2001 From: Hawking Zhang Date: Wed, 9 Aug 2023 19:07:50 +0800 Subject: drm/amdgpu: Remove unnecessary ras cap check RAS global isr will only be invoked by hardware interrupt. Don't need to query ras capability in isr In addition, amdgpu_ras_interrupt_fatal_error_handler ensures the isr won't be called from guest linux side by accident. The RAS cap check in isr that introduced to fix sriov crash is not needed any more Signed-off-by: Hawking Zhang Reviewed-by: Tao Zhou Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 886c7c6f1c3b..da52d7c6d210 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -2970,10 +2970,6 @@ int amdgpu_ras_fini(struct amdgpu_device *adev) void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev) { - amdgpu_ras_check_supported(adev); - if (!adev->ras_hw_enabled) - return; - if (atomic_cmpxchg(&amdgpu_ras_in_intr, 0, 1) == 0) { struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); -- cgit v1.2.3 From 1b98a5f8e04b944ba93444a8004690391a60fcf1 Mon Sep 17 00:00:00 2001 From: YiPeng Chai Date: Tue, 8 Aug 2023 10:02:30 +0800 Subject: drm/amdgpu: mode1 reset needs to recover mp1 for mp0 v13_0_10 Mode1 reset needs to recover mp1 in fatal error case for mp0 v13_0_10. v2: Define a macro to wrap psp function calls. Signed-off-by: YiPeng Chai Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h | 5 +++++ drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 2 ++ drivers/gpu/drm/amd/amdgpu/psp_v13_0.c | 24 +++++++++++++++++++++++- 3 files changed, 30 insertions(+), 1 deletion(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h index 3c7d9051b8c6..3384eb94fde0 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h @@ -132,6 +132,7 @@ struct psp_funcs { int (*read_usbc_pd_fw)(struct psp_context *psp, uint32_t *fw_ver); int (*update_spirom)(struct psp_context *psp, uint64_t fw_pri_mc_addr); int (*vbflash_stat)(struct psp_context *psp); + int (*fatal_error_recovery_quirk)(struct psp_context *psp); }; struct ta_funcs { @@ -445,6 +446,10 @@ struct amdgpu_psp_funcs { ((psp)->funcs->vbflash_stat ? \ (psp)->funcs->vbflash_stat((psp)) : -EINVAL) +#define psp_fatal_error_recovery_quirk(psp) \ + ((psp)->funcs->fatal_error_recovery_quirk ? \ + (psp)->funcs->fatal_error_recovery_quirk((psp)) : 0) + extern const struct amd_ip_funcs psp_ip_funcs; extern const struct amdgpu_ip_block_version psp_v3_1_ip_block; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index da52d7c6d210..7689395e44fd 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -2064,6 +2064,8 @@ static void amdgpu_ras_do_recovery(struct work_struct *work) if (ras->gpu_reset_flags & AMDGPU_RAS_GPU_RESET_MODE1_RESET) { ras->gpu_reset_flags &= ~AMDGPU_RAS_GPU_RESET_MODE1_RESET; set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); + + psp_fatal_error_recovery_quirk(&adev->psp); } } diff --git a/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c b/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c index 58db1ee631b3..10b17bd5aebe 100644 --- a/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c +++ b/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c @@ -691,6 +691,27 @@ static int psp_v13_0_vbflash_status(struct psp_context *psp) return RREG32_SOC15(MP0, 0, regMP0_SMN_C2PMSG_115); } +static int psp_v13_0_fatal_error_recovery_quirk(struct psp_context *psp) +{ + struct amdgpu_device *adev = psp->adev; + + if (adev->ip_versions[MP0_HWIP][0] == IP_VERSION(13, 0, 10)) { + uint32_t reg_data; + /* MP1 fatal error: trigger PSP dram read to unhalt PSP + * during MP1 triggered sync flood. + */ + reg_data = RREG32_SOC15(MP0, 0, regMP0_SMN_C2PMSG_67); + WREG32_SOC15(MP0, 0, regMP0_SMN_C2PMSG_67, reg_data + 0x10); + + /* delay 1000ms for the mode1 reset for fatal error + * to be recovered back. + */ + msleep(1000); + } + + return 0; +} + static const struct psp_funcs psp_v13_0_funcs = { .init_microcode = psp_v13_0_init_microcode, .bootloader_load_kdb = psp_v13_0_bootloader_load_kdb, @@ -710,7 +731,8 @@ static const struct psp_funcs psp_v13_0_funcs = { .load_usbc_pd_fw = psp_v13_0_load_usbc_pd_fw, .read_usbc_pd_fw = psp_v13_0_read_usbc_pd_fw, .update_spirom = psp_v13_0_update_spirom, - .vbflash_stat = psp_v13_0_vbflash_status + .vbflash_stat = psp_v13_0_vbflash_status, + .fatal_error_recovery_quirk = psp_v13_0_fatal_error_recovery_quirk, }; void psp_v13_0_set_psp_funcs(struct psp_context *psp) -- cgit v1.2.3 From e81c45568505913a2275c6f60577e348d9786743 Mon Sep 17 00:00:00 2001 From: YiPeng Chai Date: Tue, 15 Aug 2023 17:39:30 +0800 Subject: drm/amdgpu: Enable ras for mp0 v13_0_6 sriov Enable ras for mp0 v13_0_6 sriov Signed-off-by: YiPeng Chai Reviewed-by: Stanley.Yang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 7689395e44fd..378478cf9c21 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -2399,6 +2399,7 @@ static bool amdgpu_ras_asic_supported(struct amdgpu_device *adev) if (amdgpu_sriov_vf(adev)) { switch (adev->ip_versions[MP0_HWIP][0]) { case IP_VERSION(13, 0, 2): + case IP_VERSION(13, 0, 6): return true; default: return false; -- cgit v1.2.3 From 9f051d6ff13fb20b424a86672db42746aa27d963 Mon Sep 17 00:00:00 2001 From: Hawking Zhang Date: Tue, 29 Aug 2023 23:20:27 +0800 Subject: drm/amdgpu: Free ras cmd input buffer properly Do not access the pointer for ras input cmd buffer if it is even not allocated. Signed-off-by: Hawking Zhang Reviewed-by: Stanley Yang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 378478cf9c21..3c4600e15b86 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -764,7 +764,7 @@ int amdgpu_ras_feature_enable(struct amdgpu_device *adev, { struct amdgpu_ras *con = amdgpu_ras_get_context(adev); union ta_ras_cmd_input *info; - int ret = 0; + int ret; if (!con) return -EINVAL; @@ -773,7 +773,7 @@ int amdgpu_ras_feature_enable(struct amdgpu_device *adev, if (enable && head->block != AMDGPU_RAS_BLOCK__GFX && !amdgpu_ras_is_feature_allowed(adev, head)) - goto out; + return 0; /* Only enable gfx ras feature from host side */ if (head->block == AMDGPU_RAS_BLOCK__GFX && @@ -801,16 +801,16 @@ int amdgpu_ras_feature_enable(struct amdgpu_device *adev, enable ? "enable":"disable", get_ras_block_str(head), amdgpu_ras_is_poison_mode_supported(adev), ret); - goto out; + return ret; } + + kfree(info); } /* setup the obj */ __amdgpu_ras_feature_enable(adev, head, enable); -out: - if (head->block == AMDGPU_RAS_BLOCK__GFX) - kfree(info); - return ret; + + return 0; } /* Only used in device probe stage and called only once. */ -- cgit v1.2.3 From ffd6bde302061aeee405ab364403af30210f0b99 Mon Sep 17 00:00:00 2001 From: Hawking Zhang Date: Fri, 8 Sep 2023 21:21:55 +0800 Subject: drm/amdgpu: fallback to old RAS error message for aqua_vanjaram So driver doesn't generate incorrect message until the new format is settled down for aqua_vanjaram Signed-off-by: Hawking Zhang Reviewed-by: Yang Wang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 3c4600e15b86..937c54fc7174 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -1052,7 +1052,8 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev, info->ce_count = obj->err_data.ce_count; if (err_data.ce_count) { - if (adev->smuio.funcs && + if (!adev->aid_mask && + adev->smuio.funcs && adev->smuio.funcs->get_socket_id && adev->smuio.funcs->get_die_id) { dev_info(adev->dev, "socket: %d, die: %d " @@ -1072,7 +1073,8 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev, } } if (err_data.ue_count) { - if (adev->smuio.funcs && + if (!adev->aid_mask && + adev->smuio.funcs && adev->smuio.funcs->get_socket_id && adev->smuio.funcs->get_die_id) { dev_info(adev->dev, "socket: %d, die: %d " -- cgit v1.2.3