From b4559a16467a13a12a660f31edec1801e2a5b16b Mon Sep 17 00:00:00 2001 From: Tom St Denis Date: Tue, 4 Jun 2019 09:36:57 -0400 Subject: drm/amd/amdgpu: remove vram_page_split kernel option (v3) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This option is no longer needed. The default code paths are now the only option. v2: Add HPAGE support and a default for non contiguous maps v3: Misread 512 pages as MiB ... Signed-off-by: Tom St Denis Reviewed-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index 78706dfa753a..b87b57f62c5f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -108,7 +108,6 @@ int amdgpu_vm_fragment_size = -1; int amdgpu_vm_block_size = -1; int amdgpu_vm_fault_stop = 0; int amdgpu_vm_debug = 0; -int amdgpu_vram_page_split = 512; int amdgpu_vm_update_mode = -1; int amdgpu_exp_hw_support = 0; int amdgpu_dc = -1; @@ -342,13 +341,6 @@ module_param_named(vm_debug, amdgpu_vm_debug, int, 0644); MODULE_PARM_DESC(vm_update_mode, "VM update using CPU (0 = never (default except for large BAR(LB)), 1 = Graphics only, 2 = Compute only (default for LB), 3 = Both"); module_param_named(vm_update_mode, amdgpu_vm_update_mode, int, 0444); -/** - * DOC: vram_page_split (int) - * Override the number of pages after we split VRAM allocations (default 512, -1 = disable). The default is 512. - */ -MODULE_PARM_DESC(vram_page_split, "Number of pages after we split VRAM allocations (default 512, -1 = disable)"); -module_param_named(vram_page_split, amdgpu_vram_page_split, int, 0444); - /** * DOC: exp_hw_support (int) * Enable experimental hw support (1 = enable). The default is 0 (disabled). -- cgit v1.2.3 From ad4de27f48f0b2411f4a703db557a8d8c91b3a50 Mon Sep 17 00:00:00 2001 From: Nicholas Kazlauskas Date: Thu, 6 Jun 2019 08:44:37 -0400 Subject: drm/amdgpu: Add module parameter for specifying default ABM level [Why] It's non trivial to configure or specify an ABM reduction level for userspace outside of X. There is also no method to specify the default ABM value at boot time. A parameter should be added to configure this. [How] Expose a module parameter that can specify the default ABM level to use for eDP connectors on DC enabled hardware that loads the DMCU firmware. The default is still disabled (0), but levels can range from 1-4. Levels control how much the backlight can be reduced, with being the least amount of reduction and four being the most reduction. Signed-off-by: Nicholas Kazlauskas Reviewed-by: David Francis Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 1 + drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 16 ++++++++++++++++ 2 files changed, 17 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index 113bcc574210..03fd37307896 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -154,6 +154,7 @@ extern int amdgpu_gpu_recovery; extern int amdgpu_emu_mode; extern uint amdgpu_smu_memory_pool_size; extern uint amdgpu_dc_feature_mask; +extern uint amdgpu_dm_abm_level; extern struct amdgpu_mgpu_info mgpu_info; extern int amdgpu_ras_enable; extern uint amdgpu_ras_mask; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index b87b57f62c5f..224c10a87d24 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -678,6 +678,22 @@ MODULE_PARM_DESC(hws_gws_support, "MEC FW support gws barriers (false = not supp MODULE_PARM_DESC(dcfeaturemask, "all stable DC features enabled (default))"); module_param_named(dcfeaturemask, amdgpu_dc_feature_mask, uint, 0444); +/** + * DOC: abmlevel (uint) + * Override the default ABM (Adaptive Backlight Management) level used for DC + * enabled hardware. Requires DMCU to be supported and loaded. + * Valid levels are 0-4. A value of 0 indicates that ABM should be disabled by + * default. Values 1-4 control the maximum allowable brightness reduction via + * the ABM algorithm, with 1 being the least reduction and 4 being the most + * reduction. + * + * Defaults to 0, or disabled. Userspace can still override this level later + * after boot. + */ +uint amdgpu_dm_abm_level = 0; +MODULE_PARM_DESC(abmlevel, "ABM level (0 = off (default), 1-4 = backlight reduction level) "); +module_param_named(abmlevel, amdgpu_dm_abm_level, uint, 0444); + static const struct pci_device_id pciidlist[] = { #ifdef CONFIG_DRM_AMDGPU_SI {0x1002, 0x6780, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_TAHITI}, -- cgit v1.2.3 From 635e2c5f0a1f7fa94cf8c8b027a5ff59f7af630c Mon Sep 17 00:00:00 2001 From: Marek Olšák Date: Tue, 4 Jun 2019 22:48:29 -0400 Subject: drm/amdgpu: bump the DRM version for GDS ENOMEM fixes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit So userspace knows when this fix is available. Signed-off-by: Marek Olšák Reviewed-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index 224c10a87d24..f4d221ea5779 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -76,9 +76,10 @@ * - 3.30.0 - Add AMDGPU_SCHED_OP_CONTEXT_PRIORITY_OVERRIDE. * - 3.31.0 - Add support for per-flip tiling attribute changes with DC * - 3.32.0 - Add syncobj timeline support to AMDGPU_CS. + * - 3.33.0 - Fixes for GDS ENOMEM failures in AMDGPU_CS. */ #define KMS_DRIVER_MAJOR 3 -#define KMS_DRIVER_MINOR 32 +#define KMS_DRIVER_MINOR 33 #define KMS_DRIVER_PATCHLEVEL 0 #define AMDGPU_MAX_TIMEOUT_PARAM_LENTH 256 -- cgit v1.2.3 From 51bcce4621d86b71d75f8f493d65e83b72748fce Mon Sep 17 00:00:00 2001 From: Hawking Zhang Date: Tue, 31 Jul 2018 15:00:40 +0800 Subject: drm/amdgpu: add module parameter for async_gfx_ring enablement 0 means disable async_gfx_ring and is the default setting 1 means enable async_gfx_ring Signed-off-by: Hawking Zhang Reviewed-by: Jack Xiao Reviewed-by: Alex Deucher Acked-by: Felix Kuehling Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 1 + drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 9 +++++++++ 2 files changed, 10 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index 53c2f55b2ffc..221f420c0a74 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -158,6 +158,7 @@ extern uint amdgpu_dm_abm_level; extern struct amdgpu_mgpu_info mgpu_info; extern int amdgpu_ras_enable; extern uint amdgpu_ras_mask; +extern int amdgpu_async_gfx_ring; #ifdef CONFIG_DRM_AMDGPU_SI extern int amdgpu_si_support; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index f4d221ea5779..7a2ba2b420d8 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -136,6 +136,7 @@ int amdgpu_emu_mode = 0; uint amdgpu_smu_memory_pool_size = 0; /* FBC (bit 0) disabled by default*/ uint amdgpu_dc_feature_mask = 0; +int amdgpu_async_gfx_ring = 0; struct amdgpu_mgpu_info mgpu_info = { .mutex = __MUTEX_INITIALIZER(mgpu_info.mutex), @@ -565,6 +566,14 @@ MODULE_PARM_DESC(smu_memory_pool_size, "0x1 = 256Mbyte, 0x2 = 512Mbyte, 0x4 = 1 Gbyte, 0x8 = 2GByte"); module_param_named(smu_memory_pool_size, amdgpu_smu_memory_pool_size, uint, 0444); +/** + * DOC: async_gfx_ring (int) + * It is used to enable gfx rings that could be configured with different prioritites or equal priorities + */ +MODULE_PARM_DESC(async_gfx_ring, + "Asynchronous GFX rings that could be configured with either different priorities (HP3D ring and LP3D ring), or equal priorities (0 = disabled (default), 1 = enabled)"); +module_param_named(async_gfx_ring, amdgpu_async_gfx_ring, int, 0444); + #ifdef CONFIG_HSA_AMD /** * DOC: sched_policy (int) -- cgit v1.2.3 From 5bfca0692849b0faea3f884441a531bf88d0b525 Mon Sep 17 00:00:00 2001 From: Jack Xiao Date: Thu, 21 Mar 2019 18:20:23 +0800 Subject: drm/amdgpu: enable async gfx ring by default VDDGFX requires gfx queue to be installed via MAP_QUEUES packet. Hence, enable async gfx ring by default. Signed-off-by: Jack Xiao Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index 7a2ba2b420d8..d1a0c013e27e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -136,7 +136,7 @@ int amdgpu_emu_mode = 0; uint amdgpu_smu_memory_pool_size = 0; /* FBC (bit 0) disabled by default*/ uint amdgpu_dc_feature_mask = 0; -int amdgpu_async_gfx_ring = 0; +int amdgpu_async_gfx_ring = 1; struct amdgpu_mgpu_info mgpu_info = { .mutex = __MUTEX_INITIALIZER(mgpu_info.mutex), @@ -571,7 +571,7 @@ module_param_named(smu_memory_pool_size, amdgpu_smu_memory_pool_size, uint, 0444 * It is used to enable gfx rings that could be configured with different prioritites or equal priorities */ MODULE_PARM_DESC(async_gfx_ring, - "Asynchronous GFX rings that could be configured with either different priorities (HP3D ring and LP3D ring), or equal priorities (0 = disabled (default), 1 = enabled)"); + "Asynchronous GFX rings that could be configured with either different priorities (HP3D ring and LP3D ring), or equal priorities (0 = disabled, 1 = enabled (default))"); module_param_named(async_gfx_ring, amdgpu_async_gfx_ring, int, 0444); #ifdef CONFIG_HSA_AMD -- cgit v1.2.3 From b239c01727459ba08c44b79e6225d3c58723f282 Mon Sep 17 00:00:00 2001 From: Jack Xiao Date: Thu, 10 Jan 2019 15:43:33 +0800 Subject: drm/amdgpu: add mcbp driver parameter Add mcbp driver parameter, so that mcbp feature can be enabled/disabled by driver parameter. Acked-by: Hawking Zhang Signed-off-by: Jack Xiao Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 1 + drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 3 +++ drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 5 +++++ 3 files changed, 9 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index 221f420c0a74..63bb6249e455 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -159,6 +159,7 @@ extern struct amdgpu_mgpu_info mgpu_info; extern int amdgpu_ras_enable; extern uint amdgpu_ras_mask; extern int amdgpu_async_gfx_ring; +extern int amdgpu_mcbp; #ifdef CONFIG_DRM_AMDGPU_SI extern int amdgpu_si_support; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index b2a9de32c61f..f83b8aa9da78 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -2575,6 +2575,9 @@ int amdgpu_device_init(struct amdgpu_device *adev, amdgpu_device_get_pcie_info(adev); + if (amdgpu_mcbp) + DRM_INFO("MCBP is enabled\n"); + /* early init functions */ r = amdgpu_device_ip_early_init(adev); if (r) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index d1a0c013e27e..fc92f7106c6d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -137,6 +137,7 @@ uint amdgpu_smu_memory_pool_size = 0; /* FBC (bit 0) disabled by default*/ uint amdgpu_dc_feature_mask = 0; int amdgpu_async_gfx_ring = 1; +int amdgpu_mcbp = 0; struct amdgpu_mgpu_info mgpu_info = { .mutex = __MUTEX_INITIALIZER(mgpu_info.mutex), @@ -574,6 +575,10 @@ MODULE_PARM_DESC(async_gfx_ring, "Asynchronous GFX rings that could be configured with either different priorities (HP3D ring and LP3D ring), or equal priorities (0 = disabled, 1 = enabled (default))"); module_param_named(async_gfx_ring, amdgpu_async_gfx_ring, int, 0444); +MODULE_PARM_DESC(mcbp, + "Enable Mid-command buffer preemption (0 = disabled (default), 1 = enabled)"); +module_param_named(mcbp, amdgpu_mcbp, int, 0444); + #ifdef CONFIG_HSA_AMD /** * DOC: sched_policy (int) -- cgit v1.2.3 From a190d1c75c73ce7cab3dc7261061d877ffbd8517 Mon Sep 17 00:00:00 2001 From: Xiaojie Yuan Date: Wed, 27 Mar 2019 12:39:18 +0800 Subject: drm/amdgpu/discovery: add module param for ip discovery enablement to control enablement. Signed-off-by: Xiaojie Yuan Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 1 + drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 10 ++++++++++ drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 5 +++++ 3 files changed, 16 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index baadd009cdca..60b5ae7dac84 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -161,6 +161,7 @@ extern int amdgpu_ras_enable; extern uint amdgpu_ras_mask; extern int amdgpu_async_gfx_ring; extern int amdgpu_mcbp; +extern int amdgpu_discovery; #ifdef CONFIG_DRM_AMDGPU_SI extern int amdgpu_si_support; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 716e35a19d08..107a1950a711 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -2578,6 +2578,14 @@ int amdgpu_device_init(struct amdgpu_device *adev, if (amdgpu_mcbp) DRM_INFO("MCBP is enabled\n"); + if (amdgpu_discovery) { + r = amdgpu_discovery_init(adev); + if (r) { + dev_err(adev->dev, "amdgpu_discovery_init failed\n"); + return r; + } + } + /* early init functions */ r = amdgpu_device_ip_early_init(adev); if (r) @@ -2838,6 +2846,8 @@ void amdgpu_device_fini(struct amdgpu_device *adev) amdgpu_ucode_sysfs_fini(adev); amdgpu_pmu_fini(adev); amdgpu_debugfs_preempt_cleanup(adev); + if (amdgpu_discovery) + amdgpu_discovery_fini(adev); } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index fc92f7106c6d..df1a298b2843 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -138,6 +138,7 @@ uint amdgpu_smu_memory_pool_size = 0; uint amdgpu_dc_feature_mask = 0; int amdgpu_async_gfx_ring = 1; int amdgpu_mcbp = 0; +int amdgpu_discovery = 0; struct amdgpu_mgpu_info mgpu_info = { .mutex = __MUTEX_INITIALIZER(mgpu_info.mutex), @@ -579,6 +580,10 @@ MODULE_PARM_DESC(mcbp, "Enable Mid-command buffer preemption (0 = disabled (default), 1 = enabled)"); module_param_named(mcbp, amdgpu_mcbp, int, 0444); +MODULE_PARM_DESC(discovery, + "Allow driver to discover hardware IPs from IP Discovery table at the top of VRAM"); +module_param_named(discovery, amdgpu_discovery, int, 0444); + #ifdef CONFIG_HSA_AMD /** * DOC: sched_policy (int) -- cgit v1.2.3 From 384872846f719760645dbe1b0c70edccb9cc941a Mon Sep 17 00:00:00 2001 From: Jack Xiao Date: Fri, 25 Jan 2019 14:11:35 +0800 Subject: drm/amdgpu/mes: add amdgpu_mes driver parameter amdgpu_mes, which is a driver scope parameter, is used to whether enable mes or not. MES (Micro Engine Scheduler) is the new on chip hw scheduling microcontroller. It can be used to handle queue scheduling and preemption and priorities. Acked-by: Hawking Zhang Signed-off-by: Jack Xiao Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 1 + drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 5 +++++ 2 files changed, 6 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index 60b5ae7dac84..48abe15d744a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -162,6 +162,7 @@ extern uint amdgpu_ras_mask; extern int amdgpu_async_gfx_ring; extern int amdgpu_mcbp; extern int amdgpu_discovery; +extern int amdgpu_mes; #ifdef CONFIG_DRM_AMDGPU_SI extern int amdgpu_si_support; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index df1a298b2843..d23886bcc040 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -139,6 +139,7 @@ uint amdgpu_dc_feature_mask = 0; int amdgpu_async_gfx_ring = 1; int amdgpu_mcbp = 0; int amdgpu_discovery = 0; +int amdgpu_mes = 0; struct amdgpu_mgpu_info mgpu_info = { .mutex = __MUTEX_INITIALIZER(mgpu_info.mutex), @@ -584,6 +585,10 @@ MODULE_PARM_DESC(discovery, "Allow driver to discover hardware IPs from IP Discovery table at the top of VRAM"); module_param_named(discovery, amdgpu_discovery, int, 0444); +MODULE_PARM_DESC(mes, + "Enable Micro Engine Scheduler (0 = disabled (default), 1 = enabled)"); +module_param_named(mes, amdgpu_mes, int, 0444); + #ifdef CONFIG_HSA_AMD /** * DOC: sched_policy (int) -- cgit v1.2.3 From 14328aa58ce523a59996c5a82681c43ec048cc33 Mon Sep 17 00:00:00 2001 From: Philip Cox Date: Wed, 29 May 2019 23:03:45 -0500 Subject: drm/amdkfd: Add navi10 support to amdkfd. (v3) KFD (kernel fusion driver) is the kernel driver for the compute backend for usermode compute stack. v2: squash in updates (Alex) v3: squash in rebase fixes (Alex) Signed-off-by: Oak Zeng Signed-off-by: Philip Cox Acked-by: Alex Deucher Signed-off-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 8 + drivers/gpu/drm/amd/amdkfd/Makefile | 3 + drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h | 299 ++++++ .../gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm | 1124 ++++++++++++++++++++ drivers/gpu/drm/amd/amdkfd/kfd_crat.c | 5 + drivers/gpu/drm/amd/amdkfd/kfd_device.c | 31 +- .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 15 +- .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.h | 4 +- .../drm/amd/amdkfd/kfd_device_queue_manager_v10.c | 88 ++ drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c | 1 + drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c | 3 + drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.h | 1 + drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_v10.c | 348 ++++++ drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c | 498 +++++++++ drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c | 3 + drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 21 +- drivers/gpu/drm/amd/amdkfd/kfd_process.c | 1 + drivers/gpu/drm/amd/amdkfd/kfd_topology.c | 1 + 18 files changed, 2442 insertions(+), 12 deletions(-) create mode 100644 drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v10.c create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_v10.c create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index d23886bcc040..2c985fa6a9bc 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -693,6 +693,14 @@ MODULE_PARM_DESC(halt_if_hws_hang, "Halt if HWS hang is detected (0 = off (defau bool hws_gws_support; module_param(hws_gws_support, bool, 0444); MODULE_PARM_DESC(hws_gws_support, "MEC FW support gws barriers (false = not supported (Default), true = supported)"); + +/** + * DOC: queue_preemption_timeout_ms (int) + * queue preemption timeout in ms (1 = Minimum, 9000 = default) + */ +int queue_preemption_timeout_ms; +module_param(queue_preemption_timeout_ms, int, 0644); +MODULE_PARM_DESC(queue_preemption_timeout_ms, "queue preemption timeout in ms (1 = Minimum, 9000 = default)"); #endif /** diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile b/drivers/gpu/drm/amd/amdkfd/Makefile index 69ec96998bb9..48155060a57c 100644 --- a/drivers/gpu/drm/amd/amdkfd/Makefile +++ b/drivers/gpu/drm/amd/amdkfd/Makefile @@ -36,16 +36,19 @@ AMDKFD_FILES := $(AMDKFD_PATH)/kfd_module.o \ $(AMDKFD_PATH)/kfd_mqd_manager_cik.o \ $(AMDKFD_PATH)/kfd_mqd_manager_vi.o \ $(AMDKFD_PATH)/kfd_mqd_manager_v9.o \ + $(AMDKFD_PATH)/kfd_mqd_manager_v10.o \ $(AMDKFD_PATH)/kfd_kernel_queue.o \ $(AMDKFD_PATH)/kfd_kernel_queue_cik.o \ $(AMDKFD_PATH)/kfd_kernel_queue_vi.o \ $(AMDKFD_PATH)/kfd_kernel_queue_v9.o \ + $(AMDKFD_PATH)/kfd_kernel_queue_v10.o \ $(AMDKFD_PATH)/kfd_packet_manager.o \ $(AMDKFD_PATH)/kfd_process_queue_manager.o \ $(AMDKFD_PATH)/kfd_device_queue_manager.o \ $(AMDKFD_PATH)/kfd_device_queue_manager_cik.o \ $(AMDKFD_PATH)/kfd_device_queue_manager_vi.o \ $(AMDKFD_PATH)/kfd_device_queue_manager_v9.o \ + $(AMDKFD_PATH)/kfd_device_queue_manager_v10.o \ $(AMDKFD_PATH)/kfd_interrupt.o \ $(AMDKFD_PATH)/kfd_events.o \ $(AMDKFD_PATH)/cik_event_interrupt.o \ diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h index e413d4a71fa3..826913c70766 100644 --- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h +++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h @@ -561,3 +561,302 @@ static const uint32_t cwsr_trap_gfx9_hex[] = { 0xbf8a0000, 0x95806f6c, 0xbf810000, 0x00000000, }; + +static const uint32_t cwsr_trap_gfx10_hex[] = { + 0xbf820001, 0xbf82012e, + 0xb0804004, 0xb970f802, + 0x8a708670, 0xb971f803, + 0x8771ff71, 0x00000400, + 0xbf850008, 0xb971f803, + 0x8771ff71, 0x000001ff, + 0xbf850001, 0x806c846c, + 0x876dff6d, 0x0000ffff, + 0xbe80226c, 0xb971f803, + 0x8771ff71, 0x00000100, + 0xbf840006, 0xbef60380, + 0xb9f60203, 0x876dff6d, + 0x0000ffff, 0x80ec886c, + 0x82ed806d, 0xbef60380, + 0xb9f60283, 0xb973f816, + 0xb9762c07, 0x8f769c76, + 0x886d766d, 0xb97603c7, + 0x8f769b76, 0x886d766d, + 0xb976f807, 0x8776ff76, + 0x00007fff, 0xb9f6f807, + 0xbeee037e, 0xbeef037f, + 0xbefe0480, 0xbf900004, + 0xbf8e0002, 0xbf88fffe, + 0xbef4037e, 0x8775ff7f, + 0x0000ffff, 0x8875ff75, + 0x00040000, 0xbef60380, + 0xbef703ff, 0x00807fac, + 0x8776ff7f, 0x08000000, + 0x90768376, 0x88777677, + 0x8776ff7f, 0x70000000, + 0x90768176, 0x88777677, + 0xbefb037c, 0xbefa0380, + 0xb97202dc, 0x8872727f, + 0xbefe03c1, 0x877c8172, + 0xbf06817c, 0xbf850002, + 0xbeff0380, 0xbf820001, + 0xbeff03c1, 0xb9712a05, + 0x80718171, 0x8f718271, + 0x877c8172, 0xbf06817c, + 0xbf85000d, 0x8f768771, + 0xbef603ff, 0x01000000, + 0xbefc0380, 0x7e008700, + 0xe0704000, 0x7a5d0000, + 0x807c817c, 0x807aff7a, + 0x00000080, 0xbf0a717c, + 0xbf85fff8, 0xbf82001b, + 0x8f768871, 0xbef603ff, + 0x01000000, 0xbefc0380, + 0x7e008700, 0xe0704000, + 0x7a5d0000, 0x807c817c, + 0x807aff7a, 0x00000100, + 0xbf0a717c, 0xbf85fff8, + 0xb9711e06, 0x8771c171, + 0xbf84000c, 0x8f718371, + 0x80717c71, 0xbefe03c1, + 0xbeff0380, 0x7e008700, + 0xe0704000, 0x7a5d0000, + 0x807c817c, 0x807aff7a, + 0x00000080, 0xbf0a717c, + 0xbf85fff8, 0xbf8a0000, + 0x8776ff72, 0x04000000, + 0xbf84002b, 0xbefe03c1, + 0x877c8172, 0xbf06817c, + 0xbf850002, 0xbeff0380, + 0xbf820001, 0xbeff03c1, + 0xb9714306, 0x8771c171, + 0xbf840021, 0x8f718671, + 0x8f718271, 0xbef60371, + 0xbef603ff, 0x01000000, + 0xd7650000, 0x000100c1, + 0xd7660000, 0x000200c1, + 0x16000084, 0x877c8172, + 0xbf06817c, 0xbefc0380, + 0xbf85000a, 0x807cff7c, + 0x00000080, 0x807aff7a, + 0x00000080, 0xd5250000, + 0x0001ff00, 0x00000080, + 0xbf0a717c, 0xbf85fff7, + 0xbf820009, 0x807cff7c, + 0x00000100, 0x807aff7a, + 0x00000100, 0xd5250000, + 0x0001ff00, 0x00000100, + 0xbf0a717c, 0xbf85fff7, + 0x877c8172, 0xbf06817c, + 0xbf850003, 0x8f7687ff, + 0x0000006a, 0xbf820002, + 0x8f7688ff, 0x0000006a, + 0xbef603ff, 0x01000000, + 0x877c8172, 0xbf06817c, + 0xbefc0380, 0xbf800000, + 0xbf85000b, 0xbe802e00, + 0x7e000200, 0xe0704000, + 0x7a5d0000, 0x807aff7a, + 0x00000080, 0x807c817c, + 0xbf0aff7c, 0x0000006a, + 0xbf85fff6, 0xbf82000a, + 0xbe802e00, 0x7e000200, + 0xe0704000, 0x7a5d0000, + 0x807aff7a, 0x00000100, + 0x807c817c, 0xbf0aff7c, + 0x0000006a, 0xbf85fff6, + 0xbef60384, 0xbef603ff, + 0x01000000, 0x877c8172, + 0xbf06817c, 0xbf850030, + 0x7e00027b, 0xe0704000, + 0x7a5d0000, 0x807aff7a, + 0x00000080, 0x7e00026c, + 0xe0704000, 0x7a5d0000, + 0x807aff7a, 0x00000080, + 0x7e00026d, 0xe0704000, + 0x7a5d0000, 0x807aff7a, + 0x00000080, 0x7e00026e, + 0xe0704000, 0x7a5d0000, + 0x807aff7a, 0x00000080, + 0x7e00026f, 0xe0704000, + 0x7a5d0000, 0x807aff7a, + 0x00000080, 0x7e000270, + 0xe0704000, 0x7a5d0000, + 0x807aff7a, 0x00000080, + 0xb971f803, 0x7e000271, + 0xe0704000, 0x7a5d0000, + 0x807aff7a, 0x00000080, + 0x7e000273, 0xe0704000, + 0x7a5d0000, 0x807aff7a, + 0x00000080, 0xb97bf801, + 0x7e00027b, 0xe0704000, + 0x7a5d0000, 0x807aff7a, + 0x00000080, 0xbf82002f, + 0x7e00027b, 0xe0704000, + 0x7a5d0000, 0x807aff7a, + 0x00000100, 0x7e00026c, + 0xe0704000, 0x7a5d0000, + 0x807aff7a, 0x00000100, + 0x7e00026d, 0xe0704000, + 0x7a5d0000, 0x807aff7a, + 0x00000100, 0x7e00026e, + 0xe0704000, 0x7a5d0000, + 0x807aff7a, 0x00000100, + 0x7e00026f, 0xe0704000, + 0x7a5d0000, 0x807aff7a, + 0x00000100, 0x7e000270, + 0xe0704000, 0x7a5d0000, + 0x807aff7a, 0x00000100, + 0xb971f803, 0x7e000271, + 0xe0704000, 0x7a5d0000, + 0x807aff7a, 0x00000100, + 0x7e000273, 0xe0704000, + 0x7a5d0000, 0x807aff7a, + 0x00000100, 0xb97bf801, + 0x7e00027b, 0xe0704000, + 0x7a5d0000, 0x807aff7a, + 0x00000100, 0xbf820119, + 0xbef4037e, 0x8775ff7f, + 0x0000ffff, 0x8875ff75, + 0x00040000, 0xbef60380, + 0xbef703ff, 0x00807fac, + 0x8772ff7f, 0x08000000, + 0x90728372, 0x88777277, + 0x8772ff7f, 0x70000000, + 0x90728172, 0x88777277, + 0xb97902dc, 0x8879797f, + 0xbef80380, 0xbefe03c1, + 0x877c8179, 0xbf06817c, + 0xbf850002, 0xbeff0380, + 0xbf820001, 0xbeff03c1, + 0xb96f2a05, 0x806f816f, + 0x8f6f826f, 0x877c8179, + 0xbf06817c, 0xbf850013, + 0x8f76876f, 0xbef603ff, + 0x01000000, 0xbef20378, + 0x8078ff78, 0x00000080, + 0xbefc0381, 0xe0304000, + 0x785d0000, 0xbf8c3f70, + 0x7e008500, 0x807c817c, + 0x8078ff78, 0x00000080, + 0xbf0a6f7c, 0xbf85fff7, + 0xe0304000, 0x725d0000, + 0xbf820023, 0x8f76886f, + 0xbef603ff, 0x01000000, + 0xbef20378, 0x8078ff78, + 0x00000100, 0xbefc0381, + 0xe0304000, 0x785d0000, + 0xbf8c3f70, 0x7e008500, + 0x807c817c, 0x8078ff78, + 0x00000100, 0xbf0a6f7c, + 0xbf85fff7, 0xb96f1e06, + 0x876fc16f, 0xbf84000e, + 0x8f6f836f, 0x806f7c6f, + 0xbefe03c1, 0xbeff0380, + 0xe0304000, 0x785d0000, + 0xbf8c3f70, 0x7e008500, + 0x807c817c, 0x8078ff78, + 0x00000080, 0xbf0a6f7c, + 0xbf85fff7, 0xbeff03c1, + 0xe0304000, 0x725d0000, + 0x8772ff79, 0x04000000, + 0xbf840020, 0xbefe03c1, + 0x877c8179, 0xbf06817c, + 0xbf850002, 0xbeff0380, + 0xbf820001, 0xbeff03c1, + 0xb96f4306, 0x876fc16f, + 0xbf840016, 0x8f6f866f, + 0x8f6f826f, 0xbef6036f, + 0xbef603ff, 0x01000000, + 0x877c8172, 0xbf06817c, + 0xbefc0380, 0xbf850007, + 0x807cff7c, 0x00000080, + 0x8078ff78, 0x00000080, + 0xbf0a6f7c, 0xbf85fffa, + 0xbf820006, 0x807cff7c, + 0x00000100, 0x8078ff78, + 0x00000100, 0xbf0a6f7c, + 0xbf85fffa, 0x877c8179, + 0xbf06817c, 0xbf850003, + 0x8f7687ff, 0x0000006a, + 0xbf820002, 0x8f7688ff, + 0x0000006a, 0xbef603ff, + 0x01000000, 0x877c8179, + 0xbf06817c, 0xbf850012, + 0xf4211cba, 0xf0000000, + 0x8078ff78, 0x00000080, + 0xbefc0381, 0xf421003a, + 0xf0000000, 0x8078ff78, + 0x00000080, 0xbf8cc07f, + 0xbe803000, 0xbf800000, + 0x807c817c, 0xbf0aff7c, + 0x0000006a, 0xbf85fff5, + 0xbe800372, 0xbf820011, + 0xf4211cba, 0xf0000000, + 0x8078ff78, 0x00000100, + 0xbefc0381, 0xf421003a, + 0xf0000000, 0x8078ff78, + 0x00000100, 0xbf8cc07f, + 0xbe803000, 0xbf800000, + 0x807c817c, 0xbf0aff7c, + 0x0000006a, 0xbf85fff5, + 0xbe800372, 0xbef60384, + 0xbef603ff, 0x01000000, + 0x877c8179, 0xbf06817c, + 0xbf850025, 0xf4211bfa, + 0xf0000000, 0x8078ff78, + 0x00000080, 0xf4211b3a, + 0xf0000000, 0x8078ff78, + 0x00000080, 0xf4211b7a, + 0xf0000000, 0x8078ff78, + 0x00000080, 0xf4211eba, + 0xf0000000, 0x8078ff78, + 0x00000080, 0xf4211efa, + 0xf0000000, 0x8078ff78, + 0x00000080, 0xf4211c3a, + 0xf0000000, 0x8078ff78, + 0x00000080, 0xf4211c7a, + 0xf0000000, 0x8078ff78, + 0x00000080, 0xf4211cfa, + 0xf0000000, 0x8078ff78, + 0x00000080, 0xf4211e7a, + 0xf0000000, 0x8078ff78, + 0x00000080, 0xbf820024, + 0xf4211bfa, 0xf0000000, + 0x8078ff78, 0x00000100, + 0xf4211b3a, 0xf0000000, + 0x8078ff78, 0x00000100, + 0xf4211b7a, 0xf0000000, + 0x8078ff78, 0x00000100, + 0xf4211eba, 0xf0000000, + 0x8078ff78, 0x00000100, + 0xf4211efa, 0xf0000000, + 0x8078ff78, 0x00000100, + 0xf4211c3a, 0xf0000000, + 0x8078ff78, 0x00000100, + 0xf4211c7a, 0xf0000000, + 0x8078ff78, 0x00000100, + 0xf4211cfa, 0xf0000000, + 0x8078ff78, 0x00000100, + 0xf4211e7a, 0xf0000000, + 0x8078ff78, 0x00000100, + 0xbf8cc07f, 0x876dff6d, + 0x0000ffff, 0xbefc036f, + 0xbefe037a, 0xbeff037b, + 0x876f71ff, 0x000003ff, + 0xb9ef4803, 0xb9f3f816, + 0x876f71ff, 0xfffff800, + 0x906f8b6f, 0xb9efa2c3, + 0xb9f9f801, 0x876fff6d, + 0xf0000000, 0x906f9c6f, + 0x8f6f906f, 0xbef20380, + 0x88726f72, 0x876fff6d, + 0x08000000, 0x906f9b6f, + 0x8f6f8f6f, 0x88726f72, + 0x876fff70, 0x00800000, + 0x906f976f, 0xb9f2f807, + 0xb9f0f802, 0xbf8a0000, + 0xbe80226c, 0xbf810000, + 0xbf9f0000, 0xbf9f0000, + 0xbf9f0000, 0xbf9f0000, + 0xbf9f0000, 0x00000000, +}; diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm new file mode 100644 index 000000000000..f20e463e748b --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm @@ -0,0 +1,1124 @@ +/* + * Copyright 2018 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + + +shader main + +asic(DEFAULT) + +type(CS) + +wave_size(32) +/*************************************************************************/ +/* control on how to run the shader */ +/*************************************************************************/ +//any hack that needs to be made to run this code in EMU (either becasue various EMU code are not ready or no compute save & restore in EMU run) +var EMU_RUN_HACK = 0 +var EMU_RUN_HACK_RESTORE_NORMAL = 0 +var EMU_RUN_HACK_SAVE_NORMAL_EXIT = 0 +var EMU_RUN_HACK_SAVE_SINGLE_WAVE = 0 +var EMU_RUN_HACK_SAVE_FIRST_TIME = 0 //for interrupted restore in which the first save is through EMU_RUN_HACK +var SAVE_LDS = 0 +var WG_BASE_ADDR_LO = 0x9000a000 +var WG_BASE_ADDR_HI = 0x0 +var WAVE_SPACE = 0x9000 //memory size that each wave occupies in workgroup state mem, increase from 5000 to 9000 for more SGPR need to be saved +var CTX_SAVE_CONTROL = 0x0 +var CTX_RESTORE_CONTROL = CTX_SAVE_CONTROL +var SIM_RUN_HACK = 0 //any hack that needs to be made to run this code in SIM (either becasue various RTL code are not ready or no compute save & restore in RTL run) +var SGPR_SAVE_USE_SQC = 0 //use SQC D$ to do the write +var USE_MTBUF_INSTEAD_OF_MUBUF = 0 //need to change BUF_DATA_FORMAT in S_SAVE_BUF_RSRC_WORD3_MISC from 0 to BUF_DATA_FORMAT_32 if set to 1 (i.e. 0x00827FAC) +var SWIZZLE_EN = 0 //whether we use swizzled buffer addressing +var SAVE_RESTORE_HWID_DDID = 0 +var RESTORE_DDID_IN_SGPR18 = 0 +/**************************************************************************/ +/* variables */ +/**************************************************************************/ +var SQ_WAVE_STATUS_INST_ATC_SHIFT = 23 +var SQ_WAVE_STATUS_INST_ATC_MASK = 0x00800000 +var SQ_WAVE_STATUS_SPI_PRIO_MASK = 0x00000006 + +var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT = 12 +var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE = 9 +var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT = 8 +var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE = 6 +var SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT = 24 +var SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE = 4 //FIXME sq.blk still has 4 bits at this time while SQ programming guide has 3 bits +var SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SHIFT = 24 +var SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SIZE = 4 +var SQ_WAVE_IB_STS2_WAVE64_SHIFT = 11 +var SQ_WAVE_IB_STS2_WAVE64_SIZE = 1 + +var SQ_WAVE_TRAPSTS_SAVECTX_MASK = 0x400 +var SQ_WAVE_TRAPSTS_EXCE_MASK = 0x1FF // Exception mask +var SQ_WAVE_TRAPSTS_SAVECTX_SHIFT = 10 +var SQ_WAVE_TRAPSTS_MEM_VIOL_MASK = 0x100 +var SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT = 8 +var SQ_WAVE_TRAPSTS_PRE_SAVECTX_MASK = 0x3FF +var SQ_WAVE_TRAPSTS_PRE_SAVECTX_SHIFT = 0x0 +var SQ_WAVE_TRAPSTS_PRE_SAVECTX_SIZE = 10 +var SQ_WAVE_TRAPSTS_POST_SAVECTX_MASK = 0xFFFFF800 +var SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT = 11 +var SQ_WAVE_TRAPSTS_POST_SAVECTX_SIZE = 21 + +var SQ_WAVE_IB_STS_RCNT_SHIFT = 16 //FIXME +var SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT = 15 //FIXME +var SQ_WAVE_IB_STS_FIRST_REPLAY_SIZE = 1 //FIXME +var SQ_WAVE_IB_STS_RCNT_SIZE = 6 //FIXME +var SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK_NEG = 0x00007FFF //FIXME + +var SQ_BUF_RSRC_WORD1_ATC_SHIFT = 24 +var SQ_BUF_RSRC_WORD3_MTYPE_SHIFT = 27 + + +/* Save */ +var S_SAVE_BUF_RSRC_WORD1_STRIDE = 0x00040000 //stride is 4 bytes +var S_SAVE_BUF_RSRC_WORD3_MISC = 0x00807FAC //SQ_SEL_X/Y/Z/W, BUF_NUM_FORMAT_FLOAT, (0 for MUBUF stride[17:14] when ADD_TID_ENABLE and BUF_DATA_FORMAT_32 for MTBUF), ADD_TID_ENABLE + +var S_SAVE_SPI_INIT_ATC_MASK = 0x08000000 //bit[27]: ATC bit +var S_SAVE_SPI_INIT_ATC_SHIFT = 27 +var S_SAVE_SPI_INIT_MTYPE_MASK = 0x70000000 //bit[30:28]: Mtype +var S_SAVE_SPI_INIT_MTYPE_SHIFT = 28 +var S_SAVE_SPI_INIT_FIRST_WAVE_MASK = 0x04000000 //bit[26]: FirstWaveInTG +var S_SAVE_SPI_INIT_FIRST_WAVE_SHIFT = 26 + +var S_SAVE_PC_HI_RCNT_SHIFT = 28 //FIXME check with Brian to ensure all fields other than PC[47:0] can be used +var S_SAVE_PC_HI_RCNT_MASK = 0xF0000000 //FIXME +var S_SAVE_PC_HI_FIRST_REPLAY_SHIFT = 27 //FIXME +var S_SAVE_PC_HI_FIRST_REPLAY_MASK = 0x08000000 //FIXME + +var s_save_spi_init_lo = exec_lo +var s_save_spi_init_hi = exec_hi + +var s_save_pc_lo = ttmp0 //{TTMP1, TTMP0} = {3¡¯h0,pc_rewind[3:0], HT[0],trapID[7:0], PC[47:0]} +var s_save_pc_hi = ttmp1 +var s_save_exec_lo = ttmp2 +var s_save_exec_hi = ttmp3 +var s_save_status = ttmp4 +var s_save_trapsts = ttmp5 //not really used until the end of the SAVE routine +var s_wave_size = ttmp6 //ttmp6 is not needed now, since it's only 32bit xnack mask, now use it to determine wave32 or wave64 in EMU_HACK +var s_save_xnack_mask = ttmp7 +var s_save_buf_rsrc0 = ttmp8 +var s_save_buf_rsrc1 = ttmp9 +var s_save_buf_rsrc2 = ttmp10 +var s_save_buf_rsrc3 = ttmp11 + +var s_save_mem_offset = ttmp14 +var s_sgpr_save_num = 106 //in gfx10, all sgpr must be saved +var s_save_alloc_size = s_save_trapsts //conflict +var s_save_tmp = s_save_buf_rsrc2 //shared with s_save_buf_rsrc2 (conflict: should not use mem access with s_save_tmp at the same time) +var s_save_m0 = ttmp15 + +/* Restore */ +var S_RESTORE_BUF_RSRC_WORD1_STRIDE = S_SAVE_BUF_RSRC_WORD1_STRIDE +var S_RESTORE_BUF_RSRC_WORD3_MISC = S_SAVE_BUF_RSRC_WORD3_MISC + +var S_RESTORE_SPI_INIT_ATC_MASK = 0x08000000 //bit[27]: ATC bit +var S_RESTORE_SPI_INIT_ATC_SHIFT = 27 +var S_RESTORE_SPI_INIT_MTYPE_MASK = 0x70000000 //bit[30:28]: Mtype +var S_RESTORE_SPI_INIT_MTYPE_SHIFT = 28 +var S_RESTORE_SPI_INIT_FIRST_WAVE_MASK = 0x04000000 //bit[26]: FirstWaveInTG +var S_RESTORE_SPI_INIT_FIRST_WAVE_SHIFT = 26 + +var S_RESTORE_PC_HI_RCNT_SHIFT = S_SAVE_PC_HI_RCNT_SHIFT +var S_RESTORE_PC_HI_RCNT_MASK = S_SAVE_PC_HI_RCNT_MASK +var S_RESTORE_PC_HI_FIRST_REPLAY_SHIFT = S_SAVE_PC_HI_FIRST_REPLAY_SHIFT +var S_RESTORE_PC_HI_FIRST_REPLAY_MASK = S_SAVE_PC_HI_FIRST_REPLAY_MASK + +var s_restore_spi_init_lo = exec_lo +var s_restore_spi_init_hi = exec_hi + +var s_restore_mem_offset = ttmp12 +var s_restore_alloc_size = ttmp3 +var s_restore_tmp = ttmp6 +var s_restore_mem_offset_save = s_restore_tmp //no conflict + +var s_restore_m0 = s_restore_alloc_size //no conflict + +var s_restore_mode = ttmp13 +var s_restore_hwid1 = ttmp2 +var s_restore_ddid = s_restore_hwid1 +var s_restore_pc_lo = ttmp0 +var s_restore_pc_hi = ttmp1 +var s_restore_exec_lo = ttmp14 +var s_restore_exec_hi = ttmp15 +var s_restore_status = ttmp4 +var s_restore_trapsts = ttmp5 +//var s_restore_xnack_mask_lo = xnack_mask_lo +//var s_restore_xnack_mask_hi = xnack_mask_hi +var s_restore_xnack_mask = ttmp7 +var s_restore_buf_rsrc0 = ttmp8 +var s_restore_buf_rsrc1 = ttmp9 +var s_restore_buf_rsrc2 = ttmp10 +var s_restore_buf_rsrc3 = ttmp11 +var s_restore_size = ttmp13 //ttmp13 has no conflict + +/**************************************************************************/ +/* trap handler entry points */ +/**************************************************************************/ + if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL)) //hack to use trap_id for determining save/restore + //FIXME VCCZ un-init assertion s_getreg_b32 s_save_status, hwreg(HW_REG_STATUS) //save STATUS since we will change SCC + s_and_b32 s_save_tmp, s_save_pc_hi, 0xffff0000 //change SCC + s_cmp_eq_u32 s_save_tmp, 0x007e0000 //Save: trap_id = 0x7e. Restore: trap_id = 0x7f. + s_cbranch_scc0 L_JUMP_TO_RESTORE //do not need to recover STATUS here since we are going to RESTORE + //FIXME s_setreg_b32 hwreg(HW_REG_STATUS), s_save_status //need to recover STATUS since we are going to SAVE + s_branch L_SKIP_RESTORE //NOT restore, SAVE actually + else + s_branch L_SKIP_RESTORE //NOT restore. might be a regular trap or save + end + +L_JUMP_TO_RESTORE: + s_branch L_RESTORE //restore + +L_SKIP_RESTORE: + + s_getreg_b32 s_save_status, hwreg(HW_REG_STATUS) //save STATUS since we will change SCC + s_andn2_b32 s_save_status, s_save_status, SQ_WAVE_STATUS_SPI_PRIO_MASK //check whether this is for save + s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS) + s_and_b32 s_save_trapsts, s_save_trapsts, SQ_WAVE_TRAPSTS_SAVECTX_MASK //check whether this is for save + s_cbranch_scc1 L_SAVE //this is the operation for save + + // ********* Handle non-CWSR traps ******************* + if (!EMU_RUN_HACK) + s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS) + s_and_b32 s_save_trapsts, s_save_trapsts, SQ_WAVE_TRAPSTS_EXCE_MASK // Check whether it is an exception + s_cbranch_scc1 L_EXCP_CASE // Exception, jump back to the shader program directly. + s_add_u32 ttmp0, ttmp0, 4 // S_TRAP case, add 4 to ttmp0 + + L_EXCP_CASE: + s_and_b32 ttmp1, ttmp1, 0xFFFF + s_rfe_b64 [ttmp0, ttmp1] + end + // ********* End handling of non-CWSR traps ******************* + +/**************************************************************************/ +/* save routine */ +/**************************************************************************/ + +L_SAVE: + + //check whether there is mem_viol + s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS) + s_and_b32 s_save_trapsts, s_save_trapsts, SQ_WAVE_TRAPSTS_MEM_VIOL_MASK + s_cbranch_scc0 L_NO_PC_REWIND + + //if so, need rewind PC assuming GDS operation gets NACKed + s_mov_b32 s_save_tmp, 0 //clear mem_viol bit + s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT, 1), s_save_tmp //clear mem_viol bit + s_and_b32 s_save_pc_hi, s_save_pc_hi, 0x0000ffff //pc[47:32] + s_sub_u32 s_save_pc_lo, s_save_pc_lo, 8 //pc[31:0]-8 + s_subb_u32 s_save_pc_hi, s_save_pc_hi, 0x0 // -scc + +L_NO_PC_REWIND: + s_mov_b32 s_save_tmp, 0 //clear saveCtx bit + s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_SAVECTX_SHIFT, 1), s_save_tmp //clear saveCtx bit + + //s_mov_b32 s_save_xnack_mask_lo, xnack_mask_lo //save XNACK_MASK + //s_mov_b32 s_save_xnack_mask_hi, xnack_mask_hi + s_getreg_b32 s_save_xnack_mask, hwreg(HW_REG_SHADER_XNACK_MASK) + s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS, SQ_WAVE_IB_STS_RCNT_SHIFT, SQ_WAVE_IB_STS_RCNT_SIZE) //save RCNT + s_lshl_b32 s_save_tmp, s_save_tmp, S_SAVE_PC_HI_RCNT_SHIFT + s_or_b32 s_save_pc_hi, s_save_pc_hi, s_save_tmp + s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS, SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT, SQ_WAVE_IB_STS_FIRST_REPLAY_SIZE) //save FIRST_REPLAY + s_lshl_b32 s_save_tmp, s_save_tmp, S_SAVE_PC_HI_FIRST_REPLAY_SHIFT + s_or_b32 s_save_pc_hi, s_save_pc_hi, s_save_tmp + s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS) //clear RCNT and FIRST_REPLAY in IB_STS + s_and_b32 s_save_tmp, s_save_tmp, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK_NEG + + s_setreg_b32 hwreg(HW_REG_IB_STS), s_save_tmp + + /* inform SPI the readiness and wait for SPI's go signal */ + s_mov_b32 s_save_exec_lo, exec_lo //save EXEC and use EXEC for the go signal from SPI + s_mov_b32 s_save_exec_hi, exec_hi + s_mov_b64 exec, 0x0 //clear EXEC to get ready to receive + if (EMU_RUN_HACK) + + else + s_sendmsg sendmsg(MSG_SAVEWAVE) //send SPI a message and wait for SPI's write to EXEC + end + + L_SLEEP: + s_sleep 0x2 + + if (EMU_RUN_HACK) + + else + s_cbranch_execz L_SLEEP + end + + + /* setup Resource Contants */ + if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_SAVE_SINGLE_WAVE)) + //calculate wd_addr using absolute thread id + v_readlane_b32 s_save_tmp, v9, 0 + //determine it is wave32 or wave64 + s_getreg_b32 s_wave_size, hwreg(HW_REG_IB_STS2,SQ_WAVE_IB_STS2_WAVE64_SHIFT,SQ_WAVE_IB_STS2_WAVE64_SIZE) + s_cmp_eq_u32 s_wave_size, 0 + s_cbranch_scc1 L_SAVE_WAVE32 + s_lshr_b32 s_save_tmp, s_save_tmp, 6 //SAVE WAVE64 + s_branch L_SAVE_CON + L_SAVE_WAVE32: + s_lshr_b32 s_save_tmp, s_save_tmp, 5 //SAVE WAVE32 + L_SAVE_CON: + s_mul_i32 s_save_tmp, s_save_tmp, WAVE_SPACE + s_add_i32 s_save_spi_init_lo, s_save_tmp, WG_BASE_ADDR_LO + s_mov_b32 s_save_spi_init_hi, WG_BASE_ADDR_HI + s_and_b32 s_save_spi_init_hi, s_save_spi_init_hi, CTX_SAVE_CONTROL + else + end + if ((EMU_RUN_HACK) && (EMU_RUN_HACK_SAVE_SINGLE_WAVE)) + s_add_i32 s_save_spi_init_lo, s_save_tmp, WG_BASE_ADDR_LO + s_mov_b32 s_save_spi_init_hi, WG_BASE_ADDR_HI + s_and_b32 s_save_spi_init_hi, s_save_spi_init_hi, CTX_SAVE_CONTROL + else + end + + + s_mov_b32 s_save_buf_rsrc0, s_save_spi_init_lo //base_addr_lo + s_and_b32 s_save_buf_rsrc1, s_save_spi_init_hi, 0x0000FFFF //base_addr_hi + s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE + s_mov_b32 s_save_buf_rsrc2, 0 //NUM_RECORDS initial value = 0 (in bytes) although not neccessarily inited + s_mov_b32 s_save_buf_rsrc3, S_SAVE_BUF_RSRC_WORD3_MISC + s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_ATC_MASK + s_lshr_b32 s_save_tmp, s_save_tmp, (S_SAVE_SPI_INIT_ATC_SHIFT-SQ_BUF_RSRC_WORD1_ATC_SHIFT) //get ATC bit into position + s_or_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, s_save_tmp //or ATC + s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_MTYPE_MASK + s_lshr_b32 s_save_tmp, s_save_tmp, (S_SAVE_SPI_INIT_MTYPE_SHIFT-SQ_BUF_RSRC_WORD3_MTYPE_SHIFT) //get MTYPE bits into position + s_or_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, s_save_tmp //or MTYPE + + s_mov_b32 s_save_m0, m0 //save M0 + + /* global mem offset */ + s_mov_b32 s_save_mem_offset, 0x0 //mem offset initial value = 0 + s_getreg_b32 s_wave_size, hwreg(HW_REG_IB_STS2,SQ_WAVE_IB_STS2_WAVE64_SHIFT,SQ_WAVE_IB_STS2_WAVE64_SIZE) //get wave_save_size + s_or_b32 s_wave_size, s_save_spi_init_hi, s_wave_size //share s_wave_size with exec_hi + + /* save VGPRs */ + ////////////////////////////// + L_SAVE_VGPR: + + s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on + s_and_b32 m0, s_wave_size, 1 + s_cmp_eq_u32 m0, 1 + s_cbranch_scc1 L_ENABLE_SAVE_VGPR_EXEC_HI + s_mov_b32 exec_hi, 0x00000000 + s_branch L_SAVE_VGPR_NORMAL + L_ENABLE_SAVE_VGPR_EXEC_HI: + s_mov_b32 exec_hi, 0xFFFFFFFF + L_SAVE_VGPR_NORMAL: + s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE) //vpgr_size + //for wave32 and wave64, the num of vgpr function is the same? + s_add_u32 s_save_alloc_size, s_save_alloc_size, 1 + s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 2 //Number of VGPRs = (vgpr_size + 1) * 4 (non-zero value) //FIXME for GFX, zero is possible + //determine it is wave32 or wave64 + s_and_b32 m0, s_wave_size, 1 + s_cmp_eq_u32 m0, 1 + s_cbranch_scc1 L_SAVE_VGPR_WAVE64 + + //zhenxu added it for save vgpr for wave32 + s_lshl_b32 s_save_buf_rsrc2, s_save_alloc_size, 7 //NUM_RECORDS in bytes (32 threads*4) + if (SWIZZLE_EN) + s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? + else + s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes + end + + s_mov_b32 m0, 0x0 //VGPR initial index value =0 + //s_set_gpr_idx_on m0, 0x1 //M0[7:0] = M0[7:0] and M0[15:12] = 0x1 + //s_add_u32 s_save_alloc_size, s_save_alloc_size, 0x1000 //add 0x1000 since we compare m0 against it later, doesn't need this in gfx10 + + L_SAVE_VGPR_WAVE32_LOOP: + v_movrels_b32 v0, v0 //v0 = v[0+m0] + + if(USE_MTBUF_INSTEAD_OF_MUBUF) + tbuffer_store_format_x v0, v0, s_save_buf_rsrc0, s_save_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1 + else + buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 + end + + s_add_u32 m0, m0, 1 //next vgpr index + s_add_u32 s_save_mem_offset, s_save_mem_offset, 128 //every buffer_store_dword does 128 bytes + s_cmp_lt_u32 m0, s_save_alloc_size //scc = (m0 < s_save_alloc_size) ? 1 : 0 + s_cbranch_scc1 L_SAVE_VGPR_WAVE32_LOOP //VGPR save is complete? + s_branch L_SAVE_LDS + //save vgpr for wave32 ends + + L_SAVE_VGPR_WAVE64: + s_lshl_b32 s_save_buf_rsrc2, s_save_alloc_size, 8 //NUM_RECORDS in bytes (64 threads*4) + if (SWIZZLE_EN) + s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? + else + s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes + end + + s_mov_b32 m0, 0x0 //VGPR initial index value =0 + //s_set_gpr_idx_on m0, 0x1 //M0[7:0] = M0[7:0] and M0[15:12] = 0x1 + //s_add_u32 s_save_alloc_size, s_save_alloc_size, 0x1000 //add 0x1000 since we compare m0 against it later, doesn't need this in gfx10 + + L_SAVE_VGPR_WAVE64_LOOP: + v_movrels_b32 v0, v0 //v0 = v[0+m0] + + if(USE_MTBUF_INSTEAD_OF_MUBUF) + tbuffer_store_format_x v0, v0, s_save_buf_rsrc0, s_save_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1 + else + buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 + end + + s_add_u32 m0, m0, 1 //next vgpr index + s_add_u32 s_save_mem_offset, s_save_mem_offset, 256 //every buffer_store_dword does 256 bytes + s_cmp_lt_u32 m0, s_save_alloc_size //scc = (m0 < s_save_alloc_size) ? 1 : 0 + s_cbranch_scc1 L_SAVE_VGPR_WAVE64_LOOP //VGPR save is complete? + //s_set_gpr_idx_off + // + //Below part will be the save shared vgpr part (new for gfx10) + s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SIZE) //shared_vgpr_size + s_and_b32 s_save_alloc_size, s_save_alloc_size, 0xFFFFFFFF //shared_vgpr_size is zero? + s_cbranch_scc0 L_SAVE_LDS //no shared_vgpr used? jump to L_SAVE_LDS + s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 3 //Number of SHARED_VGPRs = shared_vgpr_size * 8 (non-zero value) + //m0 now has the value of normal vgpr count, just add the m0 with shared_vgpr count to get the total count. + //save shared_vgpr will start from the index of m0 + s_add_u32 s_save_alloc_size, s_save_alloc_size, m0 + s_mov_b32 exec_lo, 0xFFFFFFFF + s_mov_b32 exec_hi, 0x00000000 + L_SAVE_SHARED_VGPR_WAVE64_LOOP: + v_movrels_b32 v0, v0 //v0 = v[0+m0] + buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 + s_add_u32 m0, m0, 1 //next vgpr index + s_add_u32 s_save_mem_offset, s_save_mem_offset, 128 //every buffer_store_dword does 256 bytes + s_cmp_lt_u32 m0, s_save_alloc_size //scc = (m0 < s_save_alloc_size) ? 1 : 0 + s_cbranch_scc1 L_SAVE_SHARED_VGPR_WAVE64_LOOP //SHARED_VGPR save is complete? + + /* save LDS */ + ////////////////////////////// + L_SAVE_LDS: + + //Only check the first wave need LDS + /* the first wave in the threadgroup */ + s_barrier //FIXME not performance-optimal "LDS is used? wait for other waves in the same TG" + s_and_b32 s_save_tmp, s_wave_size, S_SAVE_SPI_INIT_FIRST_WAVE_MASK //exec is still used here + s_cbranch_scc0 L_SAVE_SGPR + + s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on + s_and_b32 m0, s_wave_size, 1 + s_cmp_eq_u32 m0, 1 + s_cbranch_scc1 L_ENABLE_SAVE_LDS_EXEC_HI + s_mov_b32 exec_hi, 0x00000000 + s_branch L_SAVE_LDS_NORMAL + L_ENABLE_SAVE_LDS_EXEC_HI: + s_mov_b32 exec_hi, 0xFFFFFFFF + L_SAVE_LDS_NORMAL: + s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE) //lds_size + s_and_b32 s_save_alloc_size, s_save_alloc_size, 0xFFFFFFFF //lds_size is zero? + s_cbranch_scc0 L_SAVE_SGPR //no lds used? jump to L_SAVE_VGPR + s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 6 //LDS size in dwords = lds_size * 64dw + s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 2 //LDS size in bytes + s_mov_b32 s_save_buf_rsrc2, s_save_alloc_size //NUM_RECORDS in bytes + if (SWIZZLE_EN) + s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? + else + s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes + end + + //load 0~63*4(byte address) to vgpr v15 + v_mbcnt_lo_u32_b32 v0, -1, 0 + v_mbcnt_hi_u32_b32 v0, -1, v0 + v_mul_u32_u24 v0, 4, v0 + + s_and_b32 m0, s_wave_size, 1 + s_cmp_eq_u32 m0, 1 + s_mov_b32 m0, 0x0 + s_cbranch_scc1 L_SAVE_LDS_LOOP_W64 + + L_SAVE_LDS_LOOP_W32: + if (SAVE_LDS) + ds_read_b32 v1, v0 + s_waitcnt 0 //ensure data ready + buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 + //buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 //save lds to memory doesn't exist in 10 + end + s_add_u32 m0, m0, 128 //every buffer_store_lds does 128 bytes + s_add_u32 s_save_mem_offset, s_save_mem_offset, 128 //mem offset increased by 128 bytes + v_add_nc_u32 v0, v0, 128 + s_cmp_lt_u32 m0, s_save_alloc_size //scc=(m0 < s_save_alloc_size) ? 1 : 0 + s_cbranch_scc1 L_SAVE_LDS_LOOP_W32 //LDS save is complete? + s_branch L_SAVE_SGPR + + L_SAVE_LDS_LOOP_W64: + if (SAVE_LDS) + ds_read_b32 v1, v0 + s_waitcnt 0 //ensure data ready + buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 + //buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 //save lds to memory doesn't exist in 10 + end + s_add_u32 m0, m0, 256 //every buffer_store_lds does 256 bytes + s_add_u32 s_save_mem_offset, s_save_mem_offset, 256 //mem offset increased by 256 bytes + v_add_nc_u32 v0, v0, 256 + s_cmp_lt_u32 m0, s_save_alloc_size //scc=(m0 < s_save_alloc_size) ? 1 : 0 + s_cbranch_scc1 L_SAVE_LDS_LOOP_W64 //LDS save is complete? + + + /* save SGPRs */ + ////////////////////////////// + //s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE) //spgr_size + //s_add_u32 s_save_alloc_size, s_save_alloc_size, 1 + //s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 4 //Number of SGPRs = (sgpr_size + 1) * 16 (non-zero value) + //s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 3 //In gfx10, Number of SGPRs = (sgpr_size + 1) * 8 (non-zero value) + L_SAVE_SGPR: + //need to look at it is wave32 or wave64 + s_and_b32 m0, s_wave_size, 1 + s_cmp_eq_u32 m0, 1 + s_cbranch_scc1 L_SAVE_SGPR_VMEM_WAVE64 + if (SGPR_SAVE_USE_SQC) + s_lshl_b32 s_save_buf_rsrc2, s_sgpr_save_num, 2 //NUM_RECORDS in bytes + else + s_lshl_b32 s_save_buf_rsrc2, s_sgpr_save_num, 7 //NUM_RECORDS in bytes (32 threads) + end + s_branch L_SAVE_SGPR_CONT + L_SAVE_SGPR_VMEM_WAVE64: + if (SGPR_SAVE_USE_SQC) + s_lshl_b32 s_save_buf_rsrc2, s_sgpr_save_num, 2 //NUM_RECORDS in bytes + else + s_lshl_b32 s_save_buf_rsrc2, s_sgpr_save_num, 8 //NUM_RECORDS in bytes (64 threads) + end + L_SAVE_SGPR_CONT: + if (SWIZZLE_EN) + s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? + else + s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes + end + + //s_mov_b32 m0, 0x0 //SGPR initial index value =0 + //s_nop 0x0 //Manually inserted wait states + + s_and_b32 m0, s_wave_size, 1 + s_cmp_eq_u32 m0, 1 + + s_mov_b32 m0, 0x0 //SGPR initial index value =0 + s_nop 0x0 //Manually inserted wait states + + s_cbranch_scc1 L_SAVE_SGPR_LOOP_WAVE64 + + L_SAVE_SGPR_LOOP_WAVE32: + s_movrels_b32 s0, s0 //s0 = s[0+m0] + //zhenxu, adding one more argument to save sgpr function, this is only for vmem, using sqc is not change + write_sgpr_to_mem_wave32(s0, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) //PV: the best performance should be using s_buffer_store_dwordx4 + s_add_u32 m0, m0, 1 //next sgpr index + s_cmp_lt_u32 m0, s_sgpr_save_num //scc = (m0 < s_sgpr_save_num) ? 1 : 0 + s_cbranch_scc1 L_SAVE_SGPR_LOOP_WAVE32 //SGPR save is complete? + s_branch L_SAVE_HWREG + + L_SAVE_SGPR_LOOP_WAVE64: + s_movrels_b32 s0, s0 //s0 = s[0+m0] + //zhenxu, adding one more argument to save sgpr function, this is only for vmem, using sqc is not change + write_sgpr_to_mem_wave64(s0, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) //PV: the best performance should be using s_buffer_store_dwordx4 + s_add_u32 m0, m0, 1 //next sgpr index + s_cmp_lt_u32 m0, s_sgpr_save_num //scc = (m0 < s_sgpr_save_num) ? 1 : 0 + s_cbranch_scc1 L_SAVE_SGPR_LOOP_WAVE64 //SGPR save is complete? + + + /* save HW registers */ + ////////////////////////////// + L_SAVE_HWREG: + s_mov_b32 s_save_buf_rsrc2, 0x4 //NUM_RECORDS in bytes + if (SWIZZLE_EN) + s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? + else + s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes + end + + s_and_b32 m0, s_wave_size, 1 + s_cmp_eq_u32 m0, 1 + s_cbranch_scc1 L_SAVE_HWREG_WAVE64 + + write_sgpr_to_mem_wave32(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) //M0 + + if ((EMU_RUN_HACK) && (EMU_RUN_HACK_SAVE_FIRST_TIME)) + s_add_u32 s_save_pc_lo, s_save_pc_lo, 4 //pc[31:0]+4 + s_addc_u32 s_save_pc_hi, s_save_pc_hi, 0x0 //carry bit over + end + + write_sgpr_to_mem_wave32(s_save_pc_lo, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) //PC + write_sgpr_to_mem_wave32(s_save_pc_hi, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) + write_sgpr_to_mem_wave32(s_save_exec_lo, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) //EXEC + write_sgpr_to_mem_wave32(s_save_exec_hi, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) + write_sgpr_to_mem_wave32(s_save_status, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) //STATUS + + //s_save_trapsts conflicts with s_save_alloc_size + s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS) + write_sgpr_to_mem_wave32(s_save_trapsts, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) //TRAPSTS + + //write_sgpr_to_mem_wave32(s_save_xnack_mask_lo, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) //XNACK_MASK_LO + write_sgpr_to_mem_wave32(s_save_xnack_mask, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) //XNACK_MASK_HI + + //use s_save_tmp would introduce conflict here between s_save_tmp and s_save_buf_rsrc2 + s_getreg_b32 s_save_m0, hwreg(HW_REG_MODE) //MODE + write_sgpr_to_mem_wave32(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) + if(SAVE_RESTORE_HWID_DDID) + s_getreg_b32 s_save_m0, hwreg(HW_REG_HW_ID1) //HW_ID1, handler records the SE/SA/WGP/SIMD/wave of the original wave + write_sgpr_to_mem_wave32(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) + end + s_branch L_S_PGM_END_SAVED + + L_SAVE_HWREG_WAVE64: + write_sgpr_to_mem_wave64(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) //M0 + + if ((EMU_RUN_HACK) && (EMU_RUN_HACK_SAVE_FIRST_TIME)) + s_add_u32 s_save_pc_lo, s_save_pc_lo, 4 //pc[31:0]+4 + s_addc_u32 s_save_pc_hi, s_save_pc_hi, 0x0 //carry bit over + end + + write_sgpr_to_mem_wave64(s_save_pc_lo, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) //PC + write_sgpr_to_mem_wave64(s_save_pc_hi, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) + write_sgpr_to_mem_wave64(s_save_exec_lo, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) //EXEC + write_sgpr_to_mem_wave64(s_save_exec_hi, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) + write_sgpr_to_mem_wave64(s_save_status, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) //STATUS + + //s_save_trapsts conflicts with s_save_alloc_size + s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS) + write_sgpr_to_mem_wave64(s_save_trapsts, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) //TRAPSTS + + //write_sgpr_to_mem_wave64(s_save_xnack_mask_lo, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) //XNACK_MASK_LO + write_sgpr_to_mem_wave64(s_save_xnack_mask, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) //XNACK_MASK_HI + + //use s_save_tmp would introduce conflict here between s_save_tmp and s_save_buf_rsrc2 + s_getreg_b32 s_save_m0, hwreg(HW_REG_MODE) //MODE + write_sgpr_to_mem_wave64(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) + + + if(SAVE_RESTORE_HWID_DDID) + s_getreg_b32 s_save_m0, hwreg(HW_REG_HW_ID1) //HW_ID1, handler records the SE/SA/WGP/SIMD/wave of the original wave + write_sgpr_to_mem_wave64(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) + + /* save DDID */ + ////////////////////////////// + L_SAVE_DDID: + //EXEC has been saved, no vector inst following + s_mov_b32 exec_lo, 0x80000000 //Set MSB to 1. Cleared when draw index is returned + s_sendmsg sendmsg(MSG_GET_DDID) + + L_WAIT_DDID_LOOP: + s_nop 7 // sleep a bit + s_bitcmp0_b32 exec_lo, 31 // test to see if MSB is cleared, meaning done + s_cbranch_scc0 L_WAIT_DDID_LOOP + + s_mov_b32 s_save_m0, exec_lo + + + s_mov_b32 s_save_buf_rsrc2, 0x4 //NUM_RECORDS in bytes + if (SWIZZLE_EN) + s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? + else + s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes + end + s_and_b32 m0, s_wave_size, 1 + s_cmp_eq_u32 m0, 1 + s_cbranch_scc1 L_SAVE_DDID_WAVE64 + + write_sgpr_to_mem_wave32(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) + + L_SAVE_DDID_WAVE64: + write_sgpr_to_mem_wave64(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) + + end + + L_S_PGM_END_SAVED: + /* S_PGM_END_SAVED */ //FIXME graphics ONLY + if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_SAVE_NORMAL_EXIT)) + s_and_b32 s_save_pc_hi, s_save_pc_hi, 0x0000ffff //pc[47:32] + s_add_u32 s_save_pc_lo, s_save_pc_lo, 4 //pc[31:0]+4 + s_addc_u32 s_save_pc_hi, s_save_pc_hi, 0x0 //carry bit over + s_rfe_b64 s_save_pc_lo //Return to the main shader program + else + end + + + s_branch L_END_PGM + + + +/**************************************************************************/ +/* restore routine */ +/**************************************************************************/ + +L_RESTORE: + /* Setup Resource Contants */ + if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL)) + //calculate wd_addr using absolute thread id + v_readlane_b32 s_restore_tmp, v9, 0 + //determine it is wave32 or wave64 + s_getreg_b32 s_restore_size, hwreg(HW_REG_IB_STS2,SQ_WAVE_IB_STS2_WAVE64_SHIFT,SQ_WAVE_IB_STS2_WAVE64_SIZE) //change to ttmp13 + s_cmp_eq_u32 s_restore_size, 0 + s_cbranch_scc1 L_RESTORE_WAVE32 + s_lshr_b32 s_restore_tmp, s_restore_tmp, 6 //SAVE WAVE64 + s_branch L_RESTORE_CON + L_RESTORE_WAVE32: + s_lshr_b32 s_restore_tmp, s_restore_tmp, 5 //SAVE WAVE32 + L_RESTORE_CON: + s_mul_i32 s_restore_tmp, s_restore_tmp, WAVE_SPACE + s_add_i32 s_restore_spi_init_lo, s_restore_tmp, WG_BASE_ADDR_LO + s_mov_b32 s_restore_spi_init_hi, WG_BASE_ADDR_HI + s_and_b32 s_restore_spi_init_hi, s_restore_spi_init_hi, CTX_RESTORE_CONTROL + else + end + + s_mov_b32 s_restore_buf_rsrc0, s_restore_spi_init_lo //base_addr_lo + s_and_b32 s_restore_buf_rsrc1, s_restore_spi_init_hi, 0x0000FFFF //base_addr_hi + s_or_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, S_RESTORE_BUF_RSRC_WORD1_STRIDE + s_mov_b32 s_restore_buf_rsrc2, 0 //NUM_RECORDS initial value = 0 (in bytes) + s_mov_b32 s_restore_buf_rsrc3, S_RESTORE_BUF_RSRC_WORD3_MISC + s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_ATC_MASK + s_lshr_b32 s_restore_tmp, s_restore_tmp, (S_RESTORE_SPI_INIT_ATC_SHIFT-SQ_BUF_RSRC_WORD1_ATC_SHIFT) //get ATC bit into position + s_or_b32 s_restore_buf_rsrc3, s_restore_buf_rsrc3, s_restore_tmp //or ATC + s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_MTYPE_MASK + s_lshr_b32 s_restore_tmp, s_restore_tmp, (S_RESTORE_SPI_INIT_MTYPE_SHIFT-SQ_BUF_RSRC_WORD3_MTYPE_SHIFT) //get MTYPE bits into position + s_or_b32 s_restore_buf_rsrc3, s_restore_buf_rsrc3, s_restore_tmp //or MTYPE + //determine it is wave32 or wave64 + s_getreg_b32 s_restore_size, hwreg(HW_REG_IB_STS2,SQ_WAVE_IB_STS2_WAVE64_SHIFT,SQ_WAVE_IB_STS2_WAVE64_SIZE) + s_or_b32 s_restore_size, s_restore_spi_init_hi, s_restore_size //share s_wave_size with exec_hi + + /* global mem offset */ + s_mov_b32 s_restore_mem_offset, 0x0 //mem offset initial value = 0 + + /* restore VGPRs */ + ////////////////////////////// + L_RESTORE_VGPR: + + s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on //be consistent with SAVE although can be moved ahead + s_and_b32 m0, s_restore_size, 1 + s_cmp_eq_u32 m0, 1 + s_cbranch_scc1 L_ENABLE_RESTORE_VGPR_EXEC_HI + s_mov_b32 exec_hi, 0x00000000 + s_branch L_RESTORE_VGPR_NORMAL + L_ENABLE_RESTORE_VGPR_EXEC_HI: + s_mov_b32 exec_hi, 0xFFFFFFFF + L_RESTORE_VGPR_NORMAL: + s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE) //vpgr_size + s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 1 + s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 2 //Number of VGPRs = (vgpr_size + 1) * 4 (non-zero value) + //determine it is wave32 or wave64 + s_and_b32 m0, s_restore_size, 1 + s_cmp_eq_u32 m0, 1 + s_cbranch_scc1 L_RESTORE_VGPR_WAVE64 + + s_lshl_b32 s_restore_buf_rsrc2, s_restore_alloc_size, 7 //NUM_RECORDS in bytes (32 threads*4) + if (SWIZZLE_EN) + s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? + else + s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes + end + + s_mov_b32 s_restore_mem_offset_save, s_restore_mem_offset // restore start with v1, v0 will be the last + s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 128 + s_mov_b32 m0, 1 //VGPR initial index value = 1 + //s_set_gpr_idx_on m0, 0x8 //M0[7:0] = M0[7:0] and M0[15:12] = 0x8 + //s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 0x8000 //add 0x8000 since we compare m0 against it later, might not need this in gfx10 + + L_RESTORE_VGPR_WAVE32_LOOP: + if(USE_MTBUF_INSTEAD_OF_MUBUF) + tbuffer_load_format_x v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1 + else + buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 + end + s_waitcnt vmcnt(0) //ensure data ready + v_movreld_b32 v0, v0 //v[0+m0] = v0 + s_add_u32 m0, m0, 1 //next vgpr index + s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 128 //every buffer_load_dword does 128 bytes + s_cmp_lt_u32 m0, s_restore_alloc_size //scc = (m0 < s_restore_alloc_size) ? 1 : 0 + s_cbranch_scc1 L_RESTORE_VGPR_WAVE32_LOOP //VGPR restore (except v0) is complete? + //s_set_gpr_idx_off + /* VGPR restore on v0 */ + if(USE_MTBUF_INSTEAD_OF_MUBUF) + tbuffer_load_format_x v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1 + else + buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 + end + + s_branch L_RESTORE_LDS + + L_RESTORE_VGPR_WAVE64: + s_lshl_b32 s_restore_buf_rsrc2, s_restore_alloc_size, 8 //NUM_RECORDS in bytes (64 threads*4) + if (SWIZZLE_EN) + s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? + else + s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes + end + + s_mov_b32 s_restore_mem_offset_save, s_restore_mem_offset // restore start with v1, v0 will be the last + s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256 + s_mov_b32 m0, 1 //VGPR initial index value = 1 + L_RESTORE_VGPR_WAVE64_LOOP: + if(USE_MTBUF_INSTEAD_OF_MUBUF) + tbuffer_load_format_x v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1 + else + buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 + end + s_waitcnt vmcnt(0) //ensure data ready + v_movreld_b32 v0, v0 //v[0+m0] = v0 + s_add_u32 m0, m0, 1 //next vgpr index + s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256 //every buffer_load_dword does 256 bytes + s_cmp_lt_u32 m0, s_restore_alloc_size //scc = (m0 < s_restore_alloc_size) ? 1 : 0 + s_cbranch_scc1 L_RESTORE_VGPR_WAVE64_LOOP //VGPR restore (except v0) is complete? + //s_set_gpr_idx_off + // + //Below part will be the restore shared vgpr part (new for gfx10) + s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SIZE) //shared_vgpr_size + s_and_b32 s_restore_alloc_size, s_restore_alloc_size, 0xFFFFFFFF //shared_vgpr_size is zero? + s_cbranch_scc0 L_RESTORE_V0 //no shared_vgpr used? jump to L_SAVE_LDS + s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 3 //Number of SHARED_VGPRs = shared_vgpr_size * 8 (non-zero value) + //m0 now has the value of normal vgpr count, just add the m0 with shared_vgpr count to get the total count. + //restore shared_vgpr will start from the index of m0 + s_add_u32 s_restore_alloc_size, s_restore_alloc_size, m0 + s_mov_b32 exec_lo, 0xFFFFFFFF + s_mov_b32 exec_hi, 0x00000000 + L_RESTORE_SHARED_VGPR_WAVE64_LOOP: + buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 + s_waitcnt vmcnt(0) //ensure data ready + v_movreld_b32 v0, v0 //v[0+m0] = v0 + s_add_u32 m0, m0, 1 //next vgpr index + s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 128 //every buffer_load_dword does 256 bytes + s_cmp_lt_u32 m0, s_restore_alloc_size //scc = (m0 < s_restore_alloc_size) ? 1 : 0 + s_cbranch_scc1 L_RESTORE_SHARED_VGPR_WAVE64_LOOP //VGPR restore (except v0) is complete? + + s_mov_b32 exec_hi, 0xFFFFFFFF //restore back exec_hi before restoring V0!! + + /* VGPR restore on v0 */ + L_RESTORE_V0: + if(USE_MTBUF_INSTEAD_OF_MUBUF) + tbuffer_load_format_x v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1 + else + buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 + end + + + /* restore LDS */ + ////////////////////////////// + L_RESTORE_LDS: + + //Only need to check the first wave + /* the first wave in the threadgroup */ + s_and_b32 s_restore_tmp, s_restore_size, S_RESTORE_SPI_INIT_FIRST_WAVE_MASK + s_cbranch_scc0 L_RESTORE_SGPR + + s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on //be consistent with SAVE although can be moved ahead + s_and_b32 m0, s_restore_size, 1 + s_cmp_eq_u32 m0, 1 + s_cbranch_scc1 L_ENABLE_RESTORE_LDS_EXEC_HI + s_mov_b32 exec_hi, 0x00000000 + s_branch L_RESTORE_LDS_NORMAL + L_ENABLE_RESTORE_LDS_EXEC_HI: + s_mov_b32 exec_hi, 0xFFFFFFFF + L_RESTORE_LDS_NORMAL: + s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE) //lds_size + s_and_b32 s_restore_alloc_size, s_restore_alloc_size, 0xFFFFFFFF //lds_size is zero? + s_cbranch_scc0 L_RESTORE_SGPR //no lds used? jump to L_RESTORE_VGPR + s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 6 //LDS size in dwords = lds_size * 64dw + s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 2 //LDS size in bytes + s_mov_b32 s_restore_buf_rsrc2, s_restore_alloc_size //NUM_RECORDS in bytes + if (SWIZZLE_EN) + s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? + else + s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes + end + + s_and_b32 m0, s_wave_size, 1 + s_cmp_eq_u32 m0, 1 + s_mov_b32 m0, 0x0 + s_cbranch_scc1 L_RESTORE_LDS_LOOP_W64 + + L_RESTORE_LDS_LOOP_W32: + if (SAVE_LDS) + buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 + s_waitcnt 0 + end + s_add_u32 m0, m0, 128 //every buffer_load_dword does 256 bytes + s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 128 //mem offset increased by 256 bytes + s_cmp_lt_u32 m0, s_restore_alloc_size //scc=(m0 < s_restore_alloc_size) ? 1 : 0 + s_cbranch_scc1 L_RESTORE_LDS_LOOP_W32 //LDS restore is complete? + s_branch L_RESTORE_SGPR + + L_RESTORE_LDS_LOOP_W64: + if (SAVE_LDS) + buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 + s_waitcnt 0 + end + s_add_u32 m0, m0, 256 //every buffer_load_dword does 256 bytes + s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256 //mem offset increased by 256 bytes + s_cmp_lt_u32 m0, s_restore_alloc_size //scc=(m0 < s_restore_alloc_size) ? 1 : 0 + s_cbranch_scc1 L_RESTORE_LDS_LOOP_W64 //LDS restore is complete? + + + /* restore SGPRs */ + ////////////////////////////// + //s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE) //spgr_size + //s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 1 + //s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 4 //Number of SGPRs = (sgpr_size + 1) * 16 (non-zero value) + //s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 3 //Number of SGPRs = (sgpr_size + 1) * 8 (non-zero value) + L_RESTORE_SGPR: + //need to look at it is wave32 or wave64 + s_and_b32 m0, s_restore_size, 1 + s_cmp_eq_u32 m0, 1 + s_cbranch_scc1 L_RESTORE_SGPR_VMEM_WAVE64 + if (SGPR_SAVE_USE_SQC) + s_lshl_b32 s_restore_buf_rsrc2, s_sgpr_save_num, 2 //NUM_RECORDS in bytes + else + s_lshl_b32 s_restore_buf_rsrc2, s_sgpr_save_num, 7 //NUM_RECORDS in bytes (32 threads) + end + s_branch L_RESTORE_SGPR_CONT + L_RESTORE_SGPR_VMEM_WAVE64: + if (SGPR_SAVE_USE_SQC) + s_lshl_b32 s_restore_buf_rsrc2, s_sgpr_save_num, 2 //NUM_RECORDS in bytes + else + s_lshl_b32 s_restore_buf_rsrc2, s_sgpr_save_num, 8 //NUM_RECORDS in bytes (64 threads) + end + + L_RESTORE_SGPR_CONT: + if (SWIZZLE_EN) + s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? + else + s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes + end + + s_and_b32 m0, s_restore_size, 1 + s_cmp_eq_u32 m0, 1 + s_cbranch_scc1 L_RESTORE_SGPR_WAVE64 + + read_sgpr_from_mem_wave32(s_restore_tmp, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //save s0 to s_restore_tmp + s_mov_b32 m0, 0x1 + + L_RESTORE_SGPR_LOOP_WAVE32: + read_sgpr_from_mem_wave32(s0, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //PV: further performance improvement can be made + s_waitcnt lgkmcnt(0) //ensure data ready + s_movreld_b32 s0, s0 //s[0+m0] = s0 + s_nop 0 // hazard SALU M0=> S_MOVREL + s_add_u32 m0, m0, 1 //next sgpr index + s_cmp_lt_u32 m0, s_sgpr_save_num //scc = (m0 < s_restore_alloc_size) ? 1 : 0 + s_cbranch_scc1 L_RESTORE_SGPR_LOOP_WAVE32 //SGPR restore (except s0) is complete? + s_mov_b32 s0, s_restore_tmp /* SGPR restore on s0 */ + s_branch L_RESTORE_HWREG + + L_RESTORE_SGPR_WAVE64: + read_sgpr_from_mem_wave64(s_restore_tmp, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //save s0 to s_restore_tmp + s_mov_b32 m0, 0x1 //SGPR initial index value =1 //go on with with s1 + + L_RESTORE_SGPR_LOOP_WAVE64: + read_sgpr_from_mem_wave64(s0, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //PV: further performance improvement can be made + s_waitcnt lgkmcnt(0) //ensure data ready + s_movreld_b32 s0, s0 //s[0+m0] = s0 + s_nop 0 // hazard SALU M0=> S_MOVREL + s_add_u32 m0, m0, 1 //next sgpr index + s_cmp_lt_u32 m0, s_sgpr_save_num //scc = (m0 < s_restore_alloc_size) ? 1 : 0 + s_cbranch_scc1 L_RESTORE_SGPR_LOOP_WAVE64 //SGPR restore (except s0) is complete? + s_mov_b32 s0, s_restore_tmp /* SGPR restore on s0 */ + + + /* restore HW registers */ + ////////////////////////////// + L_RESTORE_HWREG: + s_mov_b32 s_restore_buf_rsrc2, 0x4 //NUM_RECORDS in bytes + if (SWIZZLE_EN) + s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? + else + s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes + end + + s_and_b32 m0, s_restore_size, 1 + s_cmp_eq_u32 m0, 1 + s_cbranch_scc1 L_RESTORE_HWREG_WAVE64 + + read_sgpr_from_mem_wave32(s_restore_m0, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //M0 + read_sgpr_from_mem_wave32(s_restore_pc_lo, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //PC + read_sgpr_from_mem_wave32(s_restore_pc_hi, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) + read_sgpr_from_mem_wave32(s_restore_exec_lo, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //EXEC + read_sgpr_from_mem_wave32(s_restore_exec_hi, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) + read_sgpr_from_mem_wave32(s_restore_status, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //STATUS + read_sgpr_from_mem_wave32(s_restore_trapsts, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //TRAPSTS + //read_sgpr_from_mem_wave32(xnack_mask_lo, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //XNACK_MASK_LO + //read_sgpr_from_mem_wave32(xnack_mask_hi, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //XNACK_MASK_HI + read_sgpr_from_mem_wave32(s_restore_xnack_mask, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //XNACK_MASK + read_sgpr_from_mem_wave32(s_restore_mode, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //MODE + if(SAVE_RESTORE_HWID_DDID) + read_sgpr_from_mem_wave32(s_restore_hwid1, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //HW_ID1 + end + s_branch L_RESTORE_HWREG_FINISH + + L_RESTORE_HWREG_WAVE64: + read_sgpr_from_mem_wave64(s_restore_m0, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //M0 + read_sgpr_from_mem_wave64(s_restore_pc_lo, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //PC + read_sgpr_from_mem_wave64(s_restore_pc_hi, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) + read_sgpr_from_mem_wave64(s_restore_exec_lo, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //EXEC + read_sgpr_from_mem_wave64(s_restore_exec_hi, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) + read_sgpr_from_mem_wave64(s_restore_status, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //STATUS + read_sgpr_from_mem_wave64(s_restore_trapsts, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //TRAPSTS + //read_sgpr_from_mem_wave64(xnack_mask_lo, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //XNACK_MASK_LO + //read_sgpr_from_mem_wave64(xnack_mask_hi, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //XNACK_MASK_HI + read_sgpr_from_mem_wave64(s_restore_xnack_mask, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //XNACK_MASK + read_sgpr_from_mem_wave64(s_restore_mode, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //MODE + if(SAVE_RESTORE_HWID_DDID) + read_sgpr_from_mem_wave64(s_restore_hwid1, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //HW_ID1 + end + L_RESTORE_HWREG_FINISH: + s_waitcnt lgkmcnt(0) //from now on, it is safe to restore STATUS and IB_STS + + + + if(SAVE_RESTORE_HWID_DDID) + L_RESTORE_DDID: + s_mov_b32 m0, s_restore_hwid1 //virture ttrace support: The save-context handler records the SE/SA/WGP/SIMD/wave of the original wave + s_ttracedata //and then can output it as SHADER_DATA to ttrace on restore to provide a correlation across the save-restore + + s_mov_b32 s_restore_buf_rsrc2, 0x4 //NUM_RECORDS in bytes + if (SWIZZLE_EN) + s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? + else + s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes + end + + s_and_b32 m0, s_restore_size, 1 + s_cmp_eq_u32 m0, 1 + s_cbranch_scc1 L_RESTORE_DDID_WAVE64 + + read_sgpr_from_mem_wave32(s_restore_ddid, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) + s_branch L_RESTORE_DDID_FINISH + L_RESTORE_DDID_WAVE64: + read_sgpr_from_mem_wave64(s_restore_ddid, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) + + L_RESTORE_DDID_FINISH: + s_waitcnt lgkmcnt(0) + //s_mov_b32 m0, s_restore_ddid + //s_ttracedata + if (RESTORE_DDID_IN_SGPR18) + s_mov_b32 s18, s_restore_ddid + end + + end + + s_and_b32 s_restore_pc_hi, s_restore_pc_hi, 0x0000ffff //pc[47:32] //Do it here in order not to affect STATUS + + //for normal save & restore, the saved PC points to the next inst to execute, no adjustment needs to be made, otherwise: + if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL)) + s_add_u32 s_restore_pc_lo, s_restore_pc_lo, 8 //pc[31:0]+8 //two back-to-back s_trap are used (first for save and second for restore) + s_addc_u32 s_restore_pc_hi, s_restore_pc_hi, 0x0 //carry bit over + end + if ((EMU_RUN_HACK) && (EMU_RUN_HACK_RESTORE_NORMAL)) + s_add_u32 s_restore_pc_lo, s_restore_pc_lo, 4 //pc[31:0]+4 // save is hack through s_trap but restore is normal + s_addc_u32 s_restore_pc_hi, s_restore_pc_hi, 0x0 //carry bit over + end + + s_mov_b32 m0, s_restore_m0 + s_mov_b32 exec_lo, s_restore_exec_lo + s_mov_b32 exec_hi, s_restore_exec_hi + + s_and_b32 s_restore_m0, SQ_WAVE_TRAPSTS_PRE_SAVECTX_MASK, s_restore_trapsts + s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_PRE_SAVECTX_SHIFT, SQ_WAVE_TRAPSTS_PRE_SAVECTX_SIZE), s_restore_m0 + s_setreg_b32 hwreg(HW_REG_SHADER_XNACK_MASK), s_restore_xnack_mask //restore xnack_mask + s_and_b32 s_restore_m0, SQ_WAVE_TRAPSTS_POST_SAVECTX_MASK, s_restore_trapsts + s_lshr_b32 s_restore_m0, s_restore_m0, SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT + s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT, SQ_WAVE_TRAPSTS_POST_SAVECTX_SIZE), s_restore_m0 + //s_setreg_b32 hwreg(HW_REG_TRAPSTS), s_restore_trapsts //don't overwrite SAVECTX bit as it may be set through external SAVECTX during restore + s_setreg_b32 hwreg(HW_REG_MODE), s_restore_mode + //reuse s_restore_m0 as a temp register + s_and_b32 s_restore_m0, s_restore_pc_hi, S_SAVE_PC_HI_RCNT_MASK + s_lshr_b32 s_restore_m0, s_restore_m0, S_SAVE_PC_HI_RCNT_SHIFT + s_lshl_b32 s_restore_m0, s_restore_m0, SQ_WAVE_IB_STS_RCNT_SHIFT + s_mov_b32 s_restore_tmp, 0x0 //IB_STS is zero + s_or_b32 s_restore_tmp, s_restore_tmp, s_restore_m0 + s_and_b32 s_restore_m0, s_restore_pc_hi, S_SAVE_PC_HI_FIRST_REPLAY_MASK + s_lshr_b32 s_restore_m0, s_restore_m0, S_SAVE_PC_HI_FIRST_REPLAY_SHIFT + s_lshl_b32 s_restore_m0, s_restore_m0, SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT + s_or_b32 s_restore_tmp, s_restore_tmp, s_restore_m0 + s_and_b32 s_restore_m0, s_restore_status, SQ_WAVE_STATUS_INST_ATC_MASK + s_lshr_b32 s_restore_m0, s_restore_m0, SQ_WAVE_STATUS_INST_ATC_SHIFT + s_setreg_b32 hwreg(HW_REG_IB_STS), s_restore_tmp + s_setreg_b32 hwreg(HW_REG_STATUS), s_restore_status + + s_barrier //barrier to ensure the readiness of LDS before access attemps from any other wave in the same TG //FIXME not performance-optimal at this time + + +// s_rfe_b64 s_restore_pc_lo //Return to the main shader program and resume execution + s_rfe_b64 s_restore_pc_lo // s_restore_m0[0] is used to set STATUS.inst_atc + + +/**************************************************************************/ +/* the END */ +/**************************************************************************/ +L_END_PGM: + s_endpgm + +end + + +/**************************************************************************/ +/* the helper functions */ +/**************************************************************************/ +function write_sgpr_to_mem_wave32(s, s_rsrc, s_mem_offset, use_sqc, use_mtbuf) + if (use_sqc) + s_mov_b32 exec_lo, m0 //assuming exec_lo is not needed anymore from this point on + s_mov_b32 m0, s_mem_offset + s_buffer_store_dword s, s_rsrc, m0 glc:1 + s_add_u32 s_mem_offset, s_mem_offset, 4 + s_mov_b32 m0, exec_lo + elsif (use_mtbuf) + v_mov_b32 v0, s + tbuffer_store_format_x v0, v0, s_rsrc, s_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1 + s_add_u32 s_mem_offset, s_mem_offset, 128 + else + v_mov_b32 v0, s + buffer_store_dword v0, v0, s_rsrc, s_mem_offset slc:1 glc:1 + s_add_u32 s_mem_offset, s_mem_offset, 128 + end +end + +function write_sgpr_to_mem_wave64(s, s_rsrc, s_mem_offset, use_sqc, use_mtbuf) + if (use_sqc) + s_mov_b32 exec_lo, m0 //assuming exec_lo is not needed anymore from this point on + s_mov_b32 m0, s_mem_offset + s_buffer_store_dword s, s_rsrc, m0 glc:1 + s_add_u32 s_mem_offset, s_mem_offset, 4 + s_mov_b32 m0, exec_lo + elsif (use_mtbuf) + v_mov_b32 v0, s + tbuffer_store_format_x v0, v0, s_rsrc, s_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1 + s_add_u32 s_mem_offset, s_mem_offset, 256 + else + v_mov_b32 v0, s + buffer_store_dword v0, v0, s_rsrc, s_mem_offset slc:1 glc:1 + s_add_u32 s_mem_offset, s_mem_offset, 256 + end +end + +function read_sgpr_from_mem_wave32(s, s_rsrc, s_mem_offset, use_sqc) + s_buffer_load_dword s, s_rsrc, s_mem_offset glc:1 + if (use_sqc) + s_add_u32 s_mem_offset, s_mem_offset, 4 + else + s_add_u32 s_mem_offset, s_mem_offset, 128 + end +end + +function read_sgpr_from_mem_wave64(s, s_rsrc, s_mem_offset, use_sqc) + s_buffer_load_dword s, s_rsrc, s_mem_offset glc:1 + if (use_sqc) + s_add_u32 s_mem_offset, s_mem_offset, 4 + else + s_add_u32 s_mem_offset, s_mem_offset, 256 + end +end + diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c index 59f8ca4297db..792371442195 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c @@ -138,6 +138,8 @@ static struct kfd_gpu_cache_info carrizo_cache_info[] = { /* TODO - check & update Vega10 cache details */ #define vega10_cache_info carrizo_cache_info #define raven_cache_info carrizo_cache_info +/* TODO - check & update Navi10 cache details */ +#define navi10_cache_info carrizo_cache_info static void kfd_populated_cu_info_cpu(struct kfd_topology_device *dev, struct crat_subtype_computeunit *cu) @@ -666,6 +668,9 @@ static int kfd_fill_gpu_cache_info(struct kfd_dev *kdev, case CHIP_RAVEN: pcache_info = raven_cache_info; num_of_cache_types = ARRAY_SIZE(raven_cache_info); + case CHIP_NAVI10: + pcache_info = navi10_cache_info; + num_of_cache_types = ARRAY_SIZE(navi10_cache_info); break; default: return -EINVAL; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c index ebac7d7f9956..26ea46de3722 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c @@ -317,6 +317,23 @@ static const struct kfd_device_info vega20_device_info = { .num_sdma_queues_per_engine = 8, }; +static const struct kfd_device_info navi10_device_info = { + .asic_family = CHIP_NAVI10, + .max_pasid_bits = 16, + .max_no_of_hqd = 24, + .doorbell_size = 8, + .ih_ring_entry_size = 8 * sizeof(uint32_t), + .event_interrupt_class = &event_interrupt_class_v9, + .num_of_watch_points = 4, + .mqd_size_aligned = MQD_SIZE_ALIGNED, + .needs_iommu_device = false, + .supports_cwsr = true, + .needs_pci_atomics = false, + .num_sdma_engines = 2, + .num_xgmi_sdma_engines = 0, + .num_sdma_queues_per_engine = 8, +}; + struct kfd_deviceid { unsigned short did; const struct kfd_device_info *device_info; @@ -434,7 +451,13 @@ static const struct kfd_deviceid supported_devices[] = { { 0x66a3, &vega20_device_info }, /* Vega20 */ { 0x66a4, &vega20_device_info }, /* Vega20 */ { 0x66a7, &vega20_device_info }, /* Vega20 */ - { 0x66af, &vega20_device_info } /* Vega20 */ + { 0x66af, &vega20_device_info }, /* Vega20 */ + /* Navi10 */ + { 0x7310, &navi10_device_info }, /* Navi10 */ + { 0x7312, &navi10_device_info }, /* Navi10 */ + { 0x7318, &navi10_device_info }, /* Navi10 */ + { 0x731a, &navi10_device_info }, /* Navi10 */ + { 0x731f, &navi10_device_info }, /* Navi10 */ }; static int kfd_gtt_sa_init(struct kfd_dev *kfd, unsigned int buf_size, @@ -516,10 +539,14 @@ static void kfd_cwsr_init(struct kfd_dev *kfd) BUILD_BUG_ON(sizeof(cwsr_trap_gfx8_hex) > PAGE_SIZE); kfd->cwsr_isa = cwsr_trap_gfx8_hex; kfd->cwsr_isa_size = sizeof(cwsr_trap_gfx8_hex); - } else { + } else if (kfd->device_info->asic_family < CHIP_NAVI10) { BUILD_BUG_ON(sizeof(cwsr_trap_gfx9_hex) > PAGE_SIZE); kfd->cwsr_isa = cwsr_trap_gfx9_hex; kfd->cwsr_isa_size = sizeof(cwsr_trap_gfx9_hex); + } else { + BUILD_BUG_ON(sizeof(cwsr_trap_gfx10_hex) > PAGE_SIZE); + kfd->cwsr_isa = cwsr_trap_gfx10_hex; + kfd->cwsr_isa_size = sizeof(cwsr_trap_gfx10_hex); } kfd->cwsr_enabled = true; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c index 3528590ae90b..e83b7e9734de 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c @@ -1264,6 +1264,7 @@ static int map_queues_cpsch(struct device_queue_manager *dqm) return 0; retval = pm_send_runlist(&dqm->packets, &dqm->queues); + pr_debug("%s sent runlist\n", __func__); if (retval) { pr_err("failed to execute runlist\n"); return retval; @@ -1301,7 +1302,7 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm, KFD_FENCE_COMPLETED); /* should be timed out */ retval = amdkfd_fence_wait_timeout(dqm->fence_addr, KFD_FENCE_COMPLETED, - QUEUE_PREEMPT_DEFAULT_TIMEOUT_MS); + queue_preemption_timeout_ms); if (retval) return retval; @@ -1785,6 +1786,9 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev) case CHIP_RAVEN: device_queue_manager_init_v9(&dqm->asic_ops); break; + case CHIP_NAVI10: + device_queue_manager_init_v10_navi10(&dqm->asic_ops); + break; default: WARN(1, "Unexpected ASIC family %u", dev->device_info->asic_family); @@ -1876,12 +1880,13 @@ int dqm_debugfs_hqds(struct seq_file *m, void *data) int r = 0; r = dqm->dev->kfd2kgd->hqd_dump(dqm->dev->kgd, - KFD_CIK_HIQ_PIPE, KFD_CIK_HIQ_QUEUE, &dump, &n_regs); + KFD_CIK_HIQ_PIPE, KFD_CIK_HIQ_QUEUE, + &dump, &n_regs); if (!r) { seq_printf(m, " HIQ on MEC %d Pipe %d Queue %d\n", - KFD_CIK_HIQ_PIPE/get_pipes_per_mec(dqm)+1, - KFD_CIK_HIQ_PIPE%get_pipes_per_mec(dqm), - KFD_CIK_HIQ_QUEUE); + KFD_CIK_HIQ_PIPE/get_pipes_per_mec(dqm)+1, + KFD_CIK_HIQ_PIPE%get_pipes_per_mec(dqm), + KFD_CIK_HIQ_QUEUE); seq_reg_dump(m, dump, n_regs); kfree(dump); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h index 88b4c007696e..90db2c9275f6 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h @@ -31,8 +31,6 @@ #include "kfd_priv.h" #include "kfd_mqd_manager.h" -#define KFD_UNMAP_LATENCY_MS (4000) -#define QUEUE_PREEMPT_DEFAULT_TIMEOUT_MS (2 * KFD_UNMAP_LATENCY_MS + 1000) struct device_process_node { struct qcm_process_device *qpd; @@ -212,6 +210,8 @@ void device_queue_manager_init_vi_tonga( struct device_queue_manager_asic_ops *asic_ops); void device_queue_manager_init_v9( struct device_queue_manager_asic_ops *asic_ops); +void device_queue_manager_init_v10_navi10( + struct device_queue_manager_asic_ops *asic_ops); void program_sh_mem_settings(struct device_queue_manager *dqm, struct qcm_process_device *qpd); unsigned int get_queues_num(struct device_queue_manager *dqm); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v10.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v10.c new file mode 100644 index 000000000000..72e4d61ac752 --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v10.c @@ -0,0 +1,88 @@ +/* + * Copyright 2018 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ + +#include "kfd_device_queue_manager.h" +#include "navi10_enum.h" +#include "gc/gc_10_1_0_offset.h" +#include "gc/gc_10_1_0_sh_mask.h" + +static int update_qpd_v10(struct device_queue_manager *dqm, + struct qcm_process_device *qpd); +static void init_sdma_vm_v10(struct device_queue_manager *dqm, struct queue *q, + struct qcm_process_device *qpd); + +void device_queue_manager_init_v10_navi10( + struct device_queue_manager_asic_ops *asic_ops) +{ + asic_ops->update_qpd = update_qpd_v10; + asic_ops->init_sdma_vm = init_sdma_vm_v10; + asic_ops->mqd_manager_init = mqd_manager_init_v10; +} + +static uint32_t compute_sh_mem_bases_64bit(struct kfd_process_device *pdd) +{ + uint32_t shared_base = pdd->lds_base >> 48; + uint32_t private_base = pdd->scratch_base >> 48; + + return (shared_base << SH_MEM_BASES__SHARED_BASE__SHIFT) | + private_base; +} + +static int update_qpd_v10(struct device_queue_manager *dqm, + struct qcm_process_device *qpd) +{ + struct kfd_process_device *pdd; + + pdd = qpd_to_pdd(qpd); + + /* check if sh_mem_config register already configured */ + if (qpd->sh_mem_config == 0) { + qpd->sh_mem_config = + SH_MEM_ALIGNMENT_MODE_UNALIGNED << + SH_MEM_CONFIG__ALIGNMENT_MODE__SHIFT; +#if 0 + /* TODO: + * This shouldn't be an issue with Navi10. Verify. + */ + if (vega10_noretry) + qpd->sh_mem_config |= + 1 << SH_MEM_CONFIG__RETRY_DISABLE__SHIFT; +#endif + + qpd->sh_mem_ape1_limit = 0; + qpd->sh_mem_ape1_base = 0; + } + + qpd->sh_mem_bases = compute_sh_mem_bases_64bit(pdd); + + pr_debug("sh_mem_bases 0x%X\n", qpd->sh_mem_bases); + + return 0; +} + +static void init_sdma_vm_v10(struct device_queue_manager *dqm, struct queue *q, + struct qcm_process_device *qpd) +{ + /* Not needed on SDMAv4 onwards any more */ + q->properties.sdma_vm_addr = 0; +} diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c b/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c index 22a8e88b6a67..60521366dd31 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c @@ -405,6 +405,7 @@ int kfd_init_apertures(struct kfd_process *process) case CHIP_VEGA12: case CHIP_VEGA20: case CHIP_RAVEN: + case CHIP_NAVI10: kfd_init_apertures_v9(pdd, id); break; default: diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c index 229500c8c958..29c0bd2d7a5c 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c @@ -332,6 +332,9 @@ struct kernel_queue *kernel_queue_init(struct kfd_dev *dev, case CHIP_RAVEN: kernel_queue_init_v9(&kq->ops_asic_specific); break; + case CHIP_NAVI10: + kernel_queue_init_v10(&kq->ops_asic_specific); + break; default: WARN(1, "Unexpected ASIC family %u", dev->device_info->asic_family); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.h b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.h index a7116a939029..365fc674fea4 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.h @@ -102,5 +102,6 @@ struct kernel_queue { void kernel_queue_init_cik(struct kernel_queue_ops *ops); void kernel_queue_init_vi(struct kernel_queue_ops *ops); void kernel_queue_init_v9(struct kernel_queue_ops *ops); +void kernel_queue_init_v10(struct kernel_queue_ops *ops); #endif /* KFD_KERNEL_QUEUE_H_ */ diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_v10.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_v10.c new file mode 100644 index 000000000000..aed32ab7102e --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_v10.c @@ -0,0 +1,348 @@ +/* + * Copyright 2018 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ + +#include "kfd_kernel_queue.h" +#include "kfd_device_queue_manager.h" +#include "kfd_pm4_headers_ai.h" +#include "kfd_pm4_opcodes.h" +#include "gc/gc_10_1_0_sh_mask.h" + +static bool initialize_v10(struct kernel_queue *kq, struct kfd_dev *dev, + enum kfd_queue_type type, unsigned int queue_size); +static void uninitialize_v10(struct kernel_queue *kq); +static void submit_packet_v10(struct kernel_queue *kq); + +void kernel_queue_init_v10(struct kernel_queue_ops *ops) +{ + ops->initialize = initialize_v10; + ops->uninitialize = uninitialize_v10; + ops->submit_packet = submit_packet_v10; +} + +static bool initialize_v10(struct kernel_queue *kq, struct kfd_dev *dev, + enum kfd_queue_type type, unsigned int queue_size) +{ + int retval; + + retval = kfd_gtt_sa_allocate(dev, PAGE_SIZE, &kq->eop_mem); + if (retval != 0) + return false; + + kq->eop_gpu_addr = kq->eop_mem->gpu_addr; + kq->eop_kernel_addr = kq->eop_mem->cpu_ptr; + + memset(kq->eop_kernel_addr, 0, PAGE_SIZE); + + return true; +} + +static void uninitialize_v10(struct kernel_queue *kq) +{ + kfd_gtt_sa_free(kq->dev, kq->eop_mem); +} + +static void submit_packet_v10(struct kernel_queue *kq) +{ + *kq->wptr64_kernel = kq->pending_wptr64; + write_kernel_doorbell64(kq->queue->properties.doorbell_ptr, + kq->pending_wptr64); +} + +static int pm_map_process_v10(struct packet_manager *pm, + uint32_t *buffer, struct qcm_process_device *qpd) +{ + struct pm4_mes_map_process *packet; + uint64_t vm_page_table_base_addr = qpd->page_table_base; + + packet = (struct pm4_mes_map_process *)buffer; + memset(buffer, 0, sizeof(struct pm4_mes_map_process)); + + packet->header.u32All = pm_build_pm4_header(IT_MAP_PROCESS, + sizeof(struct pm4_mes_map_process)); + packet->bitfields2.diq_enable = (qpd->is_debug) ? 1 : 0; + packet->bitfields2.process_quantum = 1; + packet->bitfields2.pasid = qpd->pqm->process->pasid; + packet->bitfields14.gds_size = qpd->gds_size; + packet->bitfields14.num_gws = qpd->num_gws; + packet->bitfields14.num_oac = qpd->num_oac; + packet->bitfields14.sdma_enable = 1; + + packet->bitfields14.num_queues = (qpd->is_debug) ? 0 : qpd->queue_count; + + packet->sh_mem_config = qpd->sh_mem_config; + packet->sh_mem_bases = qpd->sh_mem_bases; + if (qpd->tba_addr) { + packet->sq_shader_tba_lo = lower_32_bits(qpd->tba_addr >> 8); + packet->sq_shader_tba_hi = (1 << SQ_SHADER_TBA_HI__TRAP_EN__SHIFT) | + upper_32_bits(qpd->tba_addr >> 8); + packet->sq_shader_tma_lo = lower_32_bits(qpd->tma_addr >> 8); + packet->sq_shader_tma_hi = upper_32_bits(qpd->tma_addr >> 8); + } + + packet->gds_addr_lo = lower_32_bits(qpd->gds_context_area); + packet->gds_addr_hi = upper_32_bits(qpd->gds_context_area); + + packet->vm_context_page_table_base_addr_lo32 = + lower_32_bits(vm_page_table_base_addr); + packet->vm_context_page_table_base_addr_hi32 = + upper_32_bits(vm_page_table_base_addr); + + return 0; +} + +static int pm_runlist_v10(struct packet_manager *pm, uint32_t *buffer, + uint64_t ib, size_t ib_size_in_dwords, bool chain) +{ + struct pm4_mes_runlist *packet; + + int concurrent_proc_cnt = 0; + struct kfd_dev *kfd = pm->dqm->dev; + + /* Determine the number of processes to map together to HW: + * it can not exceed the number of VMIDs available to the + * scheduler, and it is determined by the smaller of the number + * of processes in the runlist and kfd module parameter + * hws_max_conc_proc. + * Note: the arbitration between the number of VMIDs and + * hws_max_conc_proc has been done in + * kgd2kfd_device_init(). + */ + concurrent_proc_cnt = min(pm->dqm->processes_count, + kfd->max_proc_per_quantum); + + + packet = (struct pm4_mes_runlist *)buffer; + + memset(buffer, 0, sizeof(struct pm4_mes_runlist)); + packet->header.u32All = pm_build_pm4_header(IT_RUN_LIST, + sizeof(struct pm4_mes_runlist)); + + packet->bitfields4.ib_size = ib_size_in_dwords; + packet->bitfields4.chain = chain ? 1 : 0; + packet->bitfields4.offload_polling = 0; + packet->bitfields4.valid = 1; + packet->bitfields4.process_cnt = concurrent_proc_cnt; + packet->ordinal2 = lower_32_bits(ib); + packet->ib_base_hi = upper_32_bits(ib); + + return 0; +} + +static int pm_map_queues_v10(struct packet_manager *pm, uint32_t *buffer, + struct queue *q, bool is_static) +{ + struct pm4_mes_map_queues *packet; + bool use_static = is_static; + + packet = (struct pm4_mes_map_queues *)buffer; + memset(buffer, 0, sizeof(struct pm4_mes_map_queues)); + + packet->header.u32All = pm_build_pm4_header(IT_MAP_QUEUES, + sizeof(struct pm4_mes_map_queues)); + packet->bitfields2.num_queues = 1; + packet->bitfields2.queue_sel = + queue_sel__mes_map_queues__map_to_hws_determined_queue_slots_vi; + + packet->bitfields2.engine_sel = + engine_sel__mes_map_queues__compute_vi; + packet->bitfields2.queue_type = + queue_type__mes_map_queues__normal_compute_vi; + + switch (q->properties.type) { + case KFD_QUEUE_TYPE_COMPUTE: + if (use_static) + packet->bitfields2.queue_type = + queue_type__mes_map_queues__normal_latency_static_queue_vi; + break; + case KFD_QUEUE_TYPE_DIQ: + packet->bitfields2.queue_type = + queue_type__mes_map_queues__debug_interface_queue_vi; + break; + case KFD_QUEUE_TYPE_SDMA: + case KFD_QUEUE_TYPE_SDMA_XGMI: + packet->bitfields2.engine_sel = q->properties.sdma_engine_id + + engine_sel__mes_map_queues__sdma0_vi; + use_static = false; /* no static queues under SDMA */ + break; + default: + WARN(1, "queue type %d\n", q->properties.type); + return -EINVAL; + } + packet->bitfields3.doorbell_offset = + q->properties.doorbell_off; + + packet->mqd_addr_lo = + lower_32_bits(q->gart_mqd_addr); + + packet->mqd_addr_hi = + upper_32_bits(q->gart_mqd_addr); + + packet->wptr_addr_lo = + lower_32_bits((uint64_t)q->properties.write_ptr); + + packet->wptr_addr_hi = + upper_32_bits((uint64_t)q->properties.write_ptr); + + return 0; +} + +static int pm_unmap_queues_v10(struct packet_manager *pm, uint32_t *buffer, + enum kfd_queue_type type, + enum kfd_unmap_queues_filter filter, + uint32_t filter_param, bool reset, + unsigned int sdma_engine) +{ + struct pm4_mes_unmap_queues *packet; + + packet = (struct pm4_mes_unmap_queues *)buffer; + memset(buffer, 0, sizeof(struct pm4_mes_unmap_queues)); + + packet->header.u32All = pm_build_pm4_header(IT_UNMAP_QUEUES, + sizeof(struct pm4_mes_unmap_queues)); + switch (type) { + case KFD_QUEUE_TYPE_COMPUTE: + case KFD_QUEUE_TYPE_DIQ: + packet->bitfields2.engine_sel = + engine_sel__mes_unmap_queues__compute; + break; + case KFD_QUEUE_TYPE_SDMA: + case KFD_QUEUE_TYPE_SDMA_XGMI: + packet->bitfields2.engine_sel = + engine_sel__mes_unmap_queues__sdma0 + sdma_engine; + break; + default: + WARN(1, "queue type %d\n", type); + break; + } + + if (reset) + packet->bitfields2.action = + action__mes_unmap_queues__reset_queues; + else + packet->bitfields2.action = + action__mes_unmap_queues__preempt_queues; + + switch (filter) { + case KFD_UNMAP_QUEUES_FILTER_SINGLE_QUEUE: + packet->bitfields2.queue_sel = + queue_sel__mes_unmap_queues__perform_request_on_specified_queues; + packet->bitfields2.num_queues = 1; + packet->bitfields3b.doorbell_offset0 = filter_param; + break; + case KFD_UNMAP_QUEUES_FILTER_BY_PASID: + packet->bitfields2.queue_sel = + queue_sel__mes_unmap_queues__perform_request_on_pasid_queues; + packet->bitfields3a.pasid = filter_param; + break; + case KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES: + packet->bitfields2.queue_sel = + queue_sel__mes_unmap_queues__unmap_all_queues; + break; + case KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES: + /* in this case, we do not preempt static queues */ + packet->bitfields2.queue_sel = + queue_sel__mes_unmap_queues__unmap_all_non_static_queues; + break; + default: + WARN(1, "filter %d\n", filter); + break; + } + + return 0; + +} + +static int pm_query_status_v10(struct packet_manager *pm, uint32_t *buffer, + uint64_t fence_address, uint32_t fence_value) +{ + struct pm4_mes_query_status *packet; + + packet = (struct pm4_mes_query_status *)buffer; + memset(buffer, 0, sizeof(struct pm4_mes_query_status)); + + + packet->header.u32All = pm_build_pm4_header(IT_QUERY_STATUS, + sizeof(struct pm4_mes_query_status)); + + packet->bitfields2.context_id = 0; + packet->bitfields2.interrupt_sel = + interrupt_sel__mes_query_status__completion_status; + packet->bitfields2.command = + command__mes_query_status__fence_only_after_write_ack; + + packet->addr_hi = upper_32_bits((uint64_t)fence_address); + packet->addr_lo = lower_32_bits((uint64_t)fence_address); + packet->data_hi = upper_32_bits((uint64_t)fence_value); + packet->data_lo = lower_32_bits((uint64_t)fence_value); + + return 0; +} + + +static int pm_release_mem_v10(uint64_t gpu_addr, uint32_t *buffer) +{ + struct pm4_mec_release_mem *packet; + + WARN_ON(!buffer); + + packet = (struct pm4_mec_release_mem *)buffer; + memset(buffer, 0, sizeof(struct pm4_mec_release_mem)); + + packet->header.u32All = pm_build_pm4_header(IT_RELEASE_MEM, + sizeof(struct pm4_mec_release_mem)); + + packet->bitfields2.event_type = CACHE_FLUSH_AND_INV_TS_EVENT; + packet->bitfields2.event_index = event_index__mec_release_mem__end_of_pipe; + packet->bitfields2.tcl1_action_ena = 1; + packet->bitfields2.tc_action_ena = 1; + packet->bitfields2.cache_policy = cache_policy__mec_release_mem__lru; + + packet->bitfields3.data_sel = data_sel__mec_release_mem__send_32_bit_low; + packet->bitfields3.int_sel = + int_sel__mec_release_mem__send_interrupt_after_write_confirm; + + packet->bitfields4.address_lo_32b = (gpu_addr & 0xffffffff) >> 2; + packet->address_hi = upper_32_bits(gpu_addr); + + packet->data_lo = 0; + + return sizeof(struct pm4_mec_release_mem) / sizeof(unsigned int); +} + +const struct packet_manager_funcs kfd_v10_pm_funcs = { + .map_process = pm_map_process_v10, + .runlist = pm_runlist_v10, + .set_resources = pm_set_resources_vi, + .map_queues = pm_map_queues_v10, + .unmap_queues = pm_unmap_queues_v10, + .query_status = pm_query_status_v10, + .release_mem = pm_release_mem_v10, + .map_process_size = sizeof(struct pm4_mes_map_process), + .runlist_size = sizeof(struct pm4_mes_runlist), + .set_resources_size = sizeof(struct pm4_mes_set_resources), + .map_queues_size = sizeof(struct pm4_mes_map_queues), + .unmap_queues_size = sizeof(struct pm4_mes_unmap_queues), + .query_status_size = sizeof(struct pm4_mes_query_status), + .release_mem_size = sizeof(struct pm4_mec_release_mem) +}; + diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c new file mode 100644 index 000000000000..4f8a6ffc5775 --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c @@ -0,0 +1,498 @@ +/* + * Copyright 2018 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ + +#include +#include +#include +#include "kfd_priv.h" +#include "kfd_mqd_manager.h" +#include "v10_structs.h" +#include "gc/gc_10_1_0_offset.h" +#include "gc/gc_10_1_0_sh_mask.h" +#include "amdgpu_amdkfd.h" + +static inline struct v10_compute_mqd *get_mqd(void *mqd) +{ + return (struct v10_compute_mqd *)mqd; +} + +static inline struct v10_sdma_mqd *get_sdma_mqd(void *mqd) +{ + return (struct v10_sdma_mqd *)mqd; +} + +static void update_cu_mask(struct mqd_manager *mm, void *mqd, + struct queue_properties *q) +{ + struct v10_compute_mqd *m; + uint32_t se_mask[4] = {0}; /* 4 is the max # of SEs */ + + if (q->cu_mask_count == 0) + return; + + mqd_symmetrically_map_cu_mask(mm, + q->cu_mask, q->cu_mask_count, se_mask); + + m = get_mqd(mqd); + m->compute_static_thread_mgmt_se0 = se_mask[0]; + m->compute_static_thread_mgmt_se1 = se_mask[1]; + m->compute_static_thread_mgmt_se2 = se_mask[2]; + m->compute_static_thread_mgmt_se3 = se_mask[3]; + + pr_debug("update cu mask to %#x %#x %#x %#x\n", + m->compute_static_thread_mgmt_se0, + m->compute_static_thread_mgmt_se1, + m->compute_static_thread_mgmt_se2, + m->compute_static_thread_mgmt_se3); +} + +static struct kfd_mem_obj *allocate_mqd(struct kfd_dev *kfd, + struct queue_properties *q) +{ + int retval; + struct kfd_mem_obj *mqd_mem_obj = NULL; + + /* From V9, for CWSR, the control stack is located on the next page + * boundary after the mqd, we will use the gtt allocation function + * instead of sub-allocation function. + */ + if (kfd->cwsr_enabled && (q->type == KFD_QUEUE_TYPE_COMPUTE)) { + mqd_mem_obj = kzalloc(sizeof(struct kfd_mem_obj), GFP_NOIO); + if (!mqd_mem_obj) + return NULL; + retval = amdgpu_amdkfd_alloc_gtt_mem(kfd->kgd, + ALIGN(q->ctl_stack_size, PAGE_SIZE) + + ALIGN(sizeof(struct v10_compute_mqd), PAGE_SIZE), + &(mqd_mem_obj->gtt_mem), + &(mqd_mem_obj->gpu_addr), + (void *)&(mqd_mem_obj->cpu_ptr), true); + } else { + retval = kfd_gtt_sa_allocate(kfd, sizeof(struct v10_compute_mqd), + &mqd_mem_obj); + } + + if (retval) { + kfree(mqd_mem_obj); + return NULL; + } + + return mqd_mem_obj; + +} + +static void init_mqd(struct mqd_manager *mm, void **mqd, + struct kfd_mem_obj *mqd_mem_obj, uint64_t *gart_addr, + struct queue_properties *q) +{ + uint64_t addr; + struct v10_compute_mqd *m; + + m = (struct v10_compute_mqd *) mqd_mem_obj->cpu_ptr; + addr = mqd_mem_obj->gpu_addr; + + memset(m, 0, sizeof(struct v10_compute_mqd)); + + m->header = 0xC0310800; + m->compute_pipelinestat_enable = 1; + m->compute_static_thread_mgmt_se0 = 0xFFFFFFFF; + m->compute_static_thread_mgmt_se1 = 0xFFFFFFFF; + m->compute_static_thread_mgmt_se2 = 0xFFFFFFFF; + m->compute_static_thread_mgmt_se3 = 0xFFFFFFFF; + + m->cp_hqd_persistent_state = CP_HQD_PERSISTENT_STATE__PRELOAD_REQ_MASK | + 0x53 << CP_HQD_PERSISTENT_STATE__PRELOAD_SIZE__SHIFT; + + m->cp_mqd_control = 1 << CP_MQD_CONTROL__PRIV_STATE__SHIFT; + + m->cp_mqd_base_addr_lo = lower_32_bits(addr); + m->cp_mqd_base_addr_hi = upper_32_bits(addr); + + m->cp_hqd_quantum = 1 << CP_HQD_QUANTUM__QUANTUM_EN__SHIFT | + 1 << CP_HQD_QUANTUM__QUANTUM_SCALE__SHIFT | + 10 << CP_HQD_QUANTUM__QUANTUM_DURATION__SHIFT; + + m->cp_hqd_pipe_priority = 1; + m->cp_hqd_queue_priority = 15; + + if (q->format == KFD_QUEUE_FORMAT_AQL) { + m->cp_hqd_aql_control = + 1 << CP_HQD_AQL_CONTROL__CONTROL0__SHIFT; + } + + if (mm->dev->cwsr_enabled) { + m->cp_hqd_persistent_state |= + (1 << CP_HQD_PERSISTENT_STATE__QSWITCH_MODE__SHIFT); + m->cp_hqd_ctx_save_base_addr_lo = + lower_32_bits(q->ctx_save_restore_area_address); + m->cp_hqd_ctx_save_base_addr_hi = + upper_32_bits(q->ctx_save_restore_area_address); + m->cp_hqd_ctx_save_size = q->ctx_save_restore_area_size; + m->cp_hqd_cntl_stack_size = q->ctl_stack_size; + m->cp_hqd_cntl_stack_offset = q->ctl_stack_size; + m->cp_hqd_wg_state_offset = q->ctl_stack_size; + } + + *mqd = m; + if (gart_addr) + *gart_addr = addr; + mm->update_mqd(mm, m, q); +} + +static int load_mqd(struct mqd_manager *mm, void *mqd, + uint32_t pipe_id, uint32_t queue_id, + struct queue_properties *p, struct mm_struct *mms) +{ + int r = 0; + /* AQL write pointer counts in 64B packets, PM4/CP counts in dwords. */ + uint32_t wptr_shift = (p->format == KFD_QUEUE_FORMAT_AQL ? 4 : 0); + + r = mm->dev->kfd2kgd->hqd_load(mm->dev->kgd, mqd, pipe_id, queue_id, + (uint32_t __user *)p->write_ptr, + wptr_shift, 0, mms); + return r; +} + +static void update_mqd(struct mqd_manager *mm, void *mqd, + struct queue_properties *q) +{ + struct v10_compute_mqd *m; + + m = get_mqd(mqd); + + m->cp_hqd_pq_control = 5 << CP_HQD_PQ_CONTROL__RPTR_BLOCK_SIZE__SHIFT; + m->cp_hqd_pq_control |= + ffs(q->queue_size / sizeof(unsigned int)) - 1 - 1; + pr_debug("cp_hqd_pq_control 0x%x\n", m->cp_hqd_pq_control); + + m->cp_hqd_pq_base_lo = lower_32_bits((uint64_t)q->queue_address >> 8); + m->cp_hqd_pq_base_hi = upper_32_bits((uint64_t)q->queue_address >> 8); + + m->cp_hqd_pq_rptr_report_addr_lo = lower_32_bits((uint64_t)q->read_ptr); + m->cp_hqd_pq_rptr_report_addr_hi = upper_32_bits((uint64_t)q->read_ptr); + m->cp_hqd_pq_wptr_poll_addr_lo = lower_32_bits((uint64_t)q->write_ptr); + m->cp_hqd_pq_wptr_poll_addr_hi = upper_32_bits((uint64_t)q->write_ptr); + + m->cp_hqd_pq_doorbell_control = + q->doorbell_off << + CP_HQD_PQ_DOORBELL_CONTROL__DOORBELL_OFFSET__SHIFT; + pr_debug("cp_hqd_pq_doorbell_control 0x%x\n", + m->cp_hqd_pq_doorbell_control); + + m->cp_hqd_ib_control = 3 << CP_HQD_IB_CONTROL__MIN_IB_AVAIL_SIZE__SHIFT; + + /* + * HW does not clamp this field correctly. Maximum EOP queue size + * is constrained by per-SE EOP done signal count, which is 8-bit. + * Limit is 0xFF EOP entries (= 0x7F8 dwords). CP will not submit + * more than (EOP entry count - 1) so a queue size of 0x800 dwords + * is safe, giving a maximum field value of 0xA. + */ + m->cp_hqd_eop_control = min(0xA, + ffs(q->eop_ring_buffer_size / sizeof(unsigned int)) - 1 - 1); + m->cp_hqd_eop_base_addr_lo = + lower_32_bits(q->eop_ring_buffer_address >> 8); + m->cp_hqd_eop_base_addr_hi = + upper_32_bits(q->eop_ring_buffer_address >> 8); + + m->cp_hqd_iq_timer = 0; + + m->cp_hqd_vmid = q->vmid; + + if (q->format == KFD_QUEUE_FORMAT_AQL) { + /* GC 10 removed WPP_CLAMP from PQ Control */ + m->cp_hqd_pq_control |= CP_HQD_PQ_CONTROL__NO_UPDATE_RPTR_MASK | + 2 << CP_HQD_PQ_CONTROL__SLOT_BASED_WPTR__SHIFT | + 1 << CP_HQD_PQ_CONTROL__QUEUE_FULL_EN__SHIFT ; + m->cp_hqd_pq_doorbell_control |= + 1 << CP_HQD_PQ_DOORBELL_CONTROL__DOORBELL_BIF_DROP__SHIFT; + } + if (mm->dev->cwsr_enabled) + m->cp_hqd_ctx_save_control = 0; + + update_cu_mask(mm, mqd, q); + + q->is_active = (q->queue_size > 0 && + q->queue_address != 0 && + q->queue_percent > 0 && + !q->is_evicted); +} + +static int destroy_mqd(struct mqd_manager *mm, void *mqd, + enum kfd_preempt_type type, + unsigned int timeout, uint32_t pipe_id, + uint32_t queue_id) +{ + return mm->dev->kfd2kgd->hqd_destroy + (mm->dev->kgd, mqd, type, timeout, + pipe_id, queue_id); +} + +static void free_mqd(struct mqd_manager *mm, void *mqd, + struct kfd_mem_obj *mqd_mem_obj) +{ + struct kfd_dev *kfd = mm->dev; + + if (mqd_mem_obj->gtt_mem) { + amdgpu_amdkfd_free_gtt_mem(kfd->kgd, mqd_mem_obj->gtt_mem); + kfree(mqd_mem_obj); + } else { + kfd_gtt_sa_free(mm->dev, mqd_mem_obj); + } +} + +static bool is_occupied(struct mqd_manager *mm, void *mqd, + uint64_t queue_address, uint32_t pipe_id, + uint32_t queue_id) +{ + return mm->dev->kfd2kgd->hqd_is_occupied( + mm->dev->kgd, queue_address, + pipe_id, queue_id); +} + +static int get_wave_state(struct mqd_manager *mm, void *mqd, + void __user *ctl_stack, + u32 *ctl_stack_used_size, + u32 *save_area_used_size) +{ + struct v10_compute_mqd *m; + + /* Control stack is located one page after MQD. */ + void *mqd_ctl_stack = (void *)((uintptr_t)mqd + PAGE_SIZE); + + m = get_mqd(mqd); + + *ctl_stack_used_size = m->cp_hqd_cntl_stack_size - + m->cp_hqd_cntl_stack_offset; + *save_area_used_size = m->cp_hqd_wg_state_offset - + m->cp_hqd_cntl_stack_size; + + if (copy_to_user(ctl_stack, mqd_ctl_stack, m->cp_hqd_cntl_stack_size)) + return -EFAULT; + + return 0; +} + +static void init_mqd_hiq(struct mqd_manager *mm, void **mqd, + struct kfd_mem_obj *mqd_mem_obj, uint64_t *gart_addr, + struct queue_properties *q) +{ + struct v10_compute_mqd *m; + + init_mqd(mm, mqd, mqd_mem_obj, gart_addr, q); + + m = get_mqd(*mqd); + + m->cp_hqd_pq_control |= 1 << CP_HQD_PQ_CONTROL__PRIV_STATE__SHIFT | + 1 << CP_HQD_PQ_CONTROL__KMD_QUEUE__SHIFT; +} + +static void update_mqd_hiq(struct mqd_manager *mm, void *mqd, + struct queue_properties *q) +{ + struct v10_compute_mqd *m; + + update_mqd(mm, mqd, q); + + /* TODO: what's the point? update_mqd already does this. */ + m = get_mqd(mqd); + m->cp_hqd_vmid = q->vmid; +} + +static void init_mqd_sdma(struct mqd_manager *mm, void **mqd, + struct kfd_mem_obj *mqd_mem_obj, uint64_t *gart_addr, + struct queue_properties *q) +{ + struct v10_sdma_mqd *m; + + m = (struct v10_sdma_mqd *) mqd_mem_obj->cpu_ptr; + + memset(m, 0, sizeof(struct v10_sdma_mqd)); + + *mqd = m; + if (gart_addr) + *gart_addr = mqd_mem_obj->gpu_addr; + + mm->update_mqd(mm, m, q); +} + +static int load_mqd_sdma(struct mqd_manager *mm, void *mqd, + uint32_t pipe_id, uint32_t queue_id, + struct queue_properties *p, struct mm_struct *mms) +{ + return mm->dev->kfd2kgd->hqd_sdma_load(mm->dev->kgd, mqd, + (uint32_t __user *)p->write_ptr, + mms); +} + +#define SDMA_RLC_DUMMY_DEFAULT 0xf + +static void update_mqd_sdma(struct mqd_manager *mm, void *mqd, + struct queue_properties *q) +{ + struct v10_sdma_mqd *m; + + m = get_sdma_mqd(mqd); + m->sdmax_rlcx_rb_cntl = (ffs(q->queue_size / sizeof(unsigned int)) - 1) + << SDMA0_RLC0_RB_CNTL__RB_SIZE__SHIFT | + q->vmid << SDMA0_RLC0_RB_CNTL__RB_VMID__SHIFT | + 1 << SDMA0_RLC0_RB_CNTL__RPTR_WRITEBACK_ENABLE__SHIFT | + 6 << SDMA0_RLC0_RB_CNTL__RPTR_WRITEBACK_TIMER__SHIFT; + + m->sdmax_rlcx_rb_base = lower_32_bits(q->queue_address >> 8); + m->sdmax_rlcx_rb_base_hi = upper_32_bits(q->queue_address >> 8); + m->sdmax_rlcx_rb_rptr_addr_lo = lower_32_bits((uint64_t)q->read_ptr); + m->sdmax_rlcx_rb_rptr_addr_hi = upper_32_bits((uint64_t)q->read_ptr); + m->sdmax_rlcx_doorbell_offset = + q->doorbell_off << SDMA0_RLC0_DOORBELL_OFFSET__OFFSET__SHIFT; + + m->sdma_engine_id = q->sdma_engine_id; + m->sdma_queue_id = q->sdma_queue_id; + m->sdmax_rlcx_dummy_reg = SDMA_RLC_DUMMY_DEFAULT; + + + q->is_active = (q->queue_size > 0 && + q->queue_address != 0 && + q->queue_percent > 0 && + !q->is_evicted); +} + +/* + * * preempt type here is ignored because there is only one way + * * to preempt sdma queue + */ +static int destroy_mqd_sdma(struct mqd_manager *mm, void *mqd, + enum kfd_preempt_type type, + unsigned int timeout, uint32_t pipe_id, + uint32_t queue_id) +{ + return mm->dev->kfd2kgd->hqd_sdma_destroy(mm->dev->kgd, mqd, timeout); +} + +static bool is_occupied_sdma(struct mqd_manager *mm, void *mqd, + uint64_t queue_address, uint32_t pipe_id, + uint32_t queue_id) +{ + return mm->dev->kfd2kgd->hqd_sdma_is_occupied(mm->dev->kgd, mqd); +} + +#if defined(CONFIG_DEBUG_FS) + +static int debugfs_show_mqd(struct seq_file *m, void *data) +{ + seq_hex_dump(m, " ", DUMP_PREFIX_OFFSET, 32, 4, + data, sizeof(struct v10_compute_mqd), false); + return 0; +} + +static int debugfs_show_mqd_sdma(struct seq_file *m, void *data) +{ + seq_hex_dump(m, " ", DUMP_PREFIX_OFFSET, 32, 4, + data, sizeof(struct v10_sdma_mqd), false); + return 0; +} + +#endif + +struct mqd_manager *mqd_manager_init_v10(enum KFD_MQD_TYPE type, + struct kfd_dev *dev) +{ + struct mqd_manager *mqd; + + if (WARN_ON(type >= KFD_MQD_TYPE_MAX)) + return NULL; + + mqd = kzalloc(sizeof(*mqd), GFP_NOIO); + if (!mqd) + return NULL; + + mqd->dev = dev; + + switch (type) { + case KFD_MQD_TYPE_CP: + pr_debug("%s@%i\n", __func__, __LINE__); + case KFD_MQD_TYPE_COMPUTE: + pr_debug("%s@%i\n", __func__, __LINE__); + mqd->allocate_mqd = allocate_mqd; + mqd->init_mqd = init_mqd; + mqd->free_mqd = free_mqd; + mqd->load_mqd = load_mqd; + mqd->update_mqd = update_mqd; + mqd->destroy_mqd = destroy_mqd; + mqd->is_occupied = is_occupied; + mqd->mqd_size = sizeof(struct v10_compute_mqd); + mqd->get_wave_state = get_wave_state; +#if defined(CONFIG_DEBUG_FS) + mqd->debugfs_show_mqd = debugfs_show_mqd; +#endif + pr_debug("%s@%i\n", __func__, __LINE__); + break; + case KFD_MQD_TYPE_HIQ: + pr_debug("%s@%i\n", __func__, __LINE__); + mqd->allocate_mqd = allocate_hiq_mqd; + mqd->init_mqd = init_mqd_hiq; + mqd->free_mqd = free_mqd_hiq_sdma; + mqd->load_mqd = load_mqd; + mqd->update_mqd = update_mqd_hiq; + mqd->destroy_mqd = destroy_mqd; + mqd->is_occupied = is_occupied; + mqd->mqd_size = sizeof(struct v10_compute_mqd); +#if defined(CONFIG_DEBUG_FS) + mqd->debugfs_show_mqd = debugfs_show_mqd; +#endif + pr_debug("%s@%i\n", __func__, __LINE__); + break; + case KFD_MQD_TYPE_DIQ: + mqd->allocate_mqd = allocate_hiq_mqd; + mqd->init_mqd = init_mqd_hiq; + mqd->free_mqd = free_mqd; + mqd->load_mqd = load_mqd; + mqd->update_mqd = update_mqd_hiq; + mqd->destroy_mqd = destroy_mqd; + mqd->is_occupied = is_occupied; + mqd->mqd_size = sizeof(struct v10_compute_mqd); +#if defined(CONFIG_DEBUG_FS) + mqd->debugfs_show_mqd = debugfs_show_mqd; +#endif + break; + case KFD_MQD_TYPE_SDMA: + pr_debug("%s@%i\n", __func__, __LINE__); + mqd->allocate_mqd = allocate_sdma_mqd; + mqd->init_mqd = init_mqd_sdma; + mqd->free_mqd = free_mqd_hiq_sdma; + mqd->load_mqd = load_mqd_sdma; + mqd->update_mqd = update_mqd_sdma; + mqd->destroy_mqd = destroy_mqd_sdma; + mqd->is_occupied = is_occupied_sdma; + mqd->mqd_size = sizeof(struct v10_sdma_mqd); +#if defined(CONFIG_DEBUG_FS) + mqd->debugfs_show_mqd = debugfs_show_mqd_sdma; +#endif + pr_debug("%s@%i\n", __func__, __LINE__); + break; + default: + kfree(mqd); + return NULL; + } + + return mqd; +} diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c index 808194663a7d..c72c8f5fd54c 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c @@ -237,6 +237,9 @@ int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm) case CHIP_RAVEN: pm->pmf = &kfd_v9_pm_funcs; break; + case CHIP_NAVI10: + pm->pmf = &kfd_v10_pm_funcs; + break; default: WARN(1, "Unexpected ASIC family %u", dqm->dev->device_info->asic_family); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h index bd01396c8cea..f88c0d706b1d 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h @@ -105,6 +105,8 @@ #define KFD_KERNEL_QUEUE_SIZE 2048 +#define KFD_UNMAP_LATENCY_MS (4000) + /* * 512 = 0x200 * The doorbell index distance between SDMA RLC (2*i) and (2*i+1) in the @@ -167,11 +169,20 @@ extern int halt_if_hws_hang; */ extern bool hws_gws_support; +/* + * Queue preemption timeout in ms + */ +extern int queue_preemption_timeout_ms; + enum cache_policy { cache_policy_coherent, cache_policy_noncoherent }; +#define KFD_IS_VI(chip) ((chip) >= CHIP_CARRIZO && (chip) <= CHIP_POLARIS11) +#define KFD_IS_DGPU(chip) (((chip) >= CHIP_TONGA && \ + (chip) <= CHIP_NAVI10) || \ + (chip) == CHIP_HAWAII) #define KFD_IS_SOC15(chip) ((chip) >= CHIP_VEGA10) struct kfd_event_interrupt_class { @@ -870,6 +881,8 @@ struct mqd_manager *mqd_manager_init_vi_tonga(enum KFD_MQD_TYPE type, struct kfd_dev *dev); struct mqd_manager *mqd_manager_init_v9(enum KFD_MQD_TYPE type, struct kfd_dev *dev); +struct mqd_manager *mqd_manager_init_v10(enum KFD_MQD_TYPE type, + struct kfd_dev *dev); struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev); void device_queue_manager_uninit(struct device_queue_manager *dqm); struct kernel_queue *kernel_queue_init(struct kfd_dev *dev, @@ -909,8 +922,8 @@ int pqm_get_wave_state(struct process_queue_manager *pqm, u32 *save_area_used_size); int amdkfd_fence_wait_timeout(unsigned int *fence_addr, - unsigned int fence_value, - unsigned int timeout_ms); + unsigned int fence_value, + unsigned int timeout_ms); /* Packet Manager */ @@ -959,6 +972,7 @@ struct packet_manager_funcs { extern const struct packet_manager_funcs kfd_vi_pm_funcs; extern const struct packet_manager_funcs kfd_v9_pm_funcs; +extern const struct packet_manager_funcs kfd_v10_pm_funcs; int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm); void pm_uninit(struct packet_manager *pm); @@ -978,7 +992,8 @@ void pm_release_ib(struct packet_manager *pm); /* Following PM funcs can be shared among VI and AI */ unsigned int pm_build_pm4_header(unsigned int opcode, size_t packet_size); int pm_set_resources_vi(struct packet_manager *pm, uint32_t *buffer, - struct scheduling_resources *res); + struct scheduling_resources *res); + uint64_t kfd_get_number_elems(struct kfd_dev *kfd); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c index ed2d83f93fd8..8f1076c0c88a 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c @@ -1203,3 +1203,4 @@ int kfd_debugfs_mqds_by_process(struct seq_file *m, void *data) } #endif + diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c index 2c40ab4fe8de..c2e6e47abaf2 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c @@ -1321,6 +1321,7 @@ int kfd_topology_add_device(struct kfd_dev *gpu) case CHIP_VEGA12: case CHIP_VEGA20: case CHIP_RAVEN: + case CHIP_NAVI10: dev->node_props.capability |= ((HSA_CAP_DOORBELL_TYPE_2_0 << HSA_CAP_DOORBELL_TYPE_TOTALBITS_SHIFT) & HSA_CAP_DOORBELL_TYPE_TOTALBITS_MASK); -- cgit v1.2.3 From bd1c0fdfc1a166859ad926dce2f12e3155b65e7d Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Fri, 19 Apr 2019 17:58:21 -0500 Subject: drm/amdgpu: add Navi10 pci ids Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index 2c985fa6a9bc..6920e2372632 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -983,6 +983,12 @@ static const struct pci_device_id pciidlist[] = { /* Raven */ {0x1002, 0x15dd, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_RAVEN|AMD_IS_APU}, {0x1002, 0x15d8, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_RAVEN|AMD_IS_APU}, + /* Navi10 */ + {0x1002, 0x7310, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_NAVI10}, + {0x1002, 0x7312, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_NAVI10}, + {0x1002, 0x7318, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_NAVI10}, + {0x1002, 0x731A, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_NAVI10}, + {0x1002, 0x731F, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_NAVI10}, {0, 0, 0} }; -- cgit v1.2.3 From 54b998ca8d1ccbf626b398ff6ad0d1dfa771e197 Mon Sep 17 00:00:00 2001 From: Hawking Zhang Date: Mon, 1 Apr 2019 15:32:48 +0800 Subject: drm/amdgpu: enable sw smu driver for navi10 by default Navi10 will use sw smu driver for dynamic power managment, while vega20 could also use sw smu driver when amdgpu_dpm is set to 2 Signed-off-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 4 +++- drivers/gpu/drm/amd/powerplay/amdgpu_smu.c | 11 +++++------ 2 files changed, 8 insertions(+), 7 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index 6920e2372632..cb081f2e6829 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -251,7 +251,9 @@ module_param_string(lockup_timeout, amdgpu_lockup_timeout, sizeof(amdgpu_lockup_ /** * DOC: dpm (int) - * Override for dynamic power management setting (1 = enable, 0 = disable). The default is -1 (auto). + * Override for dynamic power management setting + * (0 = disable, 1 = enable, 2 = enable sw smu driver for vega20) + * The default is -1 (auto). */ MODULE_PARM_DESC(dpm, "DPM support (1 = enable, 0 = disable, -1 = auto)"); module_param_named(dpm, amdgpu_dpm, int, 0444); diff --git a/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c b/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c index 63df59e7335d..785321da1141 100644 --- a/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c +++ b/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c @@ -168,13 +168,12 @@ int smu_update_table_with_arg(struct smu_context *smu, uint16_t table_id, uint16 bool is_support_sw_smu(struct amdgpu_device *adev) { - if (amdgpu_dpm != 1) - return false; - - if (adev->asic_type >= CHIP_VEGA20 && adev->asic_type != CHIP_RAVEN) + if (adev->asic_type == CHIP_VEGA20) + return (amdgpu_dpm == 2) ? true : false; + else if (adev->asic_type >= CHIP_NAVI10) return true; - - return false; + else + return false; } int smu_sys_get_pp_table(struct smu_context *smu, void **table) -- cgit v1.2.3 From 4f56d9d412ed8f2edb15d54d24bdf87c4c4e1fc0 Mon Sep 17 00:00:00 2001 From: tiancyin Date: Thu, 30 May 2019 11:04:11 +0800 Subject: drm/amdgpu: add new navi10 DIDs Reviewed-by: Jack Xiao Signed-off-by: tiancyin Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index cb081f2e6829..d8750d9daf96 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -989,7 +989,9 @@ static const struct pci_device_id pciidlist[] = { {0x1002, 0x7310, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_NAVI10}, {0x1002, 0x7312, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_NAVI10}, {0x1002, 0x7318, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_NAVI10}, + {0x1002, 0x7319, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_NAVI10}, {0x1002, 0x731A, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_NAVI10}, + {0x1002, 0x731B, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_NAVI10}, {0x1002, 0x731F, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_NAVI10}, {0, 0, 0} -- cgit v1.2.3