From a6bef67e2abae4f19de8fb4ec34acb1df8156a75 Mon Sep 17 00:00:00 2001 From: Chunming Zhou Date: Mon, 24 Apr 2017 17:39:00 +0800 Subject: drm/amdgpu: fix NULL pointer error MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ 141.420491] BUG: unable to handle kernel NULL pointer dereference at 0000000000000030 [ 141.420532] IP: [] fence_remove_callback+0x11/0x60 [ 141.420563] PGD 20a030067 [ 141.420575] PUD 2088ca067 [ 141.420587] PMD 0 [ 141.420599] Oops: 0000 [#1] SMP [ 141.420612] Modules linked in: amdgpu(OE) ttm(OE) drm_kms_helper(E) drm(E) i2c_algo_bit(E) fb_sys_fops(E) syscopyarea(E) sysfillrect(E) sysimgblt(E) rpcsec_gss_krb5(E) nfsv4(E) nfs(E) fscache(E) eeepc_wmi(E) asus_wmi(E) sparse_keymap(E) snd_hda_codec_realtek(E) video(E) snd_hda_codec_generic(E) snd_hda_codec_hdmi(E) snd_hda_intel(E) joydev(E) snd_hda_codec(E) snd_seq_midi(E) snd_seq_midi_event(E) snd_hda_core(E) snd_hwdep(E) snd_rawmidi(E) snd_pcm(E) kvm(E) irqbypass(E) crct10dif_pclmul(E) snd_seq(E) crc32_pclmul(E) ghash_clmulni_intel(E) snd_seq_device(E) snd_timer(E) aesni_intel(E) aes_x86_64(E) lrw(E) gf128mul(E) glue_helper(E) ablk_helper(E) cryptd(E) snd(E) soundcore(E) serio_raw(E) shpchp(E) i2c_piix4(E) i2c_designware_platform(E) 8250_dw(E) i2c_designware_core(E) mac_hid(E) binfmt_misc(E) [ 141.420948] nfsd(E) auth_rpcgss(E) nfs_acl(E) lockd(E) grace(E) sunrpc(E) parport_pc(E) ppdev(E) lp(E) parport(E) autofs4(E) hid_generic(E) usbhid(E) hid(E) psmouse(E) r8169(E) ahci(E) mii(E) libahci(E) wmi(E) [ 141.421042] CPU: 14 PID: 223 Comm: kworker/14:2 Tainted: G OE 4.9.0-custom #4 [ 141.421074] Hardware name: System manufacturer System Product Name/PRIME B350-PLUS, BIOS 0606 04/06/2017 [ 141.421146] Workqueue: events amd_sched_job_timedout [amdgpu] [ 141.421169] task: ffff88020b03ba80 task.stack: ffffc900016f4000 [ 141.421193] RIP: 0010:[] [] fence_remove_callback+0x11/0x60 [ 141.421229] RSP: 0018:ffffc900016f7d30 EFLAGS: 00010202 [ 141.421250] RAX: ffff8801c049fc00 RBX: ffff8801d4d8dc00 RCX: 0000000000000000 [ 141.421278] RDX: 0000000000000001 RSI: ffff8801c049fcc0 RDI: 0000000000000000 [ 141.421307] RBP: ffffc900016f7d48 R08: 0000000000000000 R09: 0000000000000000 [ 141.421334] R10: 00000020ed512a30 R11: 0000000000000001 R12: 0000000000000000 [ 141.421362] R13: ffff880209ba4ba0 R14: ffff880209ba4c58 R15: ffff8801c055cc60 [ 141.421390] FS: 0000000000000000(0000) GS:ffff88021ef80000(0000) knlGS:0000000000000000 [ 141.421421] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 141.421443] CR2: 0000000000000030 CR3: 000000020b554000 CR4: 00000000003406e0 [ 141.421471] Stack: [ 141.421480] ffff8801d4d8dc00 ffff880209ba4c48 ffff880209ba4ba0 ffffc900016f7d78 [ 141.421513] ffffffffa0697920 ffff880209ba0000 0000000000000000 ffff880209ba2770 [ 141.421549] ffff880209ba4b08 ffffc900016f7df0 ffffffffa05ce2ae ffffffffa0509eb7 [ 141.421583] Call Trace: [ 141.421628] [] amd_sched_hw_job_reset+0x50/0xb0 [amdgpu] [ 141.421676] [] amdgpu_gpu_reset+0x8e/0x690 [amdgpu] [ 141.421712] [] ? drm_printk+0x97/0xa0 [drm] [ 141.421770] [] amdgpu_job_timedout+0x46/0x50 [amdgpu] [ 141.421829] [] amd_sched_job_timedout+0x17/0x20 [amdgpu] [ 141.421859] [] process_one_work+0x153/0x3f0 [ 141.421884] [] worker_thread+0x12b/0x4b0 [ 141.421907] [] ? rescuer_thread+0x350/0x350 [ 141.421931] [] kthread+0xd3/0xf0 [ 141.421951] [] ? kthread_park+0x60/0x60 [ 141.421975] [] ret_from_fork+0x25/0x30 [ 141.421996] Code: ac 81 e8 a3 1f b0 ff 48 c7 c0 ea ff ff ff e9 48 ff ff ff 0f 1f 80 00 00 00 00 0f 1f 44 00 00 55 48 89 e5 41 55 41 54 49 89 fc 53 <48> 8b 7f 30 48 89 f3 e8 73 7c 26 00 48 8b 13 48 39 d3 41 0f 95 [ 141.422156] RIP [] fence_remove_callback+0x11/0x60 [ 141.422183] RSP [ 141.422197] CR2: 0000000000000030 [ 141.433483] ---[ end trace bc0949bf7ddd6d4b ]--- if the job is reset twice, then the parent could be NULL. Signed-off-by: Chunming Zhou Reviewed-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/scheduler/gpu_scheduler.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'drivers/gpu/drm/amd/scheduler/gpu_scheduler.c') diff --git a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c index acd882a188bc..5fd12db0fc58 100644 --- a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c +++ b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c @@ -387,7 +387,9 @@ void amd_sched_hw_job_reset(struct amd_gpu_scheduler *sched) spin_lock(&sched->job_list_lock); list_for_each_entry_reverse(s_job, &sched->ring_mirror_list, node) { - if (dma_fence_remove_callback(s_job->s_fence->parent, &s_job->s_fence->cb)) { + if (s_job->s_fence->parent && + dma_fence_remove_callback(s_job->s_fence->parent, + &s_job->s_fence->cb)) { dma_fence_put(s_job->s_fence->parent); s_job->s_fence->parent = NULL; } -- cgit v1.2.3 From cb3696fdeca24d3a347a5225d934ce50c6edccba Mon Sep 17 00:00:00 2001 From: Chunming Zhou Date: Tue, 9 May 2017 15:34:07 +0800 Subject: drm/amd: fix init order of sched job MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Need to increment after the fence check. Signed-off-by: Chunming Zhou Reviewed-by: Junwei Zhang Reviewed-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/scheduler/gpu_scheduler.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/gpu/drm/amd/scheduler/gpu_scheduler.c') diff --git a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c index 5fd12db0fc58..28b92c8d9985 100644 --- a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c +++ b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c @@ -462,9 +462,9 @@ int amd_sched_job_init(struct amd_sched_job *job, job->sched = sched; job->s_entity = entity; job->s_fence = amd_sched_fence_create(entity, owner); - job->id = atomic64_inc_return(&sched->job_id_count); if (!job->s_fence) return -ENOMEM; + job->id = atomic64_inc_return(&sched->job_id_count); INIT_WORK(&job->finish_work, amd_sched_job_finish); INIT_LIST_HEAD(&job->node); -- cgit v1.2.3 From 30514decb27d45b98599612cb5d3e6a20ba733a5 Mon Sep 17 00:00:00 2001 From: Chunming Zhou Date: Tue, 9 May 2017 13:39:40 +0800 Subject: drm/amdgpu: fix dependency issue MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The problem is that executing the jobs in the right order doesn't give you the right result because consecutive jobs executed on the same engine are pipelined. In other words job B does it buffer read before job A has written it's result. Signed-off-by: Chunming Zhou Reviewed-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 1 + drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c | 2 ++ drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 4 ++++ drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 2 +- drivers/gpu/drm/amd/scheduler/gpu_scheduler.c | 17 +++++++++++++++++ drivers/gpu/drm/amd/scheduler/gpu_scheduler.h | 2 ++ 6 files changed, 27 insertions(+), 1 deletion(-) (limited to 'drivers/gpu/drm/amd/scheduler/gpu_scheduler.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index 86923c57908b..833c3c16501a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -1129,6 +1129,7 @@ struct amdgpu_job { void *owner; uint64_t fence_ctx; /* the fence_context this job uses */ bool vm_needs_flush; + bool need_pipeline_sync; unsigned vm_id; uint64_t vm_pd_addr; uint32_t gds_base, gds_size; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c index 2d11ac8d1aa9..6e4ae0d983c2 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c @@ -160,6 +160,8 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs, dev_err(adev->dev, "scheduling IB failed (%d).\n", r); return r; } + if (ring->funcs->emit_pipeline_sync && job && job->need_pipeline_sync) + amdgpu_ring_emit_pipeline_sync(ring); if (vm) { r = amdgpu_vm_flush(ring, job); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c index c3cfeb335d99..7570f2439a11 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c @@ -57,6 +57,7 @@ int amdgpu_job_alloc(struct amdgpu_device *adev, unsigned num_ibs, (*job)->vm = vm; (*job)->ibs = (void *)&(*job)[1]; (*job)->num_ibs = num_ibs; + (*job)->need_pipeline_sync = false; amdgpu_sync_create(&(*job)->sync); @@ -152,6 +153,9 @@ static struct dma_fence *amdgpu_job_dependency(struct amd_sched_job *sched_job) fence = amdgpu_sync_get_fence(&job->sync); } + if (amd_sched_dependency_optimized(fence, sched_job->s_entity)) + job->need_pipeline_sync = true; + return fence; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c index c42a9979d056..07ff3b1514f1 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c @@ -614,7 +614,7 @@ int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job) if (ring->funcs->init_cond_exec) patch_offset = amdgpu_ring_init_cond_exec(ring); - if (ring->funcs->emit_pipeline_sync) + if (ring->funcs->emit_pipeline_sync && !job->need_pipeline_sync) amdgpu_ring_emit_pipeline_sync(ring); if (ring->funcs->emit_vm_flush && vm_flush_needed) { diff --git a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c index 28b92c8d9985..fea96a765cf1 100644 --- a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c +++ b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c @@ -236,6 +236,23 @@ static void amd_sched_entity_clear_dep(struct dma_fence *f, struct dma_fence_cb dma_fence_put(f); } +bool amd_sched_dependency_optimized(struct dma_fence* fence, + struct amd_sched_entity *entity) +{ + struct amd_gpu_scheduler *sched = entity->sched; + struct amd_sched_fence *s_fence; + + if (!fence || dma_fence_is_signaled(fence)) + return false; + if (fence->context == entity->fence_context) + return true; + s_fence = to_amd_sched_fence(fence); + if (s_fence && s_fence->sched == sched) + return true; + + return false; +} + static bool amd_sched_entity_add_dependency_cb(struct amd_sched_entity *entity) { struct amd_gpu_scheduler *sched = entity->sched; diff --git a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h index 0255c7f8a6d8..924d4a5899e1 100644 --- a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h +++ b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h @@ -158,4 +158,6 @@ int amd_sched_job_init(struct amd_sched_job *job, void *owner); void amd_sched_hw_job_reset(struct amd_gpu_scheduler *sched); void amd_sched_job_recovery(struct amd_gpu_scheduler *sched); +bool amd_sched_dependency_optimized(struct dma_fence* fence, + struct amd_sched_entity *entity); #endif -- cgit v1.2.3