summaryrefslogtreecommitdiff
path: root/drivers/accel/habanalabs/common/device.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/accel/habanalabs/common/device.c')
-rw-r--r--drivers/accel/habanalabs/common/device.c367
1 files changed, 234 insertions, 133 deletions
diff --git a/drivers/accel/habanalabs/common/device.c b/drivers/accel/habanalabs/common/device.c
index 9933e5858a36..fabfc501ef54 100644
--- a/drivers/accel/habanalabs/common/device.c
+++ b/drivers/accel/habanalabs/common/device.c
@@ -22,7 +22,6 @@
enum dma_alloc_type {
DMA_ALLOC_COHERENT,
- DMA_ALLOC_CPU_ACCESSIBLE,
DMA_ALLOC_POOL,
};
@@ -121,9 +120,6 @@ static void *hl_dma_alloc_common(struct hl_device *hdev, size_t size, dma_addr_t
case DMA_ALLOC_COHERENT:
ptr = hdev->asic_funcs->asic_dma_alloc_coherent(hdev, size, dma_handle, flag);
break;
- case DMA_ALLOC_CPU_ACCESSIBLE:
- ptr = hdev->asic_funcs->cpu_accessible_dma_pool_alloc(hdev, size, dma_handle);
- break;
case DMA_ALLOC_POOL:
ptr = hdev->asic_funcs->asic_dma_pool_zalloc(hdev, size, flag, dma_handle);
break;
@@ -147,9 +143,6 @@ static void hl_asic_dma_free_common(struct hl_device *hdev, size_t size, void *c
case DMA_ALLOC_COHERENT:
hdev->asic_funcs->asic_dma_free_coherent(hdev, size, cpu_addr, dma_handle);
break;
- case DMA_ALLOC_CPU_ACCESSIBLE:
- hdev->asic_funcs->cpu_accessible_dma_pool_free(hdev, size, cpu_addr);
- break;
case DMA_ALLOC_POOL:
hdev->asic_funcs->asic_dma_pool_free(hdev, cpu_addr, dma_handle);
break;
@@ -170,18 +163,6 @@ void hl_asic_dma_free_coherent_caller(struct hl_device *hdev, size_t size, void
hl_asic_dma_free_common(hdev, size, cpu_addr, dma_handle, DMA_ALLOC_COHERENT, caller);
}
-void *hl_cpu_accessible_dma_pool_alloc_caller(struct hl_device *hdev, size_t size,
- dma_addr_t *dma_handle, const char *caller)
-{
- return hl_dma_alloc_common(hdev, size, dma_handle, 0, DMA_ALLOC_CPU_ACCESSIBLE, caller);
-}
-
-void hl_cpu_accessible_dma_pool_free_caller(struct hl_device *hdev, size_t size, void *vaddr,
- const char *caller)
-{
- hl_asic_dma_free_common(hdev, size, vaddr, 0, DMA_ALLOC_CPU_ACCESSIBLE, caller);
-}
-
void *hl_asic_dma_pool_zalloc_caller(struct hl_device *hdev, size_t size, gfp_t mem_flags,
dma_addr_t *dma_handle, const char *caller)
{
@@ -194,6 +175,16 @@ void hl_asic_dma_pool_free_caller(struct hl_device *hdev, void *vaddr, dma_addr_
hl_asic_dma_free_common(hdev, 0, vaddr, dma_addr, DMA_ALLOC_POOL, caller);
}
+void *hl_cpu_accessible_dma_pool_alloc(struct hl_device *hdev, size_t size, dma_addr_t *dma_handle)
+{
+ return hdev->asic_funcs->cpu_accessible_dma_pool_alloc(hdev, size, dma_handle);
+}
+
+void hl_cpu_accessible_dma_pool_free(struct hl_device *hdev, size_t size, void *vaddr)
+{
+ hdev->asic_funcs->cpu_accessible_dma_pool_free(hdev, size, vaddr);
+}
+
int hl_dma_map_sgtable(struct hl_device *hdev, struct sg_table *sgt, enum dma_data_direction dir)
{
struct asic_fixed_properties *prop = &hdev->asic_prop;
@@ -389,18 +380,17 @@ bool hl_ctrl_device_operational(struct hl_device *hdev,
static void print_idle_status_mask(struct hl_device *hdev, const char *message,
u64 idle_mask[HL_BUSY_ENGINES_MASK_EXT_SIZE])
{
- u32 pad_width[HL_BUSY_ENGINES_MASK_EXT_SIZE] = {};
-
- BUILD_BUG_ON(HL_BUSY_ENGINES_MASK_EXT_SIZE != 4);
-
- pad_width[3] = idle_mask[3] ? 16 : 0;
- pad_width[2] = idle_mask[2] || pad_width[3] ? 16 : 0;
- pad_width[1] = idle_mask[1] || pad_width[2] ? 16 : 0;
- pad_width[0] = idle_mask[0] || pad_width[1] ? 16 : 0;
-
- dev_err(hdev->dev, "%s (mask %0*llx_%0*llx_%0*llx_%0*llx)\n",
- message, pad_width[3], idle_mask[3], pad_width[2], idle_mask[2],
- pad_width[1], idle_mask[1], pad_width[0], idle_mask[0]);
+ if (idle_mask[3])
+ dev_err(hdev->dev, "%s (mask %#llx_%016llx_%016llx_%016llx)\n",
+ message, idle_mask[3], idle_mask[2], idle_mask[1], idle_mask[0]);
+ else if (idle_mask[2])
+ dev_err(hdev->dev, "%s (mask %#llx_%016llx_%016llx)\n",
+ message, idle_mask[2], idle_mask[1], idle_mask[0]);
+ else if (idle_mask[1])
+ dev_err(hdev->dev, "%s (mask %#llx_%016llx)\n",
+ message, idle_mask[1], idle_mask[0]);
+ else
+ dev_err(hdev->dev, "%s (mask %#llx)\n", message, idle_mask[0]);
}
static void hpriv_release(struct kref *ref)
@@ -423,6 +413,9 @@ static void hpriv_release(struct kref *ref)
mutex_destroy(&hpriv->ctx_lock);
mutex_destroy(&hpriv->restore_phase_mutex);
+ /* There should be no memory buffers at this point and handles IDR can be destroyed */
+ hl_mem_mgr_idr_destroy(&hpriv->mem_mgr);
+
/* Device should be reset if reset-upon-device-release is enabled, or if there is a pending
* reset that waits for device release.
*/
@@ -492,6 +485,36 @@ int hl_hpriv_put(struct hl_fpriv *hpriv)
return kref_put(&hpriv->refcount, hpriv_release);
}
+static void print_device_in_use_info(struct hl_device *hdev, const char *message)
+{
+ u32 active_cs_num, dmabuf_export_cnt;
+ bool unknown_reason = true;
+ char buf[128];
+ size_t size;
+ int offset;
+
+ size = sizeof(buf);
+ offset = 0;
+
+ active_cs_num = hl_get_active_cs_num(hdev);
+ if (active_cs_num) {
+ unknown_reason = false;
+ offset += scnprintf(buf + offset, size - offset, " [%u active CS]", active_cs_num);
+ }
+
+ dmabuf_export_cnt = atomic_read(&hdev->dmabuf_export_cnt);
+ if (dmabuf_export_cnt) {
+ unknown_reason = false;
+ offset += scnprintf(buf + offset, size - offset, " [%u exported dma-buf]",
+ dmabuf_export_cnt);
+ }
+
+ if (unknown_reason)
+ scnprintf(buf + offset, size - offset, " [unknown reason]");
+
+ dev_notice(hdev->dev, "%s%s\n", message, buf);
+}
+
/*
* hl_device_release - release function for habanalabs device
*
@@ -514,17 +537,20 @@ static int hl_device_release(struct inode *inode, struct file *filp)
}
hl_ctx_mgr_fini(hdev, &hpriv->ctx_mgr);
+
+ /* Memory buffers might be still in use at this point and thus the handles IDR destruction
+ * is postponed to hpriv_release().
+ */
hl_mem_mgr_fini(&hpriv->mem_mgr);
hdev->compute_ctx_in_release = 1;
if (!hl_hpriv_put(hpriv)) {
- dev_notice(hdev->dev, "User process closed FD but device still in use\n");
+ print_device_in_use_info(hdev, "User process closed FD but device still in use");
hl_device_reset(hdev, HL_DRV_RESET_HARD);
}
- hdev->last_open_session_duration_jif =
- jiffies - hdev->last_successful_open_jif;
+ hdev->last_open_session_duration_jif = jiffies - hdev->last_successful_open_jif;
return 0;
}
@@ -617,7 +643,7 @@ static void device_release_func(struct device *dev)
* device_init_cdev - Initialize cdev and device for habanalabs device
*
* @hdev: pointer to habanalabs device structure
- * @hclass: pointer to the class object of the device
+ * @class: pointer to the class object of the device
* @minor: minor number of the specific device
* @fpos: file operations to install for this device
* @name: name of the device as it will appear in the filesystem
@@ -626,7 +652,7 @@ static void device_release_func(struct device *dev)
*
* Initialize a cdev and a Linux device for habanalabs's device.
*/
-static int device_init_cdev(struct hl_device *hdev, struct class *hclass,
+static int device_init_cdev(struct hl_device *hdev, struct class *class,
int minor, const struct file_operations *fops,
char *name, struct cdev *cdev,
struct device **dev)
@@ -640,7 +666,7 @@ static int device_init_cdev(struct hl_device *hdev, struct class *hclass,
device_initialize(*dev);
(*dev)->devt = MKDEV(hdev->major, minor);
- (*dev)->class = hclass;
+ (*dev)->class = class;
(*dev)->release = device_release_func;
dev_set_drvdata(*dev, hdev);
dev_set_name(*dev, "%s", name);
@@ -733,14 +759,14 @@ static void device_hard_reset_pending(struct work_struct *work)
static void device_release_watchdog_func(struct work_struct *work)
{
- struct hl_device_reset_work *device_release_watchdog_work =
- container_of(work, struct hl_device_reset_work, reset_work.work);
- struct hl_device *hdev = device_release_watchdog_work->hdev;
+ struct hl_device_reset_work *watchdog_work =
+ container_of(work, struct hl_device_reset_work, reset_work.work);
+ struct hl_device *hdev = watchdog_work->hdev;
u32 flags;
- dev_dbg(hdev->dev, "Device wasn't released in time. Initiate device reset.\n");
+ dev_dbg(hdev->dev, "Device wasn't released in time. Initiate hard-reset.\n");
- flags = device_release_watchdog_work->flags | HL_DRV_RESET_FROM_WD_THR;
+ flags = watchdog_work->flags | HL_DRV_RESET_HARD | HL_DRV_RESET_FROM_WD_THR;
hl_device_reset(hdev, flags);
}
@@ -805,7 +831,7 @@ static int device_early_init(struct hl_device *hdev)
}
for (i = 0 ; i < hdev->asic_prop.completion_queues_count ; i++) {
- snprintf(workq_name, 32, "hl-free-jobs-%u", (u32) i);
+ snprintf(workq_name, 32, "hl%u-free-jobs-%u", hdev->cdev_idx, (u32) i);
hdev->cq_wq[i] = create_singlethread_workqueue(workq_name);
if (hdev->cq_wq[i] == NULL) {
dev_err(hdev->dev, "Failed to allocate CQ workqueue\n");
@@ -814,14 +840,16 @@ static int device_early_init(struct hl_device *hdev)
}
}
- hdev->eq_wq = create_singlethread_workqueue("hl-events");
+ snprintf(workq_name, 32, "hl%u-events", hdev->cdev_idx);
+ hdev->eq_wq = create_singlethread_workqueue(workq_name);
if (hdev->eq_wq == NULL) {
dev_err(hdev->dev, "Failed to allocate EQ workqueue\n");
rc = -ENOMEM;
goto free_cq_wq;
}
- hdev->cs_cmplt_wq = alloc_workqueue("hl-cs-completions", WQ_UNBOUND, 0);
+ snprintf(workq_name, 32, "hl%u-cs-completions", hdev->cdev_idx);
+ hdev->cs_cmplt_wq = alloc_workqueue(workq_name, WQ_UNBOUND, 0);
if (!hdev->cs_cmplt_wq) {
dev_err(hdev->dev,
"Failed to allocate CS completions workqueue\n");
@@ -829,7 +857,8 @@ static int device_early_init(struct hl_device *hdev)
goto free_eq_wq;
}
- hdev->ts_free_obj_wq = alloc_workqueue("hl-ts-free-obj", WQ_UNBOUND, 0);
+ snprintf(workq_name, 32, "hl%u-ts-free-obj", hdev->cdev_idx);
+ hdev->ts_free_obj_wq = alloc_workqueue(workq_name, WQ_UNBOUND, 0);
if (!hdev->ts_free_obj_wq) {
dev_err(hdev->dev,
"Failed to allocate Timestamp registration free workqueue\n");
@@ -837,15 +866,15 @@ static int device_early_init(struct hl_device *hdev)
goto free_cs_cmplt_wq;
}
- hdev->prefetch_wq = alloc_workqueue("hl-prefetch", WQ_UNBOUND, 0);
+ snprintf(workq_name, 32, "hl%u-prefetch", hdev->cdev_idx);
+ hdev->prefetch_wq = alloc_workqueue(workq_name, WQ_UNBOUND, 0);
if (!hdev->prefetch_wq) {
dev_err(hdev->dev, "Failed to allocate MMU prefetch workqueue\n");
rc = -ENOMEM;
goto free_ts_free_wq;
}
- hdev->hl_chip_info = kzalloc(sizeof(struct hwmon_chip_info),
- GFP_KERNEL);
+ hdev->hl_chip_info = kzalloc(sizeof(struct hwmon_chip_info), GFP_KERNEL);
if (!hdev->hl_chip_info) {
rc = -ENOMEM;
goto free_prefetch_wq;
@@ -857,7 +886,8 @@ static int device_early_init(struct hl_device *hdev)
hl_mem_mgr_init(hdev->dev, &hdev->kernel_mem_mgr);
- hdev->reset_wq = create_singlethread_workqueue("hl_device_reset");
+ snprintf(workq_name, 32, "hl%u_device_reset", hdev->cdev_idx);
+ hdev->reset_wq = create_singlethread_workqueue(workq_name);
if (!hdev->reset_wq) {
rc = -ENOMEM;
dev_err(hdev->dev, "Failed to create device reset WQ\n");
@@ -887,6 +917,7 @@ static int device_early_init(struct hl_device *hdev)
free_cb_mgr:
hl_mem_mgr_fini(&hdev->kernel_mem_mgr);
+ hl_mem_mgr_idr_destroy(&hdev->kernel_mem_mgr);
free_chip_info:
kfree(hdev->hl_chip_info);
free_prefetch_wq:
@@ -930,6 +961,7 @@ static void device_early_fini(struct hl_device *hdev)
mutex_destroy(&hdev->clk_throttling.lock);
hl_mem_mgr_fini(&hdev->kernel_mem_mgr);
+ hl_mem_mgr_idr_destroy(&hdev->kernel_mem_mgr);
kfree(hdev->hl_chip_info);
@@ -953,6 +985,8 @@ static void hl_device_heartbeat(struct work_struct *work)
{
struct hl_device *hdev = container_of(work, struct hl_device,
work_heartbeat.work);
+ struct hl_info_fw_err_info info = {0};
+ u64 event_mask = HL_NOTIFIER_EVENT_DEVICE_RESET | HL_NOTIFIER_EVENT_DEVICE_UNAVAILABLE;
if (!hl_device_operational(hdev, NULL))
goto reschedule;
@@ -963,7 +997,10 @@ static void hl_device_heartbeat(struct work_struct *work)
if (hl_device_operational(hdev, NULL))
dev_err(hdev->dev, "Device heartbeat failed!\n");
- hl_device_reset(hdev, HL_DRV_RESET_HARD | HL_DRV_RESET_HEARTBEAT);
+ info.err_type = HL_INFO_FW_HEARTBEAT_ERR;
+ info.event_mask = &event_mask;
+ hl_handle_fw_err(hdev, &info);
+ hl_device_cond_reset(hdev, HL_DRV_RESET_HARD | HL_DRV_RESET_HEARTBEAT, event_mask);
return;
@@ -1234,7 +1271,6 @@ int hl_device_resume(struct hl_device *hdev)
return 0;
disable_device:
- pci_clear_master(hdev->pdev);
pci_disable_device(hdev->pdev);
return rc;
@@ -1344,6 +1380,34 @@ static void device_disable_open_processes(struct hl_device *hdev, bool control_d
mutex_unlock(fd_lock);
}
+static void send_disable_pci_access(struct hl_device *hdev, u32 flags)
+{
+ /* If reset is due to heartbeat, device CPU is no responsive in
+ * which case no point sending PCI disable message to it.
+ */
+ if ((flags & HL_DRV_RESET_HARD) &&
+ !(flags & (HL_DRV_RESET_HEARTBEAT | HL_DRV_RESET_BYPASS_REQ_TO_FW))) {
+ /* Disable PCI access from device F/W so he won't send
+ * us additional interrupts. We disable MSI/MSI-X at
+ * the halt_engines function and we can't have the F/W
+ * sending us interrupts after that. We need to disable
+ * the access here because if the device is marked
+ * disable, the message won't be send. Also, in case
+ * of heartbeat, the device CPU is marked as disable
+ * so this message won't be sent
+ */
+ if (hl_fw_send_pci_access_msg(hdev, CPUCP_PACKET_DISABLE_PCI_ACCESS, 0x0)) {
+ dev_warn(hdev->dev, "Failed to disable FW's PCI access\n");
+ return;
+ }
+
+ /* verify that last EQs are handled before disabled is set */
+ if (hdev->cpu_queues_enable)
+ synchronize_irq(pci_irq_vector(hdev->pdev,
+ hdev->asic_prop.eq_interrupt_id));
+ }
+}
+
static void handle_reset_trigger(struct hl_device *hdev, u32 flags)
{
u32 cur_reset_trigger = HL_RESET_TRIGGER_DEFAULT;
@@ -1382,28 +1446,6 @@ static void handle_reset_trigger(struct hl_device *hdev, u32 flags)
} else {
hdev->reset_info.reset_trigger_repeated = 1;
}
-
- /* If reset is due to heartbeat, device CPU is no responsive in
- * which case no point sending PCI disable message to it.
- *
- * If F/W is performing the reset, no need to send it a message to disable
- * PCI access
- */
- if ((flags & HL_DRV_RESET_HARD) &&
- !(flags & (HL_DRV_RESET_HEARTBEAT | HL_DRV_RESET_BYPASS_REQ_TO_FW))) {
- /* Disable PCI access from device F/W so he won't send
- * us additional interrupts. We disable MSI/MSI-X at
- * the halt_engines function and we can't have the F/W
- * sending us interrupts after that. We need to disable
- * the access here because if the device is marked
- * disable, the message won't be send. Also, in case
- * of heartbeat, the device CPU is marked as disable
- * so this message won't be sent
- */
- if (hl_fw_send_pci_access_msg(hdev, CPUCP_PACKET_DISABLE_PCI_ACCESS, 0x0))
- dev_warn(hdev->dev,
- "Failed to disable PCI access by F/W\n");
- }
}
/*
@@ -1424,12 +1466,11 @@ static void handle_reset_trigger(struct hl_device *hdev, u32 flags)
*/
int hl_device_reset(struct hl_device *hdev, u32 flags)
{
- bool hard_reset, from_hard_reset_thread, fw_reset, hard_instead_soft = false,
- reset_upon_device_release = false, schedule_hard_reset = false,
- delay_reset, from_dev_release, from_watchdog_thread;
+ bool hard_reset, from_hard_reset_thread, fw_reset, reset_upon_device_release,
+ schedule_hard_reset = false, delay_reset, from_dev_release, from_watchdog_thread;
u64 idle_mask[HL_BUSY_ENGINES_MASK_EXT_SIZE] = {0};
struct hl_ctx *ctx;
- int i, rc;
+ int i, rc, hw_fini_rc;
if (!hdev->init_done) {
dev_err(hdev->dev, "Can't reset before initialization is done\n");
@@ -1442,6 +1483,7 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
from_dev_release = !!(flags & HL_DRV_RESET_DEV_RELEASE);
delay_reset = !!(flags & HL_DRV_RESET_DELAY);
from_watchdog_thread = !!(flags & HL_DRV_RESET_FROM_WD_THR);
+ reset_upon_device_release = hdev->reset_upon_device_release && from_dev_release;
if (!hard_reset && (hl_device_status(hdev) == HL_DEVICE_STATUS_MALFUNCTION)) {
dev_dbg(hdev->dev, "soft-reset isn't supported on a malfunctioning device\n");
@@ -1449,30 +1491,26 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
}
if (!hard_reset && !hdev->asic_prop.supports_compute_reset) {
- hard_instead_soft = true;
+ dev_dbg(hdev->dev, "asic doesn't support compute reset - do hard-reset instead\n");
hard_reset = true;
}
- if (hdev->reset_upon_device_release && from_dev_release) {
+ if (reset_upon_device_release) {
if (hard_reset) {
dev_crit(hdev->dev,
"Aborting reset because hard-reset is mutually exclusive with reset-on-device-release\n");
return -EINVAL;
}
- reset_upon_device_release = true;
-
goto do_reset;
}
if (!hard_reset && !hdev->asic_prop.allow_inference_soft_reset) {
- hard_instead_soft = true;
+ dev_dbg(hdev->dev,
+ "asic doesn't allow inference soft reset - do hard-reset instead\n");
hard_reset = true;
}
- if (hard_instead_soft)
- dev_dbg(hdev->dev, "Doing hard-reset instead of compute reset\n");
-
do_reset:
/* Re-entry of reset thread */
if (from_hard_reset_thread && hdev->process_kill_trial_cnt)
@@ -1480,14 +1518,14 @@ do_reset:
/*
* Prevent concurrency in this function - only one reset should be
- * done at any given time. Only need to perform this if we didn't
- * get from the dedicated hard reset thread
+ * done at any given time. We need to perform this only if we didn't
+ * get here from a dedicated hard reset thread.
*/
if (!from_hard_reset_thread) {
/* Block future CS/VM/JOB completion operations */
spin_lock(&hdev->reset_info.lock);
if (hdev->reset_info.in_reset) {
- /* We only allow scheduling of a hard reset during compute reset */
+ /* We allow scheduling of a hard reset only during a compute reset */
if (hard_reset && hdev->reset_info.in_compute_reset)
hdev->reset_info.hard_reset_schedule_flags = flags;
spin_unlock(&hdev->reset_info.lock);
@@ -1505,15 +1543,17 @@ do_reset:
/* Cancel the device release watchdog work if required.
* In case of reset-upon-device-release while the release watchdog work is
- * scheduled, do hard-reset instead of compute-reset.
+ * scheduled due to a hard-reset, do hard-reset instead of compute-reset.
*/
if ((hard_reset || from_dev_release) && hdev->reset_info.watchdog_active) {
+ struct hl_device_reset_work *watchdog_work =
+ &hdev->device_release_watchdog_work;
+
hdev->reset_info.watchdog_active = 0;
if (!from_watchdog_thread)
- cancel_delayed_work_sync(
- &hdev->device_release_watchdog_work.reset_work);
+ cancel_delayed_work_sync(&watchdog_work->reset_work);
- if (from_dev_release) {
+ if (from_dev_release && (watchdog_work->flags & HL_DRV_RESET_HARD)) {
hdev->reset_info.in_compute_reset = 0;
flags |= HL_DRV_RESET_HARD;
flags &= ~HL_DRV_RESET_DEV_RELEASE;
@@ -1524,7 +1564,9 @@ do_reset:
if (delay_reset)
usleep_range(HL_RESET_DELAY_USEC, HL_RESET_DELAY_USEC << 1);
+escalate_reset_flow:
handle_reset_trigger(hdev, flags);
+ send_disable_pci_access(hdev, flags);
/* This also blocks future CS/VM/JOB completion operations */
hdev->disabled = true;
@@ -1539,7 +1581,6 @@ do_reset:
dev_dbg(hdev->dev, "Going to reset engines of inference device\n");
}
-again:
if ((hard_reset) && (!from_hard_reset_thread)) {
hdev->reset_info.hard_reset_pending = true;
@@ -1592,7 +1633,7 @@ kill_processes:
}
/* Reset the H/W. It will be in idle state after this returns */
- hdev->asic_funcs->hw_fini(hdev, hard_reset, fw_reset);
+ hw_fini_rc = hdev->asic_funcs->hw_fini(hdev, hard_reset, fw_reset);
if (hard_reset) {
hdev->fw_loader.fw_comp_loaded = FW_TYPE_NONE;
@@ -1619,6 +1660,10 @@ kill_processes:
hl_ctx_put(ctx);
}
+ if (hw_fini_rc) {
+ rc = hw_fini_rc;
+ goto out_err;
+ }
/* Finished tear-down, starting to re-initialize */
if (hard_reset) {
@@ -1784,10 +1829,8 @@ kill_processes:
dev_info(hdev->dev, "Performing hard reset scheduled during compute reset\n");
flags = hdev->reset_info.hard_reset_schedule_flags;
hdev->reset_info.hard_reset_schedule_flags = 0;
- hdev->disabled = true;
hard_reset = true;
- handle_reset_trigger(hdev, flags);
- goto again;
+ goto escalate_reset_flow;
}
}
@@ -1804,20 +1847,19 @@ out_err:
"%s Failed to reset! Device is NOT usable\n",
dev_name(&(hdev)->pdev->dev));
hdev->reset_info.hard_reset_cnt++;
- } else if (reset_upon_device_release) {
- spin_unlock(&hdev->reset_info.lock);
- dev_err(hdev->dev, "Failed to reset device after user release\n");
- flags |= HL_DRV_RESET_HARD;
- flags &= ~HL_DRV_RESET_DEV_RELEASE;
- hard_reset = true;
- goto again;
} else {
+ if (reset_upon_device_release) {
+ dev_err(hdev->dev, "Failed to reset device after user release\n");
+ flags &= ~HL_DRV_RESET_DEV_RELEASE;
+ } else {
+ dev_err(hdev->dev, "Failed to do compute reset\n");
+ hdev->reset_info.compute_reset_cnt++;
+ }
+
spin_unlock(&hdev->reset_info.lock);
- dev_err(hdev->dev, "Failed to do compute reset\n");
- hdev->reset_info.compute_reset_cnt++;
flags |= HL_DRV_RESET_HARD;
hard_reset = true;
- goto again;
+ goto escalate_reset_flow;
}
hdev->reset_info.in_reset = 0;
@@ -1840,10 +1882,6 @@ int hl_device_cond_reset(struct hl_device *hdev, u32 flags, u64 event_mask)
{
struct hl_ctx *ctx = NULL;
- /* Device release watchdog is only for hard reset */
- if (!(flags & HL_DRV_RESET_HARD) && hdev->asic_prop.allow_inference_soft_reset)
- goto device_reset;
-
/* F/W reset cannot be postponed */
if (flags & HL_DRV_RESET_BYPASS_REQ_TO_FW)
goto device_reset;
@@ -1871,7 +1909,7 @@ int hl_device_cond_reset(struct hl_device *hdev, u32 flags, u64 event_mask)
goto out;
hdev->device_release_watchdog_work.flags = flags;
- dev_dbg(hdev->dev, "Device is going to be reset in %u sec unless being released\n",
+ dev_dbg(hdev->dev, "Device is going to be hard-reset in %u sec unless being released\n",
hdev->device_release_watchdog_timeout_sec);
schedule_delayed_work(&hdev->device_release_watchdog_work.reset_work,
msecs_to_jiffies(hdev->device_release_watchdog_timeout_sec * 1000));
@@ -1939,37 +1977,27 @@ void hl_notifier_event_send_all(struct hl_device *hdev, u64 event_mask)
mutex_unlock(&hdev->fpriv_ctrl_list_lock);
}
-/*
- * hl_device_init - main initialization function for habanalabs device
- *
- * @hdev: pointer to habanalabs device structure
- *
- * Allocate an id for the device, do early initialization and then call the
- * ASIC specific initialization functions. Finally, create the cdev and the
- * Linux device to expose it to the user
- */
-int hl_device_init(struct hl_device *hdev, struct class *hclass)
+static int create_cdev(struct hl_device *hdev)
{
- int i, rc, cq_cnt, user_interrupt_cnt, cq_ready_cnt;
char *name;
- bool add_cdev_sysfs_on_err = false;
+ int rc;
hdev->cdev_idx = hdev->id / 2;
name = kasprintf(GFP_KERNEL, "hl%d", hdev->cdev_idx);
if (!name) {
rc = -ENOMEM;
- goto out_disabled;
+ goto out_err;
}
/* Initialize cdev and device structures */
- rc = device_init_cdev(hdev, hclass, hdev->id, &hl_ops, name,
+ rc = device_init_cdev(hdev, hdev->hclass, hdev->id, &hl_ops, name,
&hdev->cdev, &hdev->dev);
kfree(name);
if (rc)
- goto out_disabled;
+ goto out_err;
name = kasprintf(GFP_KERNEL, "hl_controlD%d", hdev->cdev_idx);
if (!name) {
@@ -1978,7 +2006,7 @@ int hl_device_init(struct hl_device *hdev, struct class *hclass)
}
/* Initialize cdev and device structures for control device */
- rc = device_init_cdev(hdev, hclass, hdev->id_control, &hl_ctrl_ops,
+ rc = device_init_cdev(hdev, hdev->hclass, hdev->id_control, &hl_ctrl_ops,
name, &hdev->cdev_ctrl, &hdev->dev_ctrl);
kfree(name);
@@ -1986,10 +2014,36 @@ int hl_device_init(struct hl_device *hdev, struct class *hclass)
if (rc)
goto free_dev;
+ return 0;
+
+free_dev:
+ put_device(hdev->dev);
+out_err:
+ return rc;
+}
+
+/*
+ * hl_device_init - main initialization function for habanalabs device
+ *
+ * @hdev: pointer to habanalabs device structure
+ *
+ * Allocate an id for the device, do early initialization and then call the
+ * ASIC specific initialization functions. Finally, create the cdev and the
+ * Linux device to expose it to the user
+ */
+int hl_device_init(struct hl_device *hdev)
+{
+ int i, rc, cq_cnt, user_interrupt_cnt, cq_ready_cnt;
+ bool add_cdev_sysfs_on_err = false;
+
+ rc = create_cdev(hdev);
+ if (rc)
+ goto out_disabled;
+
/* Initialize ASIC function pointers and perform early init */
rc = device_early_init(hdev);
if (rc)
- goto free_dev_ctrl;
+ goto free_dev;
user_interrupt_cnt = hdev->asic_prop.user_dec_intr_count +
hdev->asic_prop.user_interrupt_count;
@@ -2241,9 +2295,8 @@ free_usr_intr_mem:
kfree(hdev->user_interrupt);
early_fini:
device_early_fini(hdev);
-free_dev_ctrl:
- put_device(hdev->dev_ctrl);
free_dev:
+ put_device(hdev->dev_ctrl);
put_device(hdev->dev);
out_disabled:
hdev->disabled = true;
@@ -2364,7 +2417,9 @@ void hl_device_fini(struct hl_device *hdev)
hl_cb_pool_fini(hdev);
/* Reset the H/W. It will be in idle state after this returns */
- hdev->asic_funcs->hw_fini(hdev, true, false);
+ rc = hdev->asic_funcs->hw_fini(hdev, true, false);
+ if (rc)
+ dev_err(hdev->dev, "hw_fini failed in device fini while removing device %d\n", rc);
hdev->fw_loader.fw_comp_loaded = FW_TYPE_NONE;
@@ -2566,3 +2621,49 @@ void hl_handle_page_fault(struct hl_device *hdev, u64 addr, u16 eng_id, bool is_
if (event_mask)
*event_mask |= HL_NOTIFIER_EVENT_PAGE_FAULT;
}
+
+static void hl_capture_hw_err(struct hl_device *hdev, u16 event_id)
+{
+ struct hw_err_info *info = &hdev->captured_err_info.hw_err;
+
+ /* Capture only the first HW err */
+ if (atomic_cmpxchg(&info->event_detected, 0, 1))
+ return;
+
+ info->event.timestamp = ktime_to_ns(ktime_get());
+ info->event.event_id = event_id;
+
+ info->event_info_available = true;
+}
+
+void hl_handle_critical_hw_err(struct hl_device *hdev, u16 event_id, u64 *event_mask)
+{
+ hl_capture_hw_err(hdev, event_id);
+
+ if (event_mask)
+ *event_mask |= HL_NOTIFIER_EVENT_CRITICL_HW_ERR;
+}
+
+static void hl_capture_fw_err(struct hl_device *hdev, struct hl_info_fw_err_info *fw_info)
+{
+ struct fw_err_info *info = &hdev->captured_err_info.fw_err;
+
+ /* Capture only the first FW error */
+ if (atomic_cmpxchg(&info->event_detected, 0, 1))
+ return;
+
+ info->event.timestamp = ktime_to_ns(ktime_get());
+ info->event.err_type = fw_info->err_type;
+ if (fw_info->err_type == HL_INFO_FW_REPORTED_ERR)
+ info->event.event_id = fw_info->event_id;
+
+ info->event_info_available = true;
+}
+
+void hl_handle_fw_err(struct hl_device *hdev, struct hl_info_fw_err_info *info)
+{
+ hl_capture_fw_err(hdev, info);
+
+ if (info->event_mask)
+ *info->event_mask |= HL_NOTIFIER_EVENT_CRITICL_FW_ERR;
+}