Diffstat (limited to 'drivers/accel/habanalabs/gaudi/gaudi.c')
-rw-r--r--  drivers/accel/habanalabs/gaudi/gaudi.c | 151
1 file changed, 52 insertions(+), 99 deletions(-)
diff --git a/drivers/accel/habanalabs/gaudi/gaudi.c b/drivers/accel/habanalabs/gaudi/gaudi.c
index bb858b94e1e8..a29aa8f7b6f3 100644
--- a/drivers/accel/habanalabs/gaudi/gaudi.c
+++ b/drivers/accel/habanalabs/gaudi/gaudi.c
@@ -656,6 +656,7 @@ static int gaudi_set_fixed_properties(struct hl_device *hdev)
prop->cfg_size = CFG_SIZE;
prop->max_asid = MAX_ASID;
prop->num_of_events = GAUDI_EVENT_SIZE;
+ prop->max_num_of_engines = GAUDI_ENGINE_ID_SIZE;
prop->tpc_enabled_mask = TPC_ENABLED_MASK;
set_default_power_values(hdev);
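The new max_num_of_engines bound lets common code size per-engine state without reaching for an ASIC-specific constant. A minimal sketch of a hypothetical consumer (not code from this commit):

    /* Hypothetical consumer: allocate one slot per possible engine ID,
     * using the ASIC-agnostic bound published in the fixed properties.
     */
    static int alloc_engine_counters(struct hl_device *hdev, u64 **out)
    {
            *out = kcalloc(hdev->asic_prop.max_num_of_engines,
                           sizeof(**out), GFP_KERNEL);
            return *out ? 0 : -ENOMEM;
    }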
@@ -679,6 +680,10 @@ static int gaudi_set_fixed_properties(struct hl_device *hdev)
(num_sync_stream_queues * HL_RSVD_MONS);
prop->first_available_user_interrupt = USHRT_MAX;
+ prop->tpc_interrupt_id = USHRT_MAX;
+
+ /* single msi */
+ prop->eq_interrupt_id = 0;
for (i = 0 ; i < HL_MAX_DCORES ; i++)
prop->first_available_cq[i] = USHRT_MAX;
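With a single MSI there is no per-source vector to publish: the TPC interrupt is parked at the USHRT_MAX "not present" sentinel and the event queue reports vector 0. That works because one handler on vector 0 can demultiplex every source. A sketch of the idea, reusing the hl_irq_handler_cq/hl_irq_handler_eq handlers visible elsewhere in this diff (an illustration, not the driver's actual handler):

    /* Sketch: in single-MSI mode one handler polls all sources, so
     * vector 0 serves the completion queues and the event queue alike.
     */
    static irqreturn_t single_msi_demux(int irq, void *arg)
    {
            struct hl_device *hdev = arg;
            int i;

            for (i = 0 ; i < hdev->asic_prop.completion_queues_count ; i++)
                    hl_irq_handler_cq(irq, &hdev->completion_queue[i]);

            hl_irq_handler_eq(irq, &hdev->event_queue);

            return IRQ_HANDLED;
    }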
@@ -867,13 +872,18 @@ pci_init:
rc = hl_fw_read_preboot_status(hdev);
if (rc) {
if (hdev->reset_on_preboot_fail)
+ /* we are already on a failure path, so don't check if hw_fini fails */
hdev->asic_funcs->hw_fini(hdev, true, false);
goto pci_fini;
}
if (gaudi_get_hw_state(hdev) == HL_DEVICE_HW_STATE_DIRTY) {
dev_dbg(hdev->dev, "H/W state is dirty, must reset before initializing\n");
- hdev->asic_funcs->hw_fini(hdev, true, false);
+ rc = hdev->asic_funcs->hw_fini(hdev, true, false);
+ if (rc) {
+ dev_err(hdev->dev, "failed to reset HW in dirty state (%d)\n", rc);
+ goto pci_fini;
+ }
}
return 0;
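The call site above now consumes a return value, which implies the hw_fini callback in the shared ASIC ops table was switched from void to int in the same change (the gaudi_hw_fini prototype change appears later in this diff). The declaration presumably now reads:

    /* Assumed ops-table member matching the call sites in this diff */
    int (*hw_fini)(struct hl_device *hdev, bool hard_reset, bool fw_reset);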
@@ -2010,38 +2020,6 @@ static int gaudi_enable_msi_single(struct hl_device *hdev)
return rc;
}
-static int gaudi_enable_msi_multi(struct hl_device *hdev)
-{
- int cq_cnt = hdev->asic_prop.completion_queues_count;
- int rc, i, irq_cnt_init, irq;
-
- for (i = 0, irq_cnt_init = 0 ; i < cq_cnt ; i++, irq_cnt_init++) {
- irq = gaudi_pci_irq_vector(hdev, i, false);
- rc = request_irq(irq, hl_irq_handler_cq, 0, gaudi_irq_name[i],
- &hdev->completion_queue[i]);
- if (rc) {
- dev_err(hdev->dev, "Failed to request IRQ %d", irq);
- goto free_irqs;
- }
- }
-
- irq = gaudi_pci_irq_vector(hdev, GAUDI_EVENT_QUEUE_MSI_IDX, true);
- rc = request_irq(irq, hl_irq_handler_eq, 0, gaudi_irq_name[cq_cnt],
- &hdev->event_queue);
- if (rc) {
- dev_err(hdev->dev, "Failed to request IRQ %d", irq);
- goto free_irqs;
- }
-
- return 0;
-
-free_irqs:
- for (i = 0 ; i < irq_cnt_init ; i++)
- free_irq(gaudi_pci_irq_vector(hdev, i, false),
- &hdev->completion_queue[i]);
- return rc;
-}
-
static int gaudi_enable_msi(struct hl_device *hdev)
{
struct gaudi_device *gaudi = hdev->asic_specific;
@@ -2056,14 +2034,7 @@ static int gaudi_enable_msi(struct hl_device *hdev)
return rc;
}
- if (rc < NUMBER_OF_INTERRUPTS) {
- gaudi->multi_msi_mode = false;
- rc = gaudi_enable_msi_single(hdev);
- } else {
- gaudi->multi_msi_mode = true;
- rc = gaudi_enable_msi_multi(hdev);
- }
-
+ rc = gaudi_enable_msi_single(hdev);
if (rc)
goto free_pci_irq_vectors;
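gaudi_enable_msi_single() is now the only request path. Its body is outside this diff, but to balance the free_irq(..., hdev) call in the teardown below it must register vector 0 with hdev as the dev_id cookie. A plausible shape (the handler and irq name string are assumptions):

    static int gaudi_enable_msi_single(struct hl_device *hdev)
    {
            int rc, irq;

            /* vector 0 carries both completion and event interrupts */
            irq = gaudi_pci_irq_vector(hdev, 0, false);
            rc = request_irq(irq, single_msi_demux, 0, "gaudi single msi",
                             hdev);
            if (rc)
                    dev_err(hdev->dev, "Failed to request single MSI IRQ\n");

            return rc;
    }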
@@ -2079,47 +2050,23 @@ free_pci_irq_vectors:
static void gaudi_sync_irqs(struct hl_device *hdev)
{
struct gaudi_device *gaudi = hdev->asic_specific;
- int i, cq_cnt = hdev->asic_prop.completion_queues_count;
if (!(gaudi->hw_cap_initialized & HW_CAP_MSI))
return;
/* Wait for all pending IRQs to be finished */
- if (gaudi->multi_msi_mode) {
- for (i = 0 ; i < cq_cnt ; i++)
- synchronize_irq(gaudi_pci_irq_vector(hdev, i, false));
-
- synchronize_irq(gaudi_pci_irq_vector(hdev,
- GAUDI_EVENT_QUEUE_MSI_IDX,
- true));
- } else {
- synchronize_irq(gaudi_pci_irq_vector(hdev, 0, false));
- }
+ synchronize_irq(gaudi_pci_irq_vector(hdev, 0, false));
}
static void gaudi_disable_msi(struct hl_device *hdev)
{
struct gaudi_device *gaudi = hdev->asic_specific;
- int i, irq, cq_cnt = hdev->asic_prop.completion_queues_count;
if (!(gaudi->hw_cap_initialized & HW_CAP_MSI))
return;
gaudi_sync_irqs(hdev);
-
- if (gaudi->multi_msi_mode) {
- irq = gaudi_pci_irq_vector(hdev, GAUDI_EVENT_QUEUE_MSI_IDX,
- true);
- free_irq(irq, &hdev->event_queue);
-
- for (i = 0 ; i < cq_cnt ; i++) {
- irq = gaudi_pci_irq_vector(hdev, i, false);
- free_irq(irq, &hdev->completion_queue[i]);
- }
- } else {
- free_irq(gaudi_pci_irq_vector(hdev, 0, false), hdev);
- }
-
+ free_irq(gaudi_pci_irq_vector(hdev, 0, false), hdev);
pci_free_irq_vectors(hdev->pdev);
gaudi->hw_cap_initialized &= ~HW_CAP_MSI;
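Two invariants hold this teardown together: gaudi_sync_irqs() must run before free_irq() so no handler is still executing when its data goes away, and the dev_id passed to free_irq() must be the same hdev cookie that request_irq() used (the removed multi-MSI path used per-queue cookies instead). The pairing rule in miniature (hypothetical example):

    /* Hypothetical: dev_id must match between request and free, or
     * free_irq() cannot locate the action to remove.
     */
    static int irq_pair_example(struct hl_device *hdev, int irq,
                                irqreturn_t (*handler)(int, void *))
    {
            int rc = request_irq(irq, handler, 0, "example", hdev);

            if (rc)
                    return rc;
            /* ... interrupt in use ... */
            free_irq(irq, hdev);    /* same cookie as request_irq() */
            return 0;
    }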
@@ -3718,7 +3665,7 @@ static int gaudi_mmu_init(struct hl_device *hdev)
if (rc) {
dev_err(hdev->dev,
"failed to set hop0 addr for asid %d\n", i);
- goto err;
+ return rc;
}
}
@@ -3729,7 +3676,9 @@ static int gaudi_mmu_init(struct hl_device *hdev)
/* mem cache invalidation */
WREG32(mmSTLB_MEM_CACHE_INVALIDATION, 1);
- hl_mmu_invalidate_cache(hdev, true, 0);
+ rc = hl_mmu_invalidate_cache(hdev, true, 0);
+ if (rc)
+ return rc;
WREG32(mmMMU_UP_MMU_ENABLE, 1);
WREG32(mmMMU_UP_SPI_MASK, 0xF);
@@ -3745,9 +3694,6 @@ static int gaudi_mmu_init(struct hl_device *hdev)
gaudi->hw_cap_initialized |= HW_CAP_MMU;
return 0;
-
-err:
- return rc;
}
static int gaudi_load_firmware_to_device(struct hl_device *hdev)
@@ -3915,11 +3861,7 @@ static int gaudi_init_cpu_queues(struct hl_device *hdev, u32 cpu_timeout)
WREG32(mmCPU_IF_PF_PQ_PI, 0);
- if (gaudi->multi_msi_mode)
- WREG32(mmCPU_IF_QUEUE_INIT, PQ_INIT_STATUS_READY_FOR_CP);
- else
- WREG32(mmCPU_IF_QUEUE_INIT,
- PQ_INIT_STATUS_READY_FOR_CP_SINGLE_MSI);
+ WREG32(mmCPU_IF_QUEUE_INIT, PQ_INIT_STATUS_READY_FOR_CP_SINGLE_MSI);
irq_handler_offset = prop->gic_interrupts_enable ?
mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR :
@@ -4068,7 +4010,7 @@ disable_queues:
return rc;
}
-static void gaudi_hw_fini(struct hl_device *hdev, bool hard_reset, bool fw_reset)
+static int gaudi_hw_fini(struct hl_device *hdev, bool hard_reset, bool fw_reset)
{
struct cpu_dyn_regs *dyn_regs =
&hdev->fw_loader.dynamic_loader.comm_desc.cpu_dyn_regs;
@@ -4078,7 +4020,7 @@ static void gaudi_hw_fini(struct hl_device *hdev, bool hard_reset, bool fw_reset
if (!hard_reset) {
dev_err(hdev->dev, "GAUDI doesn't support soft-reset\n");
- return;
+ return 0;
}
if (hdev->pldm) {
@@ -4199,10 +4141,10 @@ skip_reset:
msleep(reset_timeout_ms);
status = RREG32(mmPSOC_GLOBAL_CONF_BTM_FSM);
- if (status & PSOC_GLOBAL_CONF_BTM_FSM_STATE_MASK)
- dev_err(hdev->dev,
- "Timeout while waiting for device to reset 0x%x\n",
- status);
+ if (status & PSOC_GLOBAL_CONF_BTM_FSM_STATE_MASK) {
+ dev_err(hdev->dev, "Timeout while waiting for device to reset 0x%x\n", status);
+ return -ETIMEDOUT;
+ }
if (gaudi) {
gaudi->hw_cap_initialized &= ~(HW_CAP_CPU | HW_CAP_CPU_Q | HW_CAP_HBM |
@@ -4215,6 +4157,7 @@ skip_reset:
hdev->device_cpu_is_halted = false;
}
+ return 0;
}
static int gaudi_suspend(struct hl_device *hdev)
@@ -5595,7 +5538,6 @@ static void gaudi_add_end_of_cb_packets(struct hl_device *hdev, void *kernel_add
u32 len, u32 original_len, u64 cq_addr, u32 cq_val,
u32 msi_vec, bool eb)
{
- struct gaudi_device *gaudi = hdev->asic_specific;
struct packet_msg_prot *cq_pkt;
struct packet_nop *cq_padding;
u64 msi_addr;
@@ -5625,12 +5567,7 @@ static void gaudi_add_end_of_cb_packets(struct hl_device *hdev, void *kernel_add
tmp |= FIELD_PREP(GAUDI_PKT_CTL_MB_MASK, 1);
cq_pkt->ctl = cpu_to_le32(tmp);
cq_pkt->value = cpu_to_le32(1);
-
- if (gaudi->multi_msi_mode)
- msi_addr = mmPCIE_MSI_INTR_0 + msi_vec * 4;
- else
- msi_addr = mmPCIE_CORE_MSI_REQ;
-
+ msi_addr = hdev->pdev ? mmPCIE_CORE_MSI_REQ : mmPCIE_MSI_INTR_0 + msi_vec * 4;
cq_pkt->addr = cpu_to_le64(CFG_BASE + msi_addr);
}
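With a single MSI every completion rings one doorbell, mmPCIE_CORE_MSI_REQ; the per-vector mmPCIE_MSI_INTR_0 array survives only for the no-PCI (simulator) case that hdev->pdev distinguishes. The two MSG_PROT packets built here amount to a pair of device-side stores, shown below as plain stores purely for illustration (the queue engine performs them, not the host CPU):

    /* Illustration only: the effect of the packets built above */
    static void msg_prot_effect(u64 cq_addr, u32 cq_val, u64 msi_addr)
    {
            *(volatile u32 *)(uintptr_t)cq_addr = cq_val;          /* CQ entry  */
            *(volatile u32 *)(uintptr_t)(CFG_BASE + msi_addr) = 1; /* raise IRQ */
    }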
@@ -7297,7 +7234,7 @@ static void gaudi_handle_qman_err(struct hl_device *hdev, u16 event_type, u64 *e
}
static void gaudi_print_irq_info(struct hl_device *hdev, u16 event_type,
- bool razwi, u64 *event_mask)
+ bool check_razwi, u64 *event_mask)
{
bool is_read = false, is_write = false;
u16 engine_id[2], num_of_razwi_eng = 0;
@@ -7316,7 +7253,7 @@ static void gaudi_print_irq_info(struct hl_device *hdev, u16 event_type,
dev_err_ratelimited(hdev->dev, "Received H/W interrupt %d [\"%s\"]\n",
event_type, desc);
- if (razwi) {
+ if (check_razwi) {
gaudi_print_and_get_razwi_info(hdev, &engine_id[0], &engine_id[1], &is_read,
&is_write);
gaudi_print_and_get_mmu_error_info(hdev, &razwi_addr, event_mask);
@@ -7333,8 +7270,9 @@ static void gaudi_print_irq_info(struct hl_device *hdev, u16 event_type,
num_of_razwi_eng = 1;
}
- hl_handle_razwi(hdev, razwi_addr, engine_id, num_of_razwi_eng, razwi_flags,
- event_mask);
+ if (razwi_flags)
+ hl_handle_razwi(hdev, razwi_addr, engine_id, num_of_razwi_eng,
+ razwi_flags, event_mask);
}
}
@@ -7633,6 +7571,7 @@ static void gaudi_print_clk_change_info(struct hl_device *hdev, u16 event_type,
static void gaudi_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entry)
{
struct gaudi_device *gaudi = hdev->asic_specific;
+ struct hl_info_fw_err_info fw_err_info;
u64 data = le64_to_cpu(eq_entry->data[0]), event_mask = 0;
u32 ctl = le32_to_cpu(eq_entry->hdr.ctl);
u32 fw_fatal_err_flag = 0, flags = 0;
@@ -7911,7 +7850,10 @@ static void gaudi_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entr
case GAUDI_EVENT_FW_ALIVE_S:
gaudi_print_irq_info(hdev, event_type, false, &event_mask);
gaudi_print_fw_alive_info(hdev, &eq_entry->fw_alive);
- event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR;
+ fw_err_info.err_type = HL_INFO_FW_REPORTED_ERR;
+ fw_err_info.event_id = event_type;
+ fw_err_info.event_mask = &event_mask;
+ hl_handle_fw_err(hdev, &fw_err_info);
goto reset_device;
default:
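The FW_ALIVE path now reports through the common firmware-error mechanism instead of setting HL_NOTIFIER_EVENT_GENERAL_HW_ERR directly. Judging by the assignments above, the descriptor presumably has roughly this shape (layout assumed, not taken from this diff):

    /* Assumed shape of the descriptor filled in the FW_ALIVE case */
    struct hl_info_fw_err_info {
            enum hl_info_fw_err_type err_type;  /* HL_INFO_FW_REPORTED_ERR */
            u16 event_id;                       /* originating event type  */
            u64 *event_mask;                    /* out: notifier bits      */
    };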
@@ -7942,6 +7884,10 @@ reset_device:
}
if (reset_required) {
+ /* escalate general hw errors to a critical/fatal error */
+ if (event_mask & HL_NOTIFIER_EVENT_GENERAL_HW_ERR)
+ hl_handle_critical_hw_err(hdev, event_type, &event_mask);
+
hl_device_cond_reset(hdev, flags, event_mask);
} else {
hl_fw_unmask_irq(hdev, event_type);
@@ -8403,19 +8349,26 @@ static int gaudi_internal_cb_pool_init(struct hl_device *hdev,
}
mutex_lock(&hdev->mmu_lock);
+
rc = hl_mmu_map_contiguous(ctx, hdev->internal_cb_va_base,
hdev->internal_cb_pool_dma_addr,
HOST_SPACE_INTERNAL_CB_SZ);
-
- hl_mmu_invalidate_cache(hdev, false, MMU_OP_USERPTR);
- mutex_unlock(&hdev->mmu_lock);
-
if (rc)
goto unreserve_internal_cb_pool;
+ rc = hl_mmu_invalidate_cache(hdev, false, MMU_OP_USERPTR);
+ if (rc)
+ goto unmap_internal_cb_pool;
+
+ mutex_unlock(&hdev->mmu_lock);
+
return 0;
+unmap_internal_cb_pool:
+ hl_mmu_unmap_contiguous(ctx, hdev->internal_cb_va_base,
+ HOST_SPACE_INTERNAL_CB_SZ);
unreserve_internal_cb_pool:
+ mutex_unlock(&hdev->mmu_lock);
hl_unreserve_va_block(hdev, ctx, hdev->internal_cb_va_base,
HOST_SPACE_INTERNAL_CB_SZ);
destroy_internal_cb_pool:
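The reordered pool-init error path keeps mmu_lock held across both the map and the cache invalidation, releases it exactly once on every exit, and the new unmap_internal_cb_pool label unwinds the mapping that succeeded before the invalidation failed. The lock-then-unwind shape, reduced to a skeleton (all names hypothetical):

    int step_a(void);           /* e.g. hl_mmu_map_contiguous()   */
    int step_b(void);           /* e.g. cache invalidation        */
    void undo_step_a(void);     /* e.g. hl_mmu_unmap_contiguous() */

    static int locked_two_step(struct mutex *lock)
    {
            int rc;

            mutex_lock(lock);

            rc = step_a();
            if (rc)
                    goto out_unlock;

            rc = step_b();
            if (rc)
                    goto undo_a;

            mutex_unlock(lock);
            return 0;

    undo_a:
            undo_step_a();
    out_unlock:
            mutex_unlock(lock);
            return rc;
    }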