From 5b557298d7d09cce04e0565a535fbca63661724a Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 25 Nov 2021 23:27:27 +0200 Subject: misc: at25: Make driver OF independent again The commit f60e7074902a ("misc: at25: Make use of device property API") did a good job of enabling the driver for non-OF platforms, but the recent commit 604288bc6196 ("nvmem: eeprom: at25: fix type compiler warnings") brought the OF dependency back. Restore greatness of the driver once again. Fixes: eab61fb1cc2e ("nvmem: eeprom: at25: fram discovery simplification") Fixes: fd307a4ad332 ("nvmem: prepare basics for FRAM support") Acked-by: Arnd Bergmann Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20211125212729.86585-2-andriy.shevchenko@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/misc/eeprom/at25.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'drivers/misc') diff --git a/drivers/misc/eeprom/at25.c b/drivers/misc/eeprom/at25.c index 632325474233..57599eac2f71 100644 --- a/drivers/misc/eeprom/at25.c +++ b/drivers/misc/eeprom/at25.c @@ -17,8 +17,6 @@ #include #include #include -#include -#include #include /* @@ -381,13 +379,14 @@ static int at25_probe(struct spi_device *spi) int sr; u8 id[FM25_ID_LEN]; u8 sernum[FM25_SN_LEN]; + bool is_fram; int i; - const struct of_device_id *match; - bool is_fram = 0; - match = of_match_device(of_match_ptr(at25_of_match), &spi->dev); - if (match && !strcmp(match->compatible, "cypress,fm25")) - is_fram = 1; + err = device_property_match_string(&spi->dev, "compatible", "cypress,fm25"); + if (err >= 0) + is_fram = true; + else + is_fram = false; /* Chip description */ if (!spi->dev.platform_data) { -- cgit v1.2.3 From a692fc39bf90913f3cea57ee240ea5d6338da235 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 25 Nov 2021 23:27:28 +0200 Subject: misc: at25: Don't copy garbage to the at25->chip in FRAM case Even if we know that we are going to fill everything later on it's bad style and
fragile to copy garbage from the stack to the data structure that will be used in the driver. Fixes: fd307a4ad332 ("nvmem: prepare basics for FRAM support") Acked-by: Arnd Bergmann Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20211125212729.86585-3-andriy.shevchenko@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/misc/eeprom/at25.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'drivers/misc') diff --git a/drivers/misc/eeprom/at25.c b/drivers/misc/eeprom/at25.c index 57599eac2f71..f0b0efc30ee6 100644 --- a/drivers/misc/eeprom/at25.c +++ b/drivers/misc/eeprom/at25.c @@ -390,7 +390,10 @@ static int at25_probe(struct spi_device *spi) /* Chip description */ if (!spi->dev.platform_data) { - if (!is_fram) { + if (is_fram) { + /* We file fields for FRAM case later on */ + memset(&chip, 0, sizeof(chip)); + } else { err = at25_fw_to_chip(&spi->dev, &chip); if (err) return err; -- cgit v1.2.3 From 58589a75bba96f43b62d8069b35be081bc00d7c3 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 25 Nov 2021 23:27:29 +0200 Subject: misc: at25: Check proper value of chip length in FRAM case Obviously the byte_len value should be checked from the chip and not from at25->chip. 
Fixes: fd307a4ad332 ("nvmem: prepare basics for FRAM support") Acked-by: Arnd Bergmann Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20211125212729.86585-4-andriy.shevchenko@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/misc/eeprom/at25.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers/misc') diff --git a/drivers/misc/eeprom/at25.c b/drivers/misc/eeprom/at25.c index f0b0efc30ee6..e21216541b0f 100644 --- a/drivers/misc/eeprom/at25.c +++ b/drivers/misc/eeprom/at25.c @@ -433,9 +433,9 @@ static int at25_probe(struct spi_device *spi) dev_err(&spi->dev, "Error: unsupported size (id %02x)\n", id[7]); return -ENODEV; } - chip.byte_len = int_pow(2, id[7] - 0x21 + 4) * 1024; - if (at25->chip.byte_len > 64 * 1024) + chip.byte_len = int_pow(2, id[7] - 0x21 + 4) * 1024; + if (chip.byte_len > 64 * 1024) at25->chip.flags |= EE_ADDR3; else at25->chip.flags |= EE_ADDR2; -- cgit v1.2.3 From 51902c1212feb9652826fd978e5c58b683f865db Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 25 Nov 2021 23:31:54 +0200 Subject: misc: at25: Use at25->chip instead of local chip everywhere in ->probe() Currently some values are compared against the contents of the chip structure and most are from its updated copy in at25->chip. Use the latter one everywhere in ->probe(). 
Acked-by: Arnd Bergmann Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20211125213203.86693-2-andriy.shevchenko@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/misc/eeprom/at25.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'drivers/misc') diff --git a/drivers/misc/eeprom/at25.c b/drivers/misc/eeprom/at25.c index e21216541b0f..6bea9c7c64a0 100644 --- a/drivers/misc/eeprom/at25.c +++ b/drivers/misc/eeprom/at25.c @@ -434,8 +434,8 @@ static int at25_probe(struct spi_device *spi) return -ENODEV; } - chip.byte_len = int_pow(2, id[7] - 0x21 + 4) * 1024; - if (chip.byte_len > 64 * 1024) + at25->chip.byte_len = int_pow(2, id[7] - 0x21 + 4) * 1024; + if (at25->chip.byte_len > 64 * 1024) at25->chip.flags |= EE_ADDR3; else at25->chip.flags |= EE_ADDR2; @@ -466,7 +466,7 @@ static int at25_probe(struct spi_device *spi) at25->nvmem_config.type = is_fram ? NVMEM_TYPE_FRAM : NVMEM_TYPE_EEPROM; at25->nvmem_config.name = dev_name(&spi->dev); at25->nvmem_config.dev = &spi->dev; - at25->nvmem_config.read_only = chip.flags & EE_READONLY; + at25->nvmem_config.read_only = at25->chip.flags & EE_READONLY; at25->nvmem_config.root_only = true; at25->nvmem_config.owner = THIS_MODULE; at25->nvmem_config.compat = true; @@ -476,17 +476,17 @@ static int at25_probe(struct spi_device *spi) at25->nvmem_config.priv = at25; at25->nvmem_config.stride = 1; at25->nvmem_config.word_size = 1; - at25->nvmem_config.size = chip.byte_len; + at25->nvmem_config.size = at25->chip.byte_len; at25->nvmem = devm_nvmem_register(&spi->dev, &at25->nvmem_config); if (IS_ERR(at25->nvmem)) return PTR_ERR(at25->nvmem); dev_info(&spi->dev, "%d %s %s %s%s, pagesize %u\n", - (chip.byte_len < 1024) ? chip.byte_len : (chip.byte_len / 1024), - (chip.byte_len < 1024) ? "Byte" : "KByte", + (at25->chip.byte_len < 1024) ? at25->chip.byte_len : (at25->chip.byte_len / 1024), + (at25->chip.byte_len < 1024) ? "Byte" : "KByte", at25->chip.name, is_fram ? 
"fram" : "eeprom", - (chip.flags & EE_READONLY) ? " (readonly)" : "", + (at25->chip.flags & EE_READONLY) ? " (readonly)" : "", at25->chip.page_size); return 0; } -- cgit v1.2.3 From c329fe53474ac424cd5eb77c2b6b1fb3fc136d7b Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 25 Nov 2021 23:31:55 +0200 Subject: misc: at25: Unshadow error codes in at25_fw_to_chip() device_property_read_u32() may return different error codes. Unshadow them in the at25_fw_to_chip() to give better error report. Acked-by: Arnd Bergmann Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20211125213203.86693-3-andriy.shevchenko@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/misc/eeprom/at25.c | 40 ++++++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 18 deletions(-) (limited to 'drivers/misc') diff --git a/drivers/misc/eeprom/at25.c b/drivers/misc/eeprom/at25.c index 6bea9c7c64a0..027840c73fc8 100644 --- a/drivers/misc/eeprom/at25.c +++ b/drivers/misc/eeprom/at25.c @@ -304,33 +304,35 @@ static int at25_ee_write(void *priv, unsigned int off, void *val, size_t count) static int at25_fw_to_chip(struct device *dev, struct spi_eeprom *chip) { u32 val; + int err; memset(chip, 0, sizeof(*chip)); strncpy(chip->name, "at25", sizeof(chip->name)); - if (device_property_read_u32(dev, "size", &val) == 0 || - device_property_read_u32(dev, "at25,byte-len", &val) == 0) { - chip->byte_len = val; - } else { + err = device_property_read_u32(dev, "size", &val); + if (err) + err = device_property_read_u32(dev, "at25,byte-len", &val); + if (err) { dev_err(dev, "Error: missing \"size\" property\n"); - return -ENODEV; + return err; } + chip->byte_len = val; - if (device_property_read_u32(dev, "pagesize", &val) == 0 || - device_property_read_u32(dev, "at25,page-size", &val) == 0) { - chip->page_size = val; - } else { + err = device_property_read_u32(dev, "pagesize", &val); + if (err) + err = device_property_read_u32(dev, "at25,page-size", &val); + if (err) { 
dev_err(dev, "Error: missing \"pagesize\" property\n"); - return -ENODEV; + return err; } - - if (device_property_read_u32(dev, "at25,addr-mode", &val) == 0) { - chip->flags = (u16)val; - } else { - if (device_property_read_u32(dev, "address-width", &val)) { - dev_err(dev, - "Error: missing \"address-width\" property\n"); - return -ENODEV; + chip->page_size = val; + + err = device_property_read_u32(dev, "at25,addr-mode", &val); + if (err) { + err = device_property_read_u32(dev, "address-width", &val); + if (err) { + dev_err(dev, "Error: missing \"address-width\" property\n"); + return err; } switch (val) { case 9: @@ -353,6 +355,8 @@ static int at25_fw_to_chip(struct device *dev, struct spi_eeprom *chip) } if (device_property_present(dev, "read-only")) chip->flags |= EE_READONLY; + } else { + chip->flags = (u16)val; } return 0; } -- cgit v1.2.3 From fb422f44778df10d2f37c69fbfeeddd40aedae10 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 25 Nov 2021 23:31:56 +0200 Subject: misc: at25: Check new property ("address-width") first As it's done elsewhere in at25_fw_to_chip() check new property ("address-width") first. 
Acked-by: Arnd Bergmann Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20211125213203.86693-4-andriy.shevchenko@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/misc/eeprom/at25.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'drivers/misc') diff --git a/drivers/misc/eeprom/at25.c b/drivers/misc/eeprom/at25.c index 027840c73fc8..86f5433d0278 100644 --- a/drivers/misc/eeprom/at25.c +++ b/drivers/misc/eeprom/at25.c @@ -327,13 +327,15 @@ static int at25_fw_to_chip(struct device *dev, struct spi_eeprom *chip) } chip->page_size = val; - err = device_property_read_u32(dev, "at25,addr-mode", &val); + err = device_property_read_u32(dev, "address-width", &val); if (err) { - err = device_property_read_u32(dev, "address-width", &val); + err = device_property_read_u32(dev, "at25,addr-mode", &val); if (err) { dev_err(dev, "Error: missing \"address-width\" property\n"); return err; } + chip->flags = (u16)val; + } else { switch (val) { case 9: chip->flags |= EE_INSTR_BIT3_IS_ADDR; @@ -355,8 +357,6 @@ static int at25_fw_to_chip(struct device *dev, struct spi_eeprom *chip) } if (device_property_present(dev, "read-only")) chip->flags |= EE_READONLY; - } else { - chip->flags = (u16)val; } return 0; } -- cgit v1.2.3 From 994233e195aaa53f30ca1722a280c5295f8782ce Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 25 Nov 2021 23:31:57 +0200 Subject: misc: at25: Get platform data via dev_get_platdata() Access platform data via the dev_get_platdata() getter to make the code cleaner.
Acked-by: Arnd Bergmann Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20211125213203.86693-5-andriy.shevchenko@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/misc/eeprom/at25.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'drivers/misc') diff --git a/drivers/misc/eeprom/at25.c b/drivers/misc/eeprom/at25.c index 86f5433d0278..b235f20c56da 100644 --- a/drivers/misc/eeprom/at25.c +++ b/drivers/misc/eeprom/at25.c @@ -378,7 +378,7 @@ MODULE_DEVICE_TABLE(spi, at25_spi_ids); static int at25_probe(struct spi_device *spi) { struct at25_data *at25 = NULL; - struct spi_eeprom chip; + struct spi_eeprom chip, *pdata; int err; int sr; u8 id[FM25_ID_LEN]; @@ -393,7 +393,8 @@ static int at25_probe(struct spi_device *spi) is_fram = false; /* Chip description */ - if (!spi->dev.platform_data) { + pdata = dev_get_platdata(&spi->dev); + if (!pdata) { if (is_fram) { /* We file fields for FRAM case later on */ memset(&chip, 0, sizeof(chip)); @@ -403,7 +404,7 @@ static int at25_probe(struct spi_device *spi) return err; } } else - chip = *(struct spi_eeprom *)spi->dev.platform_data; + chip = *pdata; /* Ping the chip ... the status register is pretty portable, * unlike probing manufacturer IDs. We do expect that system -- cgit v1.2.3 From 01d3c42a08021617ad8ee79b0a9fed91d68e32b6 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 25 Nov 2021 23:31:58 +0200 Subject: misc: at25: Get rid of intermediate storage for AT25 chip data There is no need to copy the same data twice. Drop the needless local variable.
Acked-by: Arnd Bergmann Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20211125213203.86693-6-andriy.shevchenko@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/misc/eeprom/at25.c | 32 +++++++++++++++----------------- 1 file changed, 15 insertions(+), 17 deletions(-) (limited to 'drivers/misc') diff --git a/drivers/misc/eeprom/at25.c b/drivers/misc/eeprom/at25.c index b235f20c56da..70cab386040a 100644 --- a/drivers/misc/eeprom/at25.c +++ b/drivers/misc/eeprom/at25.c @@ -306,7 +306,6 @@ static int at25_fw_to_chip(struct device *dev, struct spi_eeprom *chip) u32 val; int err; - memset(chip, 0, sizeof(*chip)); strncpy(chip->name, "at25", sizeof(chip->name)); err = device_property_read_u32(dev, "size", &val); @@ -378,9 +377,9 @@ MODULE_DEVICE_TABLE(spi, at25_spi_ids); static int at25_probe(struct spi_device *spi) { struct at25_data *at25 = NULL; - struct spi_eeprom chip, *pdata; int err; int sr; + struct spi_eeprom *pdata; u8 id[FM25_ID_LEN]; u8 sernum[FM25_SN_LEN]; bool is_fram; @@ -392,20 +391,6 @@ static int at25_probe(struct spi_device *spi) else is_fram = false; - /* Chip description */ - pdata = dev_get_platdata(&spi->dev); - if (!pdata) { - if (is_fram) { - /* We file fields for FRAM case later on */ - memset(&chip, 0, sizeof(chip)); - } else { - err = at25_fw_to_chip(&spi->dev, &chip); - if (err) - return err; - } - } else - chip = *pdata; - /* Ping the chip ... the status register is pretty portable, * unlike probing manufacturer IDs. We do expect that system * firmware didn't write it in the past few milliseconds! 
@@ -421,10 +406,23 @@ static int at25_probe(struct spi_device *spi) return -ENOMEM; mutex_init(&at25->lock); - at25->chip = chip; at25->spi = spi; spi_set_drvdata(spi, at25); + /* Chip description */ + pdata = dev_get_platdata(&spi->dev); + if (pdata) { + at25->chip = *pdata; + } else { + if (is_fram) { + /* We file fields for FRAM case later on */ + } else { + err = at25_fw_to_chip(&spi->dev, &at25->chip); + if (err) + return err; + } + } + if (is_fram) { /* Get ID of chip */ fm25_aux_read(at25, id, FM25_RDID, FM25_ID_LEN); -- cgit v1.2.3 From d059ed1ba27bf0606471ac407008ddd1f65c4be4 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 25 Nov 2021 23:31:59 +0200 Subject: misc: at25: Switch to use BIT() instead of custom approaches The custom approach of getting a power of 2 with int_pow() is unnecessarily convoluted. Replace it and some other custom approaches with a simple BIT() operation. Acked-by: Arnd Bergmann Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20211125213203.86693-7-andriy.shevchenko@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/misc/eeprom/at25.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'drivers/misc') diff --git a/drivers/misc/eeprom/at25.c b/drivers/misc/eeprom/at25.c index 70cab386040a..c9660a4625ce 100644 --- a/drivers/misc/eeprom/at25.c +++ b/drivers/misc/eeprom/at25.c @@ -6,6 +6,7 @@ * Copyright (C) 2006 David Brownell */ +#include #include #include #include @@ -17,7 +18,6 @@ #include #include #include -#include /* * NOTE: this is an *EEPROM* driver.
The vagaries of product naming @@ -94,7 +94,7 @@ static int at25_ee_read(void *priv, unsigned int offset, instr = AT25_READ; if (at25->chip.flags & EE_INSTR_BIT3_IS_ADDR) - if (offset >= (1U << (at25->addrlen * 8))) + if (offset >= BIT(at25->addrlen * 8)) instr |= AT25_INSTR_BIT3; *cp++ = instr; @@ -227,7 +227,7 @@ static int at25_ee_write(void *priv, unsigned int off, void *val, size_t count) instr = AT25_WRITE; if (at25->chip.flags & EE_INSTR_BIT3_IS_ADDR) - if (offset >= (1U << (at25->addrlen * 8))) + if (offset >= BIT(at25->addrlen * 8)) instr |= AT25_INSTR_BIT3; *cp++ = instr; @@ -437,7 +437,7 @@ static int at25_probe(struct spi_device *spi) return -ENODEV; } - at25->chip.byte_len = int_pow(2, id[7] - 0x21 + 4) * 1024; + at25->chip.byte_len = BIT(id[7] - 0x21 + 4) * 1024; if (at25->chip.byte_len > 64 * 1024) at25->chip.flags |= EE_ADDR3; else -- cgit v1.2.3 From 31a45d27c9328b9c8193f01d7d534659a03cee2d Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 25 Nov 2021 23:32:00 +0200 Subject: misc: at25: Factor out at25_fram_to_chip() In a similar way to how it's done for EEPROM, factor out a new helper function for FRAM.
Acked-by: Arnd Bergmann Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20211125213203.86693-8-andriy.shevchenko@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/misc/eeprom/at25.c | 85 ++++++++++++++++++++++++---------------------- 1 file changed, 44 insertions(+), 41 deletions(-) (limited to 'drivers/misc') diff --git a/drivers/misc/eeprom/at25.c b/drivers/misc/eeprom/at25.c index c9660a4625ce..b9d26c9ee768 100644 --- a/drivers/misc/eeprom/at25.c +++ b/drivers/misc/eeprom/at25.c @@ -31,9 +31,9 @@ #define FM25_SN_LEN 8 /* serial number length */ struct at25_data { + struct spi_eeprom chip; struct spi_device *spi; struct mutex lock; - struct spi_eeprom chip; unsigned addrlen; struct nvmem_config nvmem_config; struct nvmem_device *nvmem; @@ -360,6 +360,44 @@ static int at25_fw_to_chip(struct device *dev, struct spi_eeprom *chip) return 0; } +static int at25_fram_to_chip(struct device *dev, struct spi_eeprom *chip) +{ + struct at25_data *at25 = container_of(chip, struct at25_data, chip); + u8 sernum[FM25_SN_LEN]; + u8 id[FM25_ID_LEN]; + int i; + + strncpy(chip->name, "fm25", sizeof(chip->name)); + + /* Get ID of chip */ + fm25_aux_read(at25, id, FM25_RDID, FM25_ID_LEN); + if (id[6] != 0xc2) { + dev_err(dev, "Error: no Cypress FRAM (id %02x)\n", id[6]); + return -ENODEV; + } + /* Set size found in ID */ + if (id[7] < 0x21 || id[7] > 0x26) { + dev_err(dev, "Error: unsupported size (id %02x)\n", id[7]); + return -ENODEV; + } + + chip->byte_len = BIT(id[7] - 0x21 + 4) * 1024; + if (chip->byte_len > 64 * 1024) + chip->flags |= EE_ADDR3; + else + chip->flags |= EE_ADDR2; + + if (id[8]) { + fm25_aux_read(at25, sernum, FM25_RDSN, FM25_SN_LEN); + /* Swap byte order */ + for (i = 0; i < FM25_SN_LEN; i++) + at25->sernum[i] = sernum[FM25_SN_LEN - 1 - i]; + } + + chip->page_size = PAGE_SIZE; + return 0; +} + static const struct of_device_id at25_of_match[] = { { .compatible = "atmel,at25",}, { .compatible = "cypress,fm25",}, @@ -380,10 +418,7 @@ static 
int at25_probe(struct spi_device *spi) int err; int sr; struct spi_eeprom *pdata; - u8 id[FM25_ID_LEN]; - u8 sernum[FM25_SN_LEN]; bool is_fram; - int i; err = device_property_match_string(&spi->dev, "compatible", "cypress,fm25"); if (err >= 0) @@ -414,44 +449,12 @@ static int at25_probe(struct spi_device *spi) if (pdata) { at25->chip = *pdata; } else { - if (is_fram) { - /* We file fields for FRAM case later on */ - } else { - err = at25_fw_to_chip(&spi->dev, &at25->chip); - if (err) - return err; - } - } - - if (is_fram) { - /* Get ID of chip */ - fm25_aux_read(at25, id, FM25_RDID, FM25_ID_LEN); - if (id[6] != 0xc2) { - dev_err(&spi->dev, - "Error: no Cypress FRAM (id %02x)\n", id[6]); - return -ENODEV; - } - /* set size found in ID */ - if (id[7] < 0x21 || id[7] > 0x26) { - dev_err(&spi->dev, "Error: unsupported size (id %02x)\n", id[7]); - return -ENODEV; - } - - at25->chip.byte_len = BIT(id[7] - 0x21 + 4) * 1024; - if (at25->chip.byte_len > 64 * 1024) - at25->chip.flags |= EE_ADDR3; + if (is_fram) + err = at25_fram_to_chip(&spi->dev, &at25->chip); else - at25->chip.flags |= EE_ADDR2; - - if (id[8]) { - fm25_aux_read(at25, sernum, FM25_RDSN, FM25_SN_LEN); - /* swap byte order */ - for (i = 0; i < FM25_SN_LEN; i++) - at25->sernum[i] = sernum[FM25_SN_LEN - 1 - i]; - } - - at25->chip.page_size = PAGE_SIZE; - strncpy(at25->chip.name, "fm25", sizeof(at25->chip.name)); + err = at25_fw_to_chip(&spi->dev, &at25->chip); + if (err) + return err; } /* For now we only support 8/16/24 bit addressing */ -- cgit v1.2.3 From d5fb1304acfd9b8077485c9fb1bf94c8218fd899 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 25 Nov 2021 23:32:01 +0200 Subject: misc: at25: Reorganize headers for better maintenance Split headers to three groups and sort alphabetically in each of them. 
Acked-by: Arnd Bergmann Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20211125213203.86693-9-andriy.shevchenko@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/misc/eeprom/at25.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'drivers/misc') diff --git a/drivers/misc/eeprom/at25.c b/drivers/misc/eeprom/at25.c index b9d26c9ee768..3e60124d14a3 100644 --- a/drivers/misc/eeprom/at25.c +++ b/drivers/misc/eeprom/at25.c @@ -7,17 +7,18 @@ */ #include -#include -#include -#include #include #include +#include +#include +#include #include +#include -#include -#include #include -#include +#include + +#include /* * NOTE: this is an *EEPROM* driver. The vagaries of product naming -- cgit v1.2.3 From d6471ab9ab5814489ed2ebd8c554232b59ac571b Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 25 Nov 2021 23:32:02 +0200 Subject: misc: at25: Replace commas by spaces in the ID tables For better readability replace commas by spaces in the ID tables. 
Acked-by: Arnd Bergmann Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20211125213203.86693-10-andriy.shevchenko@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/misc/eeprom/at25.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'drivers/misc') diff --git a/drivers/misc/eeprom/at25.c b/drivers/misc/eeprom/at25.c index 3e60124d14a3..9264bb17963e 100644 --- a/drivers/misc/eeprom/at25.c +++ b/drivers/misc/eeprom/at25.c @@ -400,15 +400,15 @@ static int at25_fram_to_chip(struct device *dev, struct spi_eeprom *chip) } static const struct of_device_id at25_of_match[] = { - { .compatible = "atmel,at25",}, - { .compatible = "cypress,fm25",}, + { .compatible = "atmel,at25" }, + { .compatible = "cypress,fm25" }, { } }; MODULE_DEVICE_TABLE(of, at25_of_match); static const struct spi_device_id at25_spi_ids[] = { - { .name = "at25",}, - { .name = "fm25",}, + { .name = "at25" }, + { .name = "fm25" }, { } }; MODULE_DEVICE_TABLE(spi, at25_spi_ids); -- cgit v1.2.3 From 1ca54ce9a3ff157b93402a7fea52595d029daa8d Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 25 Nov 2021 23:32:03 +0200 Subject: misc: at25: Align comment style Make multi-line comment style aligned. While at it, drop filename from the file. 
Acked-by: Arnd Bergmann Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20211125213203.86693-11-andriy.shevchenko@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/misc/eeprom/at25.c | 37 ++++++++++++++++++++----------------- 1 file changed, 20 insertions(+), 17 deletions(-) (limited to 'drivers/misc') diff --git a/drivers/misc/eeprom/at25.c b/drivers/misc/eeprom/at25.c index 9264bb17963e..f16f67baf3d2 100644 --- a/drivers/misc/eeprom/at25.c +++ b/drivers/misc/eeprom/at25.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-or-later /* - * at25.c -- support most SPI EEPROMs, such as Atmel AT25 models - * and Cypress FRAMs FM25 models + * Driver for most of the SPI EEPROMs, such as Atmel AT25 models + * and Cypress FRAMs FM25 models. * * Copyright (C) 2006 David Brownell */ @@ -21,7 +21,7 @@ #include /* - * NOTE: this is an *EEPROM* driver. The vagaries of product naming + * NOTE: this is an *EEPROM* driver. The vagaries of product naming * mean that some AT25 products are EEPROMs, and others are FLASH. * Handle FLASH chips with the drivers/mtd/devices/m25p80.c driver, * not this one! @@ -57,13 +57,14 @@ struct at25_data { #define AT25_SR_BP1 0x08 #define AT25_SR_WPEN 0x80 /* writeprotect enable */ -#define AT25_INSTR_BIT3 0x08 /* Additional address bit in instr */ +#define AT25_INSTR_BIT3 0x08 /* additional address bit in instr */ #define FM25_ID_LEN 9 /* ID length */ #define EE_MAXADDRLEN 3 /* 24 bit addresses, up to 2 MBytes */ -/* Specs often allow 5 msec for a page write, sometimes 20 msec; +/* + * Specs often allow 5ms for a page write, sometimes 20ms; * it's important to recover from write timeouts. 
*/ #define EE_TIMEOUT 25 @@ -108,7 +109,7 @@ static int at25_ee_read(void *priv, unsigned int offset, *cp++ = offset >> 8; fallthrough; case 1: - case 0: /* can't happen: for better codegen */ + case 0: /* can't happen: for better code generation */ *cp++ = offset >> 0; } @@ -125,11 +126,12 @@ static int at25_ee_read(void *priv, unsigned int offset, mutex_lock(&at25->lock); - /* Read it all at once. + /* + * Read it all at once. * * REVISIT that's potentially a problem with large chips, if * other devices on the bus need to be accessed regularly or - * this chip is clocked very slowly + * this chip is clocked very slowly. */ status = spi_sync(at25->spi, &m); dev_dbg(&at25->spi->dev, "read %zu bytes at %d --> %zd\n", @@ -139,9 +141,7 @@ static int at25_ee_read(void *priv, unsigned int offset, return status; } -/* - * read extra registers as ID or serial number - */ +/* Read extra registers as ID or serial number */ static int fm25_aux_read(struct at25_data *at25, u8 *buf, uint8_t command, int len) { @@ -207,7 +207,8 @@ static int at25_ee_write(void *priv, unsigned int off, void *val, size_t count) if (!bounce) return -ENOMEM; - /* For write, rollover is within the page ... so we write at + /* + * For write, rollover is within the page ... so we write at * most one page, then manually roll over to the next page. */ mutex_lock(&at25->lock); @@ -241,7 +242,7 @@ static int at25_ee_write(void *priv, unsigned int off, void *val, size_t count) *cp++ = offset >> 8; fallthrough; case 1: - case 0: /* can't happen: for better codegen */ + case 0: /* can't happen: for better code generation */ *cp++ = offset >> 0; } @@ -257,8 +258,9 @@ static int at25_ee_write(void *priv, unsigned int off, void *val, size_t count) if (status < 0) break; - /* REVISIT this should detect (or prevent) failed writes - * to readonly sections of the EEPROM... + /* + * REVISIT this should detect (or prevent) failed writes + * to read-only sections of the EEPROM... 
*/ /* Wait for non-busy status */ @@ -427,8 +429,9 @@ static int at25_probe(struct spi_device *spi) else is_fram = false; - /* Ping the chip ... the status register is pretty portable, - * unlike probing manufacturer IDs. We do expect that system + /* + * Ping the chip ... the status register is pretty portable, + * unlike probing manufacturer IDs. We do expect that system * firmware didn't write it in the past few milliseconds! */ sr = spi_w8r8(spi, AT25_RDSR); -- cgit v1.2.3 From d325537b88f504bcfdcc61055ad36ff0cb6d7d0b Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Fri, 12 Nov 2021 11:06:33 +0100 Subject: mei: Remove some dead code 'generated' is known to be true here, so "true || whatever" will still be true. So, remove some dead code. Acked-by: Tomas Winkler Signed-off-by: Christophe JAILLET Link: https://lore.kernel.org/r/3f904c291f3eed06223dd8d494028e0d49df6f10.1636711522.git.christophe.jaillet@wanadoo.fr Signed-off-by: Greg Kroah-Hartman --- drivers/misc/mei/hw-txe.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'drivers/misc') diff --git a/drivers/misc/mei/hw-txe.c b/drivers/misc/mei/hw-txe.c index a4e854b9b9e6..00652c137cc7 100644 --- a/drivers/misc/mei/hw-txe.c +++ b/drivers/misc/mei/hw-txe.c @@ -994,11 +994,7 @@ static bool mei_txe_check_and_ack_intrs(struct mei_device *dev, bool do_ack) hhisr &= ~IPC_HHIER_SEC; } - generated = generated || - (hisr & HISR_INT_STS_MSK) || - (ipc_isr & SEC_IPC_HOST_INT_STATUS_PENDING); - - if (generated && do_ack) { + if (do_ack) { /* Save the interrupt causes */ hw->intr_cause |= hisr & HISR_INT_STS_MSK; if (ipc_isr & SEC_IPC_HOST_INT_STATUS_IN_RDY) -- cgit v1.2.3 From 2925fc1c102943a2496e13ef78d68acd5fd0dc99 Mon Sep 17 00:00:00 2001 From: Mikko Perttunen Date: Wed, 8 Dec 2021 15:05:41 +0100 Subject: misc: sram: Add compatible string for Tegra234 SYSRAM We want to use the same behavior as on Tegra186 and Tegra194, so add the compatible string for Tegra234 SYSRAM to the list.
Signed-off-by: Mikko Perttunen Signed-off-by: Thierry Reding Link: https://lore.kernel.org/r/20211208140541.520238-1-thierry.reding@gmail.com Signed-off-by: Greg Kroah-Hartman --- drivers/misc/sram.c | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers/misc') diff --git a/drivers/misc/sram.c b/drivers/misc/sram.c index 4c26b19f5154..f0e7f02605eb 100644 --- a/drivers/misc/sram.c +++ b/drivers/misc/sram.c @@ -371,6 +371,7 @@ static const struct of_device_id sram_dt_ids[] = { { .compatible = "atmel,sama5d2-securam", .data = &atmel_securam_config }, { .compatible = "nvidia,tegra186-sysram", .data = &tegra_sysram_config }, { .compatible = "nvidia,tegra194-sysram", .data = &tegra_sysram_config }, + { .compatible = "nvidia,tegra234-sysram", .data = &tegra_sysram_config }, {} }; -- cgit v1.2.3 From 861dc0d7fd972f2064ff48b211955717163a11e0 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Sun, 12 Sep 2021 15:27:52 -0700 Subject: lkdtm: Note that lkdtm_kernel_info should be removed in the future As per Linus's request, remove lkdtm_kernel_info once sufficient reporting exists in CI systems: https://lore.kernel.org/lkml/CAHk-=wiFvfkoFixTapvvyPMN9pq5G-+Dys2eSyBa1vzDGAO5+A@mail.gmail.com Cc: Arnd Bergmann Cc: Greg Kroah-Hartman Signed-off-by: Kees Cook --- drivers/misc/lkdtm/core.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'drivers/misc') diff --git a/drivers/misc/lkdtm/core.c b/drivers/misc/lkdtm/core.c index 609d9ee2acc0..d4c6cdced37b 100644 --- a/drivers/misc/lkdtm/core.c +++ b/drivers/misc/lkdtm/core.c @@ -212,7 +212,11 @@ module_param(cpoint_count, int, 0644); MODULE_PARM_DESC(cpoint_count, " Crash Point Count, number of times the "\ "crash point is to be hit to trigger action"); -/* For test debug reporting. */ +/* + * For test debug reporting when CI systems provide terse summaries. 
+ * TODO: Remove this once reasonable reporting exists in most CI systems: + * https://lore.kernel.org/lkml/CAHk-=wiFvfkoFixTapvvyPMN9pq5G-+Dys2eSyBa1vzDGAO5+A@mail.gmail.com + */ char *lkdtm_kernel_info; /* Return the crashtype number or NULL if the name is invalid */ -- cgit v1.2.3 From 026c6fa1a525ca3f8a615052e45d766208989597 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 7 Oct 2021 10:12:35 +0200 Subject: lkdtm: avoid printk() in recursive_loop() The recursive_loop() function is intended as a diagnostic to ensure that exhausting the stack is caught and mitigated. Currently, it uses pr_info() to ensure that the function has side effects that the compiler cannot simply optimize away, so that the stack footprint does not get reduced inadvertently. The typical mitigation for stack overflow is to kill the task, and this overflow may occur inside the call to pr_info(), which means it could be holding the console lock when this happens. This means that the console lock is never going to be released again, preventing the diagnostic prints related to the stack overflow handling from being visible on the console. So let's replace the call to pr_info() with a call to memzero_explicit(), which is not a 'magic' function name like memset() or memcpy(), which the compiler may replace with plain loads and stores. To ensure that the stack frames are nested rather than tail-called, put the call to memzero_explicit() after the recursive call. 
Signed-off-by: Ard Biesheuvel Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20211007081235.382697-1-ardb@kernel.org --- drivers/misc/lkdtm/bugs.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'drivers/misc') diff --git a/drivers/misc/lkdtm/bugs.c b/drivers/misc/lkdtm/bugs.c index f4cb94a9aa9c..f21854ac5cc2 100644 --- a/drivers/misc/lkdtm/bugs.c +++ b/drivers/misc/lkdtm/bugs.c @@ -41,20 +41,22 @@ static DEFINE_SPINLOCK(lock_me_up); * Make sure compiler does not optimize this function or stack frame away: * - function marked noinline * - stack variables are marked volatile - * - stack variables are written (memset()) and read (pr_info()) - * - function has external effects (pr_info()) - * */ + * - stack variables are written (memset()) and read (buf[..] passed as arg) + * - function may have external effects (memzero_explicit()) + * - no tail recursion possible + */ static int noinline recursive_loop(int remaining) { volatile char buf[REC_STACK_SIZE]; + volatile int ret; memset((void *)buf, remaining & 0xFF, sizeof(buf)); - pr_info("loop %d/%d ...\n", (int)buf[remaining % sizeof(buf)], - recur_count); if (!remaining) - return 0; + ret = 0; else - return recursive_loop(remaining - 1); + ret = recursive_loop((int)buf[remaining % sizeof(buf)] - 1); + memzero_explicit((void *)buf, sizeof(buf)); + return ret; } /* If the depth is negative, use the default, otherwise keep parameter. */ -- cgit v1.2.3 From bc93a22a19eb2b68a16ecf04cdf4b2ed65aaf398 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 8 Oct 2021 18:58:40 +0200 Subject: lkdtm: Fix content of section containing lkdtm_rodata_do_nothing() On a kernel without CONFIG_STRICT_KERNEL_RWX, running EXEC_RODATA test leads to "Illegal instruction" failure. 
Looking at the content of rodata_objcopy.o, we see that the function content is zeroes only: Disassembly of section .rodata: 0000000000000000 <.lkdtm_rodata_do_nothing>: 0: 00 00 00 00 .long 0x0 Add the contents flag in order to keep the content of the section while renaming it. Disassembly of section .rodata: 0000000000000000 <.lkdtm_rodata_do_nothing>: 0: 4e 80 00 20 blr Fixes: e9e08a07385e ("lkdtm: support llvm-objcopy") Cc: stable@vger.kernel.org Cc: Kees Cook Cc: Arnd Bergmann Cc: Greg Kroah-Hartman Cc: Nick Desaulniers Cc: Nathan Chancellor Signed-off-by: Christophe Leroy Reviewed-by: Nick Desaulniers Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/8900731fbc05fb8b0de18af7133a8fc07c3c53a1.1633712176.git.christophe.leroy@csgroup.eu --- drivers/misc/lkdtm/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/misc') diff --git a/drivers/misc/lkdtm/Makefile b/drivers/misc/lkdtm/Makefile index aa12097668d3..e2984ce51fe4 100644 --- a/drivers/misc/lkdtm/Makefile +++ b/drivers/misc/lkdtm/Makefile @@ -20,7 +20,7 @@ CFLAGS_REMOVE_rodata.o += $(CC_FLAGS_LTO) OBJCOPYFLAGS := OBJCOPYFLAGS_rodata_objcopy.o := \ - --rename-section .noinstr.text=.rodata,alloc,readonly,load + --rename-section .noinstr.text=.rodata,alloc,readonly,load,contents targets += rodata.o rodata_objcopy.o $(obj)/rodata_objcopy.o: $(obj)/rodata.o FORCE $(call if_changed,objcopy) -- cgit v1.2.3 From 2d2802fb24de8cbacb4a2d6da2e002acc1c17143 Mon Sep 17 00:00:00 2001 From: Kai Ye Date: Mon, 6 Dec 2021 18:47:24 +0800 Subject: uacce: use sysfs_emit instead of sprintf Use the sysfs_emit to replace sprintf. sprintf may cause output defect in sysfs content; it is better to use the newly added sysfs_emit function which knows the size of the temporary buffer.
Signed-off-by: Kai Ye Link: https://lore.kernel.org/r/20211206104724.11559-1-yekai13@huawei.com Signed-off-by: Greg Kroah-Hartman --- drivers/misc/uacce/uacce.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'drivers/misc') diff --git a/drivers/misc/uacce/uacce.c b/drivers/misc/uacce/uacce.c index 488eeb2811ae..281c54003edc 100644 --- a/drivers/misc/uacce/uacce.c +++ b/drivers/misc/uacce/uacce.c @@ -289,7 +289,7 @@ static ssize_t api_show(struct device *dev, { struct uacce_device *uacce = to_uacce_device(dev); - return sprintf(buf, "%s\n", uacce->api_ver); + return sysfs_emit(buf, "%s\n", uacce->api_ver); } static ssize_t flags_show(struct device *dev, @@ -297,7 +297,7 @@ static ssize_t flags_show(struct device *dev, { struct uacce_device *uacce = to_uacce_device(dev); - return sprintf(buf, "%u\n", uacce->flags); + return sysfs_emit(buf, "%u\n", uacce->flags); } static ssize_t available_instances_show(struct device *dev, @@ -309,7 +309,7 @@ static ssize_t available_instances_show(struct device *dev, if (!uacce->ops->get_available_instances) return -ENODEV; - return sprintf(buf, "%d\n", + return sysfs_emit(buf, "%d\n", uacce->ops->get_available_instances(uacce)); } @@ -318,7 +318,7 @@ static ssize_t algorithms_show(struct device *dev, { struct uacce_device *uacce = to_uacce_device(dev); - return sprintf(buf, "%s\n", uacce->algs); + return sysfs_emit(buf, "%s\n", uacce->algs); } static ssize_t region_mmio_size_show(struct device *dev, @@ -326,7 +326,7 @@ static ssize_t region_mmio_size_show(struct device *dev, { struct uacce_device *uacce = to_uacce_device(dev); - return sprintf(buf, "%lu\n", + return sysfs_emit(buf, "%lu\n", uacce->qf_pg_num[UACCE_QFRT_MMIO] << PAGE_SHIFT); } @@ -335,7 +335,7 @@ static ssize_t region_dus_size_show(struct device *dev, { struct uacce_device *uacce = to_uacce_device(dev); - return sprintf(buf, "%lu\n", + return sysfs_emit(buf, "%lu\n", uacce->qf_pg_num[UACCE_QFRT_DUS] << PAGE_SHIFT); } -- cgit v1.2.3 From 
6da3f33770e08348691d90455ef6149e15551854 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Wed, 15 Dec 2021 12:18:42 +0100 Subject: misc: vmw_vmci: Switch to kvfree_rcu() API Instead of invoking a synchronize_rcu() to free a pointer after a grace period we can directly make use of the new API that does the same but in a more efficient way. Signed-off-by: Uladzislau Rezki (Sony) Link: https://lore.kernel.org/r/20211215111845.2514-6-urezki@gmail.com Signed-off-by: Greg Kroah-Hartman --- drivers/misc/vmw_vmci/vmci_context.c | 6 ++---- drivers/misc/vmw_vmci/vmci_event.c | 3 +-- 2 files changed, 3 insertions(+), 6 deletions(-) (limited to 'drivers/misc') diff --git a/drivers/misc/vmw_vmci/vmci_context.c b/drivers/misc/vmw_vmci/vmci_context.c index c0b5e339d5a1..6cf3e21c7604 100644 --- a/drivers/misc/vmw_vmci/vmci_context.c +++ b/drivers/misc/vmw_vmci/vmci_context.c @@ -687,10 +687,8 @@ int vmci_ctx_remove_notification(u32 context_id, u32 remote_cid) } spin_unlock(&context->lock); - if (found) { - synchronize_rcu(); - kfree(notifier); - } + if (found) + kvfree_rcu(notifier); vmci_ctx_put(context); diff --git a/drivers/misc/vmw_vmci/vmci_event.c b/drivers/misc/vmw_vmci/vmci_event.c index e3436abf39f4..2100297c94ad 100644 --- a/drivers/misc/vmw_vmci/vmci_event.c +++ b/drivers/misc/vmw_vmci/vmci_event.c @@ -209,8 +209,7 @@ int vmci_event_unsubscribe(u32 sub_id) if (!s) return VMCI_ERROR_NOT_FOUND; - synchronize_rcu(); - kfree(s); + kvfree_rcu(s); return VMCI_SUCCESS; } -- cgit v1.2.3 From 81e7b7f5dfbdadab1ac9e0c60b0e30633bab1183 Mon Sep 17 00:00:00 2001 From: Minghao Chi Date: Wed, 15 Dec 2021 06:04:38 +0000 Subject: drivers/misc/ocxl: remove redundant rc variable Return value from ocxl_context_attach() directly instead of taking this in another redundant variable.
Reported-by: Zeal Robot Acked-by: Andrew Donnellan Signed-off-by: Minghao Chi Link: https://lore.kernel.org/r/20211215060438.441918-1-chi.minghao@zte.com.cn Signed-off-by: Greg Kroah-Hartman --- drivers/misc/ocxl/file.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'drivers/misc') diff --git a/drivers/misc/ocxl/file.c b/drivers/misc/ocxl/file.c index e70525eedaae..d881f5e40ad9 100644 --- a/drivers/misc/ocxl/file.c +++ b/drivers/misc/ocxl/file.c @@ -74,7 +74,6 @@ static long afu_ioctl_attach(struct ocxl_context *ctx, { struct ocxl_ioctl_attach arg; u64 amr = 0; - int rc; pr_debug("%s for context %d\n", __func__, ctx->pasid); @@ -86,8 +85,7 @@ static long afu_ioctl_attach(struct ocxl_context *ctx, return -EINVAL; amr = arg.amr & mfspr(SPRN_UAMOR); - rc = ocxl_context_attach(ctx, amr, current->mm); - return rc; + return ocxl_context_attach(ctx, amr, current->mm); } static long afu_ioctl_get_metadata(struct ocxl_context *ctx, -- cgit v1.2.3 From c9d1383c75c95be55d9207e8a8d5c7c1659a029e Mon Sep 17 00:00:00 2001 From: Ohad Sharabi Date: Sun, 17 Oct 2021 08:40:28 +0300 Subject: habanalabs: modify wait for boot fit in dynamic FW load In the dynamic FW load protocol the boot status is updated to "Ready to Boot" once uboot is active. Polling on other boot status values is a residue of code duplication from the static protocol and should be removed. 
Signed-off-by: Ohad Sharabi Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/common/firmware_if.c | 1 - 1 file changed, 1 deletion(-) (limited to 'drivers/misc') diff --git a/drivers/misc/habanalabs/common/firmware_if.c b/drivers/misc/habanalabs/common/firmware_if.c index 4e68fb9d2a6b..025707a21882 100644 --- a/drivers/misc/habanalabs/common/firmware_if.c +++ b/drivers/misc/habanalabs/common/firmware_if.c @@ -2060,7 +2060,6 @@ static int hl_fw_dynamic_wait_for_boot_fit_active(struct hl_device *hdev, hdev, le32_to_cpu(dyn_loader->comm_desc.cpu_dyn_regs.cpu_boot_status), status, - (status == CPU_BOOT_STATUS_NIC_FW_RDY) || (status == CPU_BOOT_STATUS_READY_TO_BOOT), FW_CPU_STATUS_POLL_INTERVAL_USEC, dyn_loader->wait_for_bl_timeout); -- cgit v1.2.3 From 4cd454a205069965463515e2068190f56b0e4206 Mon Sep 17 00:00:00 2001 From: Oded Gabbay Date: Thu, 21 Oct 2021 14:02:40 +0300 Subject: habanalabs/gaudi: recover from CPU WD event There are rare cases where the device CPU's watchdog has expired and as a result, the watchdog reset has happened and the CPU will now move to running its preboot f/w. When that happens, the driver will only know that a heartbeat failure occurred. As a result, the driver will send a message to the CPU's main f/w asking it to reset the device, but because the CPU is now running preboot, it won't respond and the re-initialization process will later fail when trying to load the f/w. The solution is to send the request to the preboot as well, only if the reset was caused because of HB failure. 
Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/gaudi/gaudi.c | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) (limited to 'drivers/misc') diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c index 825737dfe381..d2b7ecb45497 100644 --- a/drivers/misc/habanalabs/gaudi/gaudi.c +++ b/drivers/misc/habanalabs/gaudi/gaudi.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Copyright 2016-2020 HabanaLabs, Ltd. + * Copyright 2016-2021 HabanaLabs, Ltd. * All Rights Reserved. */ @@ -4296,6 +4296,24 @@ static void gaudi_hw_fini(struct hl_device *hdev, bool hard_reset, bool fw_reset WREG32(irq_handler_offset, gaudi_irq_map_table[GAUDI_EVENT_HALT_MACHINE].cpu_id); + + /* This is a hail-mary attempt to revive the card in the small chance that the + * f/w has experienced a watchdog event, which caused it to return back to preboot. + * In that case, triggering reset through GIC won't help. We need to trigger the + * reset as if Linux wasn't loaded. + * + * We do it only if the reset cause was HB, because that would be the indication + * of such an event. + * + * In case watchdog hasn't expired but we still got HB, then this won't do any + * damage. + */ + if (hdev->curr_reset_cause == HL_RESET_CAUSE_HEARTBEAT) { + if (hdev->asic_prop.hard_reset_done_by_fw) + hl_fw_ask_hard_reset_without_linux(hdev); + else + hl_fw_ask_halt_machine_without_linux(hdev); + } } else { if (hdev->asic_prop.hard_reset_done_by_fw) hl_fw_ask_hard_reset_without_linux(hdev); -- cgit v1.2.3 From ba3aca31f91ceef072970c1688bff40afc2ea275 Mon Sep 17 00:00:00 2001 From: Yuri Nudelman Date: Thu, 14 Oct 2021 12:10:31 +0300 Subject: habanalabs: print va_range in vm node debugfs VA range info could assist in debugging VA allocation bugs. 
Signed-off-by: Yuri Nudelman Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/common/debugfs.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'drivers/misc') diff --git a/drivers/misc/habanalabs/common/debugfs.c b/drivers/misc/habanalabs/common/debugfs.c index 1f2a3dc6c4e2..a239c5679f95 100644 --- a/drivers/misc/habanalabs/common/debugfs.c +++ b/drivers/misc/habanalabs/common/debugfs.c @@ -235,6 +235,8 @@ static int vm_show(struct seq_file *s, void *data) struct hl_vm_hash_node *hnode; struct hl_userptr *userptr; struct hl_vm_phys_pg_pack *phys_pg_pack = NULL; + struct hl_va_range *va_range; + struct hl_vm_va_block *va_block; enum vm_type *vm_type; bool once = true; u64 j; @@ -314,6 +316,29 @@ static int vm_show(struct seq_file *s, void *data) spin_unlock(&dev_entry->ctx_mem_hash_spinlock); + mutex_lock(&dev_entry->hdev->fpriv_list_lock); + ctx = dev_entry->hdev->compute_ctx; + if (ctx) + hl_ctx_get(dev_entry->hdev, ctx); + mutex_unlock(&dev_entry->hdev->fpriv_list_lock); + if (ctx) { + seq_puts(s, "\nVA ranges:\n\n"); + for (i = HL_VA_RANGE_TYPE_HOST ; i < HL_VA_RANGE_TYPE_MAX ; ++i) { + va_range = ctx->va_range[i]; + seq_printf(s, " va_range %d\n", i); + seq_puts(s, "---------------------\n"); + mutex_lock(&va_range->lock); + list_for_each_entry(va_block, &va_range->list, node) { + seq_printf(s, "%#16llx - %#16llx (%#llx)\n", + va_block->start, va_block->end, + va_block->size); + } + mutex_unlock(&va_range->lock); + seq_puts(s, "\n"); + } + hl_ctx_put(ctx); + } + if (!once) seq_puts(s, "\n"); -- cgit v1.2.3 From bfd5110682ca75cece49fe0e3f5ef478ec43c9ae Mon Sep 17 00:00:00 2001 From: Ohad Sharabi Date: Sun, 17 Oct 2021 09:00:43 +0300 Subject: habanalabs: revise and document use of boot status flags The boot status flag "SRAM available" can be set by f/w Linux (in the general case) or by f/w uboot (in some specific debug scenario) but never by f/w preboot. 
Hence, when polling the boot status flags in the preboot stage we do not want to poll on "SRAM Available". The special case in which uboot sets this flag is when we are running a special debug scenario without Linux. In this case, at some point during the boot, the uboot relocates its code to the DRAM and then sets the specified flag. Signed-off-by: Ohad Sharabi Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/common/firmware_if.c | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) (limited to 'drivers/misc') diff --git a/drivers/misc/habanalabs/common/firmware_if.c b/drivers/misc/habanalabs/common/firmware_if.c index 025707a21882..482bed152c39 100644 --- a/drivers/misc/habanalabs/common/firmware_if.c +++ b/drivers/misc/habanalabs/common/firmware_if.c @@ -1101,7 +1101,6 @@ static int hl_fw_read_preboot_caps(struct hl_device *hdev, (status == CPU_BOOT_STATUS_DRAM_RDY) || (status == CPU_BOOT_STATUS_NIC_FW_RDY) || (status == CPU_BOOT_STATUS_READY_TO_BOOT) || - (status == CPU_BOOT_STATUS_SRAM_AVAIL) || (status == CPU_BOOT_STATUS_WAITING_FOR_BOOT_FIT), FW_CPU_STATUS_POLL_INTERVAL_USEC, timeout); @@ -2055,12 +2054,20 @@ static int hl_fw_dynamic_wait_for_boot_fit_active(struct hl_device *hdev, dyn_loader = &fw_loader->dynamic_loader; - /* Make sure CPU boot-loader is running */ + /* + * Make sure CPU boot-loader is running + * Note that the CPU_BOOT_STATUS_SRAM_AVAIL is generally set by Linux + * yet there is a debug scenario in which we loading uboot (without Linux) + * which at later stage is relocated to DRAM.
In this case we expect + * uboot to set the CPU_BOOT_STATUS_SRAM_AVAIL and so we add it to the + * poll flags + */ rc = hl_poll_timeout( hdev, le32_to_cpu(dyn_loader->comm_desc.cpu_dyn_regs.cpu_boot_status), status, - (status == CPU_BOOT_STATUS_READY_TO_BOOT), + (status == CPU_BOOT_STATUS_READY_TO_BOOT) || + (status == CPU_BOOT_STATUS_SRAM_AVAIL), FW_CPU_STATUS_POLL_INTERVAL_USEC, dyn_loader->wait_for_bl_timeout); if (rc) { @@ -2081,7 +2088,7 @@ static int hl_fw_dynamic_wait_for_linux_active(struct hl_device *hdev, dyn_loader = &fw_loader->dynamic_loader; - /* Make sure CPU boot-loader is running */ + /* Make sure CPU linux is running */ rc = hl_poll_timeout( hdev, @@ -2415,7 +2422,14 @@ static int hl_fw_static_init_cpu(struct hl_device *hdev, WREG32(msg_to_cpu_reg, KMD_MSG_NA); } - /* Make sure CPU boot-loader is running */ + /* + * Make sure CPU boot-loader is running + * Note that the CPU_BOOT_STATUS_SRAM_AVAIL is generally set by Linux + * yet there is a debug scenario in which we loading uboot (without Linux) + * which at later stage is relocated to DRAM. In this case we expect + * uboot to set the CPU_BOOT_STATUS_SRAM_AVAIL and so we add it to the + * poll flags + */ rc = hl_poll_timeout( hdev, cpu_boot_status_reg, -- cgit v1.2.3 From 90d283b6726fc2e963042b6884951aa81afd0ff7 Mon Sep 17 00:00:00 2001 From: Guy Zadicario Date: Tue, 12 Oct 2021 10:30:28 +0300 Subject: habanalabs/gaudi: fix debugfs dma channel selection Do not use a dma channel for debugfs requested transfer if its QM is not idle.
Signed-off-by: Guy Zadicario Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/gaudi/gaudi.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'drivers/misc') diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c index d2b7ecb45497..92d55a0a10c1 100644 --- a/drivers/misc/habanalabs/gaudi/gaudi.c +++ b/drivers/misc/habanalabs/gaudi/gaudi.c @@ -6430,6 +6430,7 @@ static int gaudi_debugfs_read_dma(struct hl_device *hdev, u64 addr, u32 size, { u32 dma_core_sts0, err_cause, cfg1, size_left, pos, size_to_dma; struct gaudi_device *gaudi = hdev->asic_specific; + u32 qm_glbl_sts0, qm_cgm_sts; u64 dma_offset, qm_offset; dma_addr_t dma_addr; void *kernel_addr; @@ -6454,14 +6455,20 @@ static int gaudi_debugfs_read_dma(struct hl_device *hdev, u64 addr, u32 size, dma_offset = dma_id * DMA_CORE_OFFSET; qm_offset = dma_id * DMA_QMAN_OFFSET; dma_core_sts0 = RREG32(mmDMA0_CORE_STS0 + dma_offset); - is_eng_idle = IS_DMA_IDLE(dma_core_sts0); + qm_glbl_sts0 = RREG32(mmDMA0_QM_GLBL_STS0 + qm_offset); + qm_cgm_sts = RREG32(mmDMA0_QM_CGM_STS + qm_offset); + is_eng_idle = IS_QM_IDLE(qm_glbl_sts0, qm_cgm_sts) && + IS_DMA_IDLE(dma_core_sts0); if (!is_eng_idle) { dma_id = gaudi_dma_assignment[GAUDI_PCI_DMA_2]; dma_offset = dma_id * DMA_CORE_OFFSET; qm_offset = dma_id * DMA_QMAN_OFFSET; dma_core_sts0 = RREG32(mmDMA0_CORE_STS0 + dma_offset); - is_eng_idle = IS_DMA_IDLE(dma_core_sts0); + qm_glbl_sts0 = RREG32(mmDMA0_QM_GLBL_STS0 + qm_offset); + qm_cgm_sts = RREG32(mmDMA0_QM_CGM_STS + qm_offset); + is_eng_idle = IS_QM_IDLE(qm_glbl_sts0, qm_cgm_sts) && + IS_DMA_IDLE(dma_core_sts0); if (!is_eng_idle) { dev_err_ratelimited(hdev->dev, -- cgit v1.2.3 From f06bad02b58733ed9e65b4c8d083270c8e9d0fa7 Mon Sep 17 00:00:00 2001 From: Yuri Nudelman Date: Thu, 14 Oct 2021 10:33:27 +0300 Subject: habanalabs: wrong VA size calculation VA blocks are currently stored in an inconsistent way. 
Sometimes block end is inclusive, sometimes exclusive. This leads to wrong size calculations in certain cases, plus could lead to a segmentation fault in case mapping process fails in the middle and we try to roll it back. Need to make this consistent - start inclusive till end inclusive. For example, the regions table may now look like this: 0x0000 - 0x1fff : allocated 0x2000 - 0x2fff : free 0x3000 - 0x3fff : allocated Signed-off-by: Yuri Nudelman Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/common/command_buffer.c | 2 +- drivers/misc/habanalabs/common/habanalabs.h | 16 ++-------------- drivers/misc/habanalabs/common/memory.c | 22 ++++++++++++++-------- 3 files changed, 17 insertions(+), 23 deletions(-) (limited to 'drivers/misc') diff --git a/drivers/misc/habanalabs/common/command_buffer.c b/drivers/misc/habanalabs/common/command_buffer.c index 8132a84698d5..41a12bcd26e5 100644 --- a/drivers/misc/habanalabs/common/command_buffer.c +++ b/drivers/misc/habanalabs/common/command_buffer.c @@ -57,7 +57,7 @@ static int cb_map_mem(struct hl_ctx *ctx, struct hl_cb *cb) } va_block->start = virt_addr; - va_block->end = virt_addr + page_size; + va_block->end = virt_addr + page_size - 1; va_block->size = page_size; list_add_tail(&va_block->node, &cb->va_block_list); } diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h index a2002cbf794b..4f3c228c9b9d 100644 --- a/drivers/misc/habanalabs/common/habanalabs.h +++ b/drivers/misc/habanalabs/common/habanalabs.h @@ -2757,21 +2757,9 @@ static inline bool hl_mem_area_inside_range(u64 address, u64 size, static inline bool hl_mem_area_crosses_range(u64 address, u32 size, u64 range_start_address, u64 range_end_address) { - u64 end_address = address + size; + u64 end_address = address + size - 1; - if ((address >= range_start_address) && - (address < range_end_address)) - return true; - - if ((end_address >= range_start_address) && - (end_address < 
range_end_address)) - return true; - - if ((address < range_start_address) && - (end_address >= range_end_address)) - return true; - - return false; + return ((address <= range_end_address) && (range_start_address <= end_address)); } int hl_device_open(struct inode *inode, struct file *filp); diff --git a/drivers/misc/habanalabs/common/memory.c b/drivers/misc/habanalabs/common/memory.c index 9bd626a00de3..1185f9aec989 100644 --- a/drivers/misc/habanalabs/common/memory.c +++ b/drivers/misc/habanalabs/common/memory.c @@ -477,7 +477,7 @@ static int add_va_block_locked(struct hl_device *hdev, struct list_head *va_list, u64 start, u64 end) { struct hl_vm_va_block *va_block, *res = NULL; - u64 size = end - start; + u64 size = end - start + 1; print_va_list_locked(hdev, va_list); @@ -644,7 +644,7 @@ static u64 get_va_block(struct hl_device *hdev, continue; } - valid_size = va_block->end - valid_start; + valid_size = va_block->end - valid_start + 1; if (valid_size < size) continue; @@ -707,7 +707,7 @@ static u64 get_va_block(struct hl_device *hdev, if (new_va_block->size > size) { new_va_block->start += size; - new_va_block->size = new_va_block->end - new_va_block->start; + new_va_block->size = new_va_block->end - new_va_block->start + 1; } else { list_del(&new_va_block->node); kfree(new_va_block); @@ -2388,8 +2388,14 @@ static int va_range_init(struct hl_device *hdev, struct hl_va_range *va_range, start += PAGE_SIZE; } - if (end & (PAGE_SIZE - 1)) - end &= PAGE_MASK; + /* + * The end of the range is inclusive, hence we need to align it + * to the end of the last full page in the range. For example if + * end = 0x3ff5 with page size 0x1000, we need to align it to + * 0x2fff. The remainig 0xff5 bytes do not form a full page. 
+ */ + if ((end + 1) & (PAGE_SIZE - 1)) + end = ((end + 1) & PAGE_MASK) - 1; } if (start >= end) { @@ -2564,14 +2570,14 @@ int hl_vm_ctx_init(struct hl_ctx *ctx) return 0; dram_range_start = prop->dmmu.start_addr; - dram_range_end = prop->dmmu.end_addr; + dram_range_end = prop->dmmu.end_addr - 1; dram_page_size = prop->dram_page_size ? prop->dram_page_size : prop->dmmu.page_size; host_range_start = prop->pmmu.start_addr; - host_range_end = prop->pmmu.end_addr; + host_range_end = prop->pmmu.end_addr - 1; host_page_size = prop->pmmu.page_size; host_huge_range_start = prop->pmmu_huge.start_addr; - host_huge_range_end = prop->pmmu_huge.end_addr; + host_huge_range_end = prop->pmmu_huge.end_addr - 1; host_huge_page_size = prop->pmmu_huge.page_size; return vm_ctx_init_with_ranges(ctx, host_range_start, host_range_end, -- cgit v1.2.3 From 89d6decdb7346082c1f168a27d1386c34550bbd3 Mon Sep 17 00:00:00 2001 From: Yuri Nudelman Date: Thu, 21 Oct 2021 15:08:51 +0300 Subject: habanalabs: make last_mask an MMU property Currently LAST_MASK is a global, but really it is an MMU implementation specific. We need this change for future ASICs. Signed-off-by: Yuri Nudelman Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/common/habanalabs.h | 2 ++ drivers/misc/habanalabs/common/mmu/mmu_v1.c | 10 +++++----- drivers/misc/habanalabs/gaudi/gaudi.c | 1 + drivers/misc/habanalabs/goya/goya.c | 2 ++ 4 files changed, 10 insertions(+), 5 deletions(-) (limited to 'drivers/misc') diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h index 4f3c228c9b9d..6dd7d9ee7a44 100644 --- a/drivers/misc/habanalabs/common/habanalabs.h +++ b/drivers/misc/habanalabs/common/habanalabs.h @@ -382,6 +382,7 @@ enum hl_device_hw_state { * @hop3_mask: mask to get the PTE address in hop 3. * @hop4_mask: mask to get the PTE address in hop 4. * @hop5_mask: mask to get the PTE address in hop 5. 
+ * @last_mask: mask to get the bit indicating this is the last hop. * @page_size: default page size used to allocate memory. * @num_hops: The amount of hops supported by the translation table. * @host_resident: Should the MMU page table reside in host memory or in the @@ -402,6 +403,7 @@ struct hl_mmu_properties { u64 hop3_mask; u64 hop4_mask; u64 hop5_mask; + u64 last_mask; u32 page_size; u32 num_hops; u8 host_resident; diff --git a/drivers/misc/habanalabs/common/mmu/mmu_v1.c b/drivers/misc/habanalabs/common/mmu/mmu_v1.c index 0f536f79dd9c..159da2fafd79 100644 --- a/drivers/misc/habanalabs/common/mmu/mmu_v1.c +++ b/drivers/misc/habanalabs/common/mmu/mmu_v1.c @@ -573,7 +573,7 @@ static int _hl_mmu_v1_unmap(struct hl_ctx *ctx, curr_pte = *(u64 *) (uintptr_t) hop3_pte_addr; - is_huge = curr_pte & LAST_MASK; + is_huge = curr_pte & mmu_prop->last_mask; if (is_dram_addr && !is_huge) { dev_err(hdev->dev, @@ -597,7 +597,7 @@ static int _hl_mmu_v1_unmap(struct hl_ctx *ctx, if (hdev->dram_default_page_mapping && is_dram_addr) { u64 default_pte = (prop->mmu_dram_default_page_addr & - HOP_PHYS_ADDR_MASK) | LAST_MASK | + HOP_PHYS_ADDR_MASK) | mmu_prop->last_mask | PAGE_PRESENT_MASK; if (curr_pte == default_pte) { dev_err(hdev->dev, @@ -729,7 +729,7 @@ static int _hl_mmu_v1_map(struct hl_ctx *ctx, u64 virt_addr, u64 phys_addr, if (hdev->dram_default_page_mapping && is_dram_addr) { u64 default_pte = (prop->mmu_dram_default_page_addr & - HOP_PHYS_ADDR_MASK) | LAST_MASK | + HOP_PHYS_ADDR_MASK) | mmu_prop->last_mask | PAGE_PRESENT_MASK; if (curr_pte != default_pte) { @@ -769,7 +769,7 @@ static int _hl_mmu_v1_map(struct hl_ctx *ctx, u64 virt_addr, u64 phys_addr, goto err; } - curr_pte = (phys_addr & HOP_PHYS_ADDR_MASK) | LAST_MASK + curr_pte = (phys_addr & HOP_PHYS_ADDR_MASK) | mmu_prop->last_mask | PAGE_PRESENT_MASK; if (is_huge) @@ -930,7 +930,7 @@ static int hl_mmu_v1_get_tlb_info(struct hl_ctx *ctx, u64 virt_addr, if (!(hops->hop_info[i].hop_pte_val & PAGE_PRESENT_MASK)) return 
-EFAULT; - if (hops->hop_info[i].hop_pte_val & LAST_MASK) + if (hops->hop_info[i].hop_pte_val & mmu_prop->last_mask) break; } diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c index 92d55a0a10c1..52fffd76f5cf 100644 --- a/drivers/misc/habanalabs/gaudi/gaudi.c +++ b/drivers/misc/habanalabs/gaudi/gaudi.c @@ -613,6 +613,7 @@ static int gaudi_set_fixed_properties(struct hl_device *hdev) (VA_HOST_SPACE_START + VA_HOST_SPACE_SIZE / 2) - 1; prop->pmmu.page_size = PAGE_SIZE_4KB; prop->pmmu.num_hops = MMU_ARCH_5_HOPS; + prop->pmmu.last_mask = LAST_MASK; /* PMMU and HPMMU are the same except of page size */ memcpy(&prop->pmmu_huge, &prop->pmmu, sizeof(prop->pmmu)); diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c index 5536e8c27bd5..59bb12fcc935 100644 --- a/drivers/misc/habanalabs/goya/goya.c +++ b/drivers/misc/habanalabs/goya/goya.c @@ -429,6 +429,7 @@ int goya_set_fixed_properties(struct hl_device *hdev) prop->dmmu.end_addr = VA_DDR_SPACE_END; prop->dmmu.page_size = PAGE_SIZE_2MB; prop->dmmu.num_hops = MMU_ARCH_5_HOPS; + prop->dmmu.last_mask = LAST_MASK; /* shifts and masks are the same in PMMU and DMMU */ memcpy(&prop->pmmu, &prop->dmmu, sizeof(prop->dmmu)); @@ -436,6 +437,7 @@ int goya_set_fixed_properties(struct hl_device *hdev) prop->pmmu.end_addr = VA_HOST_SPACE_END; prop->pmmu.page_size = PAGE_SIZE_4KB; prop->pmmu.num_hops = MMU_ARCH_5_HOPS; + prop->pmmu.last_mask = LAST_MASK; /* PMMU and HPMMU are the same except of page size */ memcpy(&prop->pmmu_huge, &prop->pmmu, sizeof(prop->pmmu)); -- cgit v1.2.3 From 82e5169e8adfff331169613808b45a6cfb030e81 Mon Sep 17 00:00:00 2001 From: Yuri Nudelman Date: Thu, 30 Sep 2021 15:52:25 +0300 Subject: habanalabs: add enum mmu_op_flags The enum vm_type was abused, used once as a value (indication memory type for map) and once as a flag (for cache invalidation). 
This makes it hard to add new and still keep it meaningful, hence it is better to split into one enum for values and one for flags. Signed-off-by: Yuri Nudelman Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/common/command_buffer.c | 6 +++--- drivers/misc/habanalabs/common/habanalabs.h | 11 +++++++++++ drivers/misc/habanalabs/common/memory.c | 4 ++-- drivers/misc/habanalabs/gaudi/gaudi.c | 4 ++-- drivers/misc/habanalabs/goya/goya.c | 2 +- 5 files changed, 19 insertions(+), 8 deletions(-) (limited to 'drivers/misc') diff --git a/drivers/misc/habanalabs/common/command_buffer.c b/drivers/misc/habanalabs/common/command_buffer.c index 41a12bcd26e5..fab499d252d4 100644 --- a/drivers/misc/habanalabs/common/command_buffer.c +++ b/drivers/misc/habanalabs/common/command_buffer.c @@ -80,7 +80,7 @@ static int cb_map_mem(struct hl_ctx *ctx, struct hl_cb *cb) offset += va_block->size; } - hdev->asic_funcs->mmu_invalidate_cache(hdev, false, VM_TYPE_USERPTR); + hdev->asic_funcs->mmu_invalidate_cache(hdev, false, MMU_OP_USERPTR); mutex_unlock(&ctx->mmu_lock); @@ -97,7 +97,7 @@ err_va_umap: offset -= va_block->size; } - hdev->asic_funcs->mmu_invalidate_cache(hdev, true, VM_TYPE_USERPTR); + hdev->asic_funcs->mmu_invalidate_cache(hdev, true, MMU_OP_USERPTR); mutex_unlock(&ctx->mmu_lock); @@ -126,7 +126,7 @@ static void cb_unmap_mem(struct hl_ctx *ctx, struct hl_cb *cb) "Failed to unmap CB's va 0x%llx\n", va_block->start); - hdev->asic_funcs->mmu_invalidate_cache(hdev, true, VM_TYPE_USERPTR); + hdev->asic_funcs->mmu_invalidate_cache(hdev, true, MMU_OP_USERPTR); mutex_unlock(&ctx->mmu_lock); diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h index 6dd7d9ee7a44..202c7f7948f5 100644 --- a/drivers/misc/habanalabs/common/habanalabs.h +++ b/drivers/misc/habanalabs/common/habanalabs.h @@ -352,6 +352,17 @@ enum vm_type { VM_TYPE_PHYS_PACK = 0x2 }; +/** + * enum mmu_op_flags - mmu operation relevant 
information. + * @MMU_OP_USERPTR: operation on user memory (host resident). + * @MMU_OP_PHYS_PACK: operation on DRAM (device resident). + */ +enum mmu_op_flags { + MMU_OP_USERPTR = 0x1, + MMU_OP_PHYS_PACK = 0x2 +}; + + /** * enum hl_device_hw_state - H/W device state. use this to understand whether * to do reset before hw_init or not diff --git a/drivers/misc/habanalabs/common/memory.c b/drivers/misc/habanalabs/common/memory.c index 1185f9aec989..40f2197388fe 100644 --- a/drivers/misc/habanalabs/common/memory.c +++ b/drivers/misc/habanalabs/common/memory.c @@ -2639,8 +2639,8 @@ void hl_vm_ctx_fini(struct hl_ctx *ctx) mutex_lock(&ctx->mmu_lock); /* invalidate the cache once after the unmapping loop */ - hdev->asic_funcs->mmu_invalidate_cache(hdev, true, VM_TYPE_USERPTR); - hdev->asic_funcs->mmu_invalidate_cache(hdev, true, VM_TYPE_PHYS_PACK); + hdev->asic_funcs->mmu_invalidate_cache(hdev, true, MMU_OP_USERPTR); + hdev->asic_funcs->mmu_invalidate_cache(hdev, true, MMU_OP_PHYS_PACK); mutex_unlock(&ctx->mmu_lock); diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c index 52fffd76f5cf..2e39514ee102 100644 --- a/drivers/misc/habanalabs/gaudi/gaudi.c +++ b/drivers/misc/habanalabs/gaudi/gaudi.c @@ -8688,7 +8688,7 @@ static int gaudi_internal_cb_pool_init(struct hl_device *hdev, hdev->internal_cb_pool_dma_addr, HOST_SPACE_INTERNAL_CB_SZ); - hdev->asic_funcs->mmu_invalidate_cache(hdev, false, VM_TYPE_USERPTR); + hdev->asic_funcs->mmu_invalidate_cache(hdev, false, MMU_OP_USERPTR); mutex_unlock(&ctx->mmu_lock); if (rc) @@ -8723,7 +8723,7 @@ static void gaudi_internal_cb_pool_fini(struct hl_device *hdev, HOST_SPACE_INTERNAL_CB_SZ); hl_unreserve_va_block(hdev, ctx, hdev->internal_cb_va_base, HOST_SPACE_INTERNAL_CB_SZ); - hdev->asic_funcs->mmu_invalidate_cache(hdev, true, VM_TYPE_USERPTR); + hdev->asic_funcs->mmu_invalidate_cache(hdev, true, MMU_OP_USERPTR); mutex_unlock(&ctx->mmu_lock); gen_pool_destroy(hdev->internal_cb_pool); diff --git 
a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c index 59bb12fcc935..6ee6d5b915a1 100644 --- a/drivers/misc/habanalabs/goya/goya.c +++ b/drivers/misc/habanalabs/goya/goya.c @@ -2621,7 +2621,7 @@ int goya_mmu_init(struct hl_device *hdev) (~STLB_STLB_FEATURE_EN_FOLLOWER_EN_MASK)); hdev->asic_funcs->mmu_invalidate_cache(hdev, true, - VM_TYPE_USERPTR | VM_TYPE_PHYS_PACK); + MMU_OP_USERPTR | MMU_OP_PHYS_PACK); WREG32(mmMMU_MMU_ENABLE, 1); WREG32(mmMMU_SPI_MASK, 0xF); -- cgit v1.2.3 From 6ccba9a3bca95a24fd936e3c3542cf2ff2941b0f Mon Sep 17 00:00:00 2001 From: Yuri Nudelman Date: Mon, 25 Oct 2021 11:37:25 +0300 Subject: habanalabs: partly skip cache flush when in PMMU map flow The PCI MMU cache is two layered. The upper layer, memcache, uses cache lines, the bottom layer doesn't. Hence, after PMMU map operation we have to invalidate memcache, to avoid the situation where the new entry is already in the cache due to its cache line being fully in the cache. However, we do not have to invalidate the lower cache, and here we can optimize, since cache invalidation is time consuming. 
Signed-off-by: Yuri Nudelman Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/common/command_buffer.c | 3 ++- drivers/misc/habanalabs/common/habanalabs.h | 6 +++++- drivers/misc/habanalabs/common/memory.c | 3 ++- 3 files changed, 9 insertions(+), 3 deletions(-) (limited to 'drivers/misc') diff --git a/drivers/misc/habanalabs/common/command_buffer.c b/drivers/misc/habanalabs/common/command_buffer.c index fab499d252d4..71910f7809bd 100644 --- a/drivers/misc/habanalabs/common/command_buffer.c +++ b/drivers/misc/habanalabs/common/command_buffer.c @@ -80,7 +80,8 @@ static int cb_map_mem(struct hl_ctx *ctx, struct hl_cb *cb) offset += va_block->size; } - hdev->asic_funcs->mmu_invalidate_cache(hdev, false, MMU_OP_USERPTR); + hdev->asic_funcs->mmu_invalidate_cache(hdev, false, + MMU_OP_USERPTR | MMU_OP_SKIP_LOW_CACHE_INV); mutex_unlock(&ctx->mmu_lock); diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h index 202c7f7948f5..aac73c8d2e1d 100644 --- a/drivers/misc/habanalabs/common/habanalabs.h +++ b/drivers/misc/habanalabs/common/habanalabs.h @@ -356,10 +356,14 @@ enum vm_type { * enum mmu_op_flags - mmu operation relevant information. * @MMU_OP_USERPTR: operation on user memory (host resident). * @MMU_OP_PHYS_PACK: operation on DRAM (device resident). + * @MMU_OP_CLEAR_MEMCACHE: operation has to clear memcache. + * @MMU_OP_SKIP_LOW_CACHE_INV: operation is allowed to skip parts of cache invalidation. 
*/ enum mmu_op_flags { MMU_OP_USERPTR = 0x1, - MMU_OP_PHYS_PACK = 0x2 + MMU_OP_PHYS_PACK = 0x2, + MMU_OP_CLEAR_MEMCACHE = 0x4, + MMU_OP_SKIP_LOW_CACHE_INV = 0x8, }; diff --git a/drivers/misc/habanalabs/common/memory.c b/drivers/misc/habanalabs/common/memory.c index 40f2197388fe..cd3640617d02 100644 --- a/drivers/misc/habanalabs/common/memory.c +++ b/drivers/misc/habanalabs/common/memory.c @@ -1202,7 +1202,8 @@ static int map_device_va(struct hl_ctx *ctx, struct hl_mem_in *args, } rc = hdev->asic_funcs->mmu_invalidate_cache_range(hdev, false, - *vm_type, ctx->asid, ret_vaddr, phys_pg_pack->total_size); + *vm_type | MMU_OP_SKIP_LOW_CACHE_INV, + ctx->asid, ret_vaddr, phys_pg_pack->total_size); mutex_unlock(&ctx->mmu_lock); -- cgit v1.2.3 From 8f82ff75dfd27afecb90246889c0c15d28e15ca7 Mon Sep 17 00:00:00 2001 From: Ohad Sharabi Date: Thu, 21 Oct 2021 11:24:41 +0300 Subject: habanalabs: adding indication of boot fit loaded Up until now the driver stored indication if Linux was loaded on the device CPU. This was needed in order to coordinate some tasks that are performed by the Linux. In future ASICs, many of those tasks will be performed by the boot fit, so now we need the same indication of boot fit load status. 
Signed-off-by: Ohad Sharabi Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/common/device.c | 4 ++-- drivers/misc/habanalabs/common/firmware_if.c | 4 +++- drivers/misc/habanalabs/common/habanalabs.h | 7 +++++-- drivers/misc/habanalabs/gaudi/gaudi.c | 4 ++-- drivers/misc/habanalabs/goya/goya.c | 2 +- 5 files changed, 13 insertions(+), 8 deletions(-) (limited to 'drivers/misc') diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c index 2022e5d7b3ad..9674e2520532 100644 --- a/drivers/misc/habanalabs/common/device.c +++ b/drivers/misc/habanalabs/common/device.c @@ -1138,7 +1138,7 @@ kill_processes: hdev->asic_funcs->hw_fini(hdev, hard_reset, fw_reset); if (hard_reset) { - hdev->fw_loader.linux_loaded = false; + hdev->fw_loader.fw_comp_loaded = FW_TYPE_NONE; /* Release kernel context */ if (hdev->kernel_ctx && hl_ctx_put(hdev->kernel_ctx) == 1) @@ -1692,7 +1692,7 @@ void hl_device_fini(struct hl_device *hdev) /* Reset the H/W. 
It will be in idle state after this returns */ hdev->asic_funcs->hw_fini(hdev, true, false); - hdev->fw_loader.linux_loaded = false; + hdev->fw_loader.fw_comp_loaded = FW_TYPE_NONE; /* Release kernel context */ if ((hdev->kernel_ctx) && (hl_ctx_put(hdev->kernel_ctx) != 1)) diff --git a/drivers/misc/habanalabs/common/firmware_if.c b/drivers/misc/habanalabs/common/firmware_if.c index 482bed152c39..8cbec10cddb1 100644 --- a/drivers/misc/habanalabs/common/firmware_if.c +++ b/drivers/misc/habanalabs/common/firmware_if.c @@ -1919,6 +1919,8 @@ static void hl_fw_boot_fit_update_state(struct hl_device *hdev, { struct asic_fixed_properties *prop = &hdev->asic_prop; + hdev->fw_loader.fw_comp_loaded |= FW_TYPE_BOOT_CPU; + /* Clear reset status since we need to read it again from boot CPU */ prop->hard_reset_done_by_fw = false; @@ -2127,7 +2129,7 @@ static void hl_fw_linux_update_state(struct hl_device *hdev, { struct asic_fixed_properties *prop = &hdev->asic_prop; - hdev->fw_loader.linux_loaded = true; + hdev->fw_loader.fw_comp_loaded |= FW_TYPE_LINUX; /* Clear reset status since we need to read again from app */ prop->hard_reset_done_by_fw = false; diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h index aac73c8d2e1d..b3c6b660c7aa 100644 --- a/drivers/misc/habanalabs/common/habanalabs.h +++ b/drivers/misc/habanalabs/common/habanalabs.h @@ -219,6 +219,7 @@ enum hl_fw_component { /** * enum hl_fw_types - F/W types present in the system + * @FW_TYPE_NONE: no FW component indication * @FW_TYPE_LINUX: Linux image for device CPU * @FW_TYPE_BOOT_CPU: Boot image for device CPU * @FW_TYPE_PREBOOT_CPU: Indicates pre-loaded CPUs are present in the system @@ -226,6 +227,7 @@ enum hl_fw_component { * @FW_TYPE_ALL_TYPES: Mask for all types */ enum hl_fw_types { + FW_TYPE_NONE = 0x0, FW_TYPE_LINUX = 0x1, FW_TYPE_BOOT_CPU = 0x2, FW_TYPE_PREBOOT_CPU = 0x4, @@ -1059,7 +1061,8 @@ struct fw_image_props { * @skip_bmc: should BMC be skipped * 
@sram_bar_id: SRAM bar ID * @dram_bar_id: DRAM bar ID - * @linux_loaded: true if linux was loaded so far + * @fw_comp_loaded: bitmask of loaded FW components. set bit meaning loaded + * component. values are set according to enum hl_fw_types. */ struct fw_load_mgr { union { @@ -1073,7 +1076,7 @@ struct fw_load_mgr { u8 skip_bmc; u8 sram_bar_id; u8 dram_bar_id; - u8 linux_loaded; + u8 fw_comp_loaded; }; /** diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c index 2e39514ee102..1dcce1bc976f 100644 --- a/drivers/misc/habanalabs/gaudi/gaudi.c +++ b/drivers/misc/habanalabs/gaudi/gaudi.c @@ -4007,7 +4007,7 @@ static void gaudi_init_firmware_loader(struct hl_device *hdev) struct fw_load_mgr *fw_loader = &hdev->fw_loader; /* fill common fields */ - fw_loader->linux_loaded = false; + fw_loader->fw_comp_loaded = FW_TYPE_NONE; fw_loader->boot_fit_img.image_name = GAUDI_BOOT_FIT_FILE; fw_loader->linux_img.image_name = GAUDI_LINUX_FW_FILE; fw_loader->cpu_timeout = GAUDI_CPU_TIMEOUT_USEC; @@ -4290,7 +4290,7 @@ static void gaudi_hw_fini(struct hl_device *hdev, bool hard_reset, bool fw_reset * via the GIC. Otherwise, we need to use COMMS or the MSG_TO_CPU * registers in case of old F/Ws */ - if (hdev->fw_loader.linux_loaded) { + if (hdev->fw_loader.fw_comp_loaded & FW_TYPE_LINUX) { irq_handler_offset = hdev->asic_prop.gic_interrupts_enable ? 
mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR : le32_to_cpu(dyn_regs->gic_host_halt_irq); diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c index 6ee6d5b915a1..ce06103292a0 100644 --- a/drivers/misc/habanalabs/goya/goya.c +++ b/drivers/misc/habanalabs/goya/goya.c @@ -2504,7 +2504,7 @@ static void goya_init_firmware_loader(struct hl_device *hdev) struct fw_load_mgr *fw_loader = &hdev->fw_loader; /* fill common fields */ - fw_loader->linux_loaded = false; + fw_loader->fw_comp_loaded = FW_TYPE_NONE; fw_loader->boot_fit_img.image_name = GOYA_BOOT_FIT_FILE; fw_loader->linux_img.image_name = GOYA_LINUX_FW_FILE; fw_loader->cpu_timeout = GOYA_CPU_TIMEOUT_USEC; -- cgit v1.2.3 From f4e7906dbe7e922b057e4533a585f7943fe90c90 Mon Sep 17 00:00:00 2001 From: Ohad Sharabi Date: Tue, 26 Oct 2021 15:33:23 +0300 Subject: habanalabs: use variable poll interval for fw loading Using a variable poll interval for fw loading allows us to support much slower environments (emulation) while changing only a single line in the code, instead of choosing a different interval in each function that polls. 
Signed-off-by: Ohad Sharabi Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/common/firmware_if.c | 35 ++++++++++++++----------- drivers/misc/habanalabs/common/habanalabs.h | 5 ++++ drivers/misc/habanalabs/common/habanalabs_drv.c | 3 +++ 3 files changed, 27 insertions(+), 16 deletions(-) (limited to 'drivers/misc') diff --git a/drivers/misc/habanalabs/common/firmware_if.c b/drivers/misc/habanalabs/common/firmware_if.c index 8cbec10cddb1..c68ad4d7b1bb 100644 --- a/drivers/misc/habanalabs/common/firmware_if.c +++ b/drivers/misc/habanalabs/common/firmware_if.c @@ -15,8 +15,6 @@ #define FW_FILE_MAX_SIZE 0x1400000 /* maximum size of 20MB */ -#define FW_CPU_STATUS_POLL_INTERVAL_USEC 10000 - static char *extract_fw_ver_from_str(const char *fw_str) { char *str, *fw_ver, *whitespace; @@ -1102,7 +1100,7 @@ static int hl_fw_read_preboot_caps(struct hl_device *hdev, (status == CPU_BOOT_STATUS_NIC_FW_RDY) || (status == CPU_BOOT_STATUS_READY_TO_BOOT) || (status == CPU_BOOT_STATUS_WAITING_FOR_BOOT_FIT), - FW_CPU_STATUS_POLL_INTERVAL_USEC, + hdev->fw_poll_interval_usec, timeout); if (rc) { @@ -1286,11 +1284,7 @@ int hl_fw_read_preboot_status(struct hl_device *hdev, u32 cpu_boot_status_reg, { int rc; - /* pldm was added for cases in which we use preboot on pldm and want - * to load boot fit, but we can't wait for preboot because it runs - * very slowly - */ - if (!(hdev->fw_components & FW_TYPE_PREBOOT_CPU) || hdev->pldm) + if (!(hdev->fw_components & FW_TYPE_PREBOOT_CPU)) return 0; /* @@ -1436,7 +1430,7 @@ static int hl_fw_dynamic_wait_for_status(struct hl_device *hdev, le32_to_cpu(dyn_regs->cpu_cmd_status_to_host), status, FIELD_GET(COMMS_STATUS_STATUS_MASK, status) == expected_status, - FW_CPU_STATUS_POLL_INTERVAL_USEC, + hdev->fw_poll_interval_usec, timeout); if (rc) { @@ -2070,7 +2064,7 @@ static int hl_fw_dynamic_wait_for_boot_fit_active(struct hl_device *hdev, status, (status == CPU_BOOT_STATUS_READY_TO_BOOT) || (status == 
CPU_BOOT_STATUS_SRAM_AVAIL), - FW_CPU_STATUS_POLL_INTERVAL_USEC, + hdev->fw_poll_interval_usec, dyn_loader->wait_for_bl_timeout); if (rc) { dev_err(hdev->dev, "failed to wait for boot\n"); @@ -2097,7 +2091,7 @@ static int hl_fw_dynamic_wait_for_linux_active(struct hl_device *hdev, le32_to_cpu(dyn_loader->comm_desc.cpu_dyn_regs.cpu_boot_status), status, (status == CPU_BOOT_STATUS_SRAM_AVAIL), - FW_CPU_STATUS_POLL_INTERVAL_USEC, + hdev->fw_poll_interval_usec, fw_loader->cpu_timeout); if (rc) { dev_err(hdev->dev, "failed to wait for Linux\n"); @@ -2296,6 +2290,15 @@ static int hl_fw_dynamic_init_cpu(struct hl_device *hdev, goto protocol_err; } + /* + * when testing FW load (without Linux) on PLDM we don't want to + * wait until boot fit is active as it may take several hours. + * instead, we load the bootfit and let it do all initializations in + * the background. + */ + if (hdev->pldm && !(hdev->fw_components & FW_TYPE_LINUX)) + return 0; + rc = hl_fw_dynamic_wait_for_boot_fit_active(hdev, fw_loader); if (rc) goto protocol_err; @@ -2388,7 +2391,7 @@ static int hl_fw_static_init_cpu(struct hl_device *hdev, cpu_boot_status_reg, status, status == CPU_BOOT_STATUS_WAITING_FOR_BOOT_FIT, - FW_CPU_STATUS_POLL_INTERVAL_USEC, + hdev->fw_poll_interval_usec, fw_loader->boot_fit_timeout); if (rc) { @@ -2411,7 +2414,7 @@ static int hl_fw_static_init_cpu(struct hl_device *hdev, cpu_msg_status_reg, status, status == CPU_MSG_OK, - FW_CPU_STATUS_POLL_INTERVAL_USEC, + hdev->fw_poll_interval_usec, fw_loader->boot_fit_timeout); if (rc) { @@ -2440,7 +2443,7 @@ static int hl_fw_static_init_cpu(struct hl_device *hdev, (status == CPU_BOOT_STATUS_NIC_FW_RDY) || (status == CPU_BOOT_STATUS_READY_TO_BOOT) || (status == CPU_BOOT_STATUS_SRAM_AVAIL), - FW_CPU_STATUS_POLL_INTERVAL_USEC, + hdev->fw_poll_interval_usec, cpu_timeout); dev_dbg(hdev->dev, "uboot status = %d\n", status); @@ -2489,7 +2492,7 @@ static int hl_fw_static_init_cpu(struct hl_device *hdev, cpu_boot_status_reg, status, (status == 
CPU_BOOT_STATUS_BMC_WAITING_SKIPPED), - FW_CPU_STATUS_POLL_INTERVAL_USEC, + hdev->fw_poll_interval_usec, cpu_timeout); if (rc) { @@ -2509,7 +2512,7 @@ static int hl_fw_static_init_cpu(struct hl_device *hdev, cpu_boot_status_reg, status, (status == CPU_BOOT_STATUS_SRAM_AVAIL), - FW_CPU_STATUS_POLL_INTERVAL_USEC, + hdev->fw_poll_interval_usec, cpu_timeout); /* Clear message */ diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h index b3c6b660c7aa..5fc9cfd892e8 100644 --- a/drivers/misc/habanalabs/common/habanalabs.h +++ b/drivers/misc/habanalabs/common/habanalabs.h @@ -61,6 +61,9 @@ #define HL_CPUCP_INFO_TIMEOUT_USEC 10000000 /* 10s */ #define HL_CPUCP_EEPROM_TIMEOUT_USEC 10000000 /* 10s */ +#define HL_FW_STATUS_POLL_INTERVAL_USEC 10000 /* 10ms */ +#define HL_FW_STATUS_PLDM_POLL_INTERVAL_USEC 300000000 /* 300s */ + #define HL_PCI_ELBI_TIMEOUT_MSEC 10 /* 10ms */ #define HL_SIM_MAX_TIMEOUT_US 10000000 /* 10s */ @@ -2459,6 +2462,7 @@ struct multi_cs_data { * @last_open_session_duration_jif: duration (jiffies) of the last device open * session. * @open_counter: number of successful device open operations. + * @fw_poll_interval_usec: FW status poll interval in usec. * @in_reset: is device in reset flow. * @curr_pll_profile: current PLL profile. * @card_type: Various ASICs have several card types. 
This indicates the card @@ -2607,6 +2611,7 @@ struct hl_device { u64 last_successful_open_jif; u64 last_open_session_duration_jif; u64 open_counter; + u64 fw_poll_interval_usec; atomic_t in_reset; enum hl_pll_frequency curr_pll_profile; enum cpucp_card_types card_type; diff --git a/drivers/misc/habanalabs/common/habanalabs_drv.c b/drivers/misc/habanalabs/common/habanalabs_drv.c index 949d1b5c5c41..5989826701bc 100644 --- a/drivers/misc/habanalabs/common/habanalabs_drv.c +++ b/drivers/misc/habanalabs/common/habanalabs_drv.c @@ -345,6 +345,9 @@ int create_hdev(struct hl_device **dev, struct pci_dev *pdev, set_driver_behavior_per_device(hdev); + hdev->fw_poll_interval_usec = hdev->pldm ? HL_FW_STATUS_PLDM_POLL_INTERVAL_USEC : + HL_FW_STATUS_POLL_INTERVAL_USEC; + hdev->curr_reset_cause = HL_RESET_CAUSE_UNKNOWN; hdev->prev_reset_trigger = HL_RESET_TRIGGER_DEFAULT; -- cgit v1.2.3 From 5edd95a4abb332fb683cf7a35eed2ae4ff7b4dcb Mon Sep 17 00:00:00 2001 From: Ohad Sharabi Date: Tue, 26 Oct 2021 10:42:24 +0300 Subject: habanalabs: don't clear previous f/w indications Once we read indication of whether f/w is doing the reset, we don't want to clear it, until the next time we read this indication. Otherwise, we might be in a state of wrong indication. Signed-off-by: Ohad Sharabi Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/common/firmware_if.c | 19 +++++-------------- 1 file changed, 5 insertions(+), 14 deletions(-) (limited to 'drivers/misc') diff --git a/drivers/misc/habanalabs/common/firmware_if.c b/drivers/misc/habanalabs/common/firmware_if.c index c68ad4d7b1bb..9addcfba6a8b 100644 --- a/drivers/misc/habanalabs/common/firmware_if.c +++ b/drivers/misc/habanalabs/common/firmware_if.c @@ -1247,8 +1247,7 @@ static void hl_fw_preboot_update_state(struct hl_device *hdev) * 3. FW application - a. Fetch fw application security status * b. 
Check whether hard reset is done by fw app */ - prop->hard_reset_done_by_fw = - !!(cpu_boot_dev_sts0 & CPU_BOOT_DEV_STS0_FW_HARD_RST_EN); + prop->hard_reset_done_by_fw = !!(cpu_boot_dev_sts0 & CPU_BOOT_DEV_STS0_FW_HARD_RST_EN); dev_dbg(hdev->dev, "Firmware preboot boot device status0 %#x\n", cpu_boot_dev_sts0); @@ -1915,17 +1914,13 @@ static void hl_fw_boot_fit_update_state(struct hl_device *hdev, hdev->fw_loader.fw_comp_loaded |= FW_TYPE_BOOT_CPU; - /* Clear reset status since we need to read it again from boot CPU */ - prop->hard_reset_done_by_fw = false; - /* Read boot_cpu status bits */ if (prop->fw_preboot_cpu_boot_dev_sts0 & CPU_BOOT_DEV_STS0_ENABLED) { prop->fw_bootfit_cpu_boot_dev_sts0 = RREG32(cpu_boot_dev_sts0_reg); - if (prop->fw_bootfit_cpu_boot_dev_sts0 & - CPU_BOOT_DEV_STS0_FW_HARD_RST_EN) - prop->hard_reset_done_by_fw = true; + prop->hard_reset_done_by_fw = !!(prop->fw_bootfit_cpu_boot_dev_sts0 & + CPU_BOOT_DEV_STS0_FW_HARD_RST_EN); dev_dbg(hdev->dev, "Firmware boot CPU status0 %#x\n", prop->fw_bootfit_cpu_boot_dev_sts0); @@ -2125,16 +2120,12 @@ static void hl_fw_linux_update_state(struct hl_device *hdev, hdev->fw_loader.fw_comp_loaded |= FW_TYPE_LINUX; - /* Clear reset status since we need to read again from app */ - prop->hard_reset_done_by_fw = false; - /* Read FW application security bits */ if (prop->fw_cpu_boot_dev_sts0_valid) { prop->fw_app_cpu_boot_dev_sts0 = RREG32(cpu_boot_dev_sts0_reg); - if (prop->fw_app_cpu_boot_dev_sts0 & - CPU_BOOT_DEV_STS0_FW_HARD_RST_EN) - prop->hard_reset_done_by_fw = true; + prop->hard_reset_done_by_fw = !!(prop->fw_app_cpu_boot_dev_sts0 & + CPU_BOOT_DEV_STS0_FW_HARD_RST_EN); if (prop->fw_app_cpu_boot_dev_sts0 & CPU_BOOT_DEV_STS0_GIC_PRIVILEGED_EN) -- cgit v1.2.3 From 138858226414bd026e63acebb7540093c97c69fd Mon Sep 17 00:00:00 2001 From: Bharat Jauhari Date: Wed, 8 Sep 2021 17:16:51 +0300 Subject: habanalabs: handle abort scenario for user interrupt In case of device reset, the driver does a force trigger on all 
waiting users to release them from waiting. However, the driver does not handle error scenario while waiting. hl_interrupt_wait_ioctl() now exits the wait in case of an error with abort status. Signed-off-by: Bharat Jauhari Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- .../misc/habanalabs/common/command_submission.c | 34 ++++++++++------------ 1 file changed, 16 insertions(+), 18 deletions(-) (limited to 'drivers/misc') diff --git a/drivers/misc/habanalabs/common/command_submission.c b/drivers/misc/habanalabs/common/command_submission.c index 4c8000fd246c..41b48929cd59 100644 --- a/drivers/misc/habanalabs/common/command_submission.c +++ b/drivers/misc/habanalabs/common/command_submission.c @@ -2768,7 +2768,7 @@ static int hl_cs_wait_ioctl(struct hl_fpriv *hpriv, void *data) static int _hl_interrupt_wait_ioctl(struct hl_device *hdev, struct hl_ctx *ctx, u32 timeout_us, u64 user_address, u64 target_value, u16 interrupt_offset, - enum hl_cs_wait_status *status, + u32 *status, u64 *timestamp) { struct hl_user_pending_interrupt *pend; @@ -2815,13 +2815,14 @@ static int _hl_interrupt_wait_ioctl(struct hl_device *hdev, struct hl_ctx *ctx, } if (completion_value >= target_value) { - *status = CS_WAIT_STATUS_COMPLETED; + *status = HL_WAIT_CS_STATUS_COMPLETED; /* There was no interrupt, we assume the completion is now. 
*/ pend->fence.timestamp = ktime_get(); - } else - *status = CS_WAIT_STATUS_BUSY; + } else { + *status = HL_WAIT_CS_STATUS_BUSY; + } - if (!timeout_us || (*status == CS_WAIT_STATUS_COMPLETED)) + if (!timeout_us || (*status == HL_WAIT_CS_STATUS_COMPLETED)) goto remove_pending_user_interrupt; wait_again: @@ -2850,7 +2851,13 @@ wait_again: } if (completion_value >= target_value) { - *status = CS_WAIT_STATUS_COMPLETED; + *status = HL_WAIT_CS_STATUS_COMPLETED; + } else if (pend->fence.error) { + dev_err_ratelimited(hdev->dev, + "interrupt based wait ioctl aborted(error:%d) due to a reset cycle initiated\n", + pend->fence.error); + /* set the command completion status as ABORTED */ + *status = HL_WAIT_CS_STATUS_ABORTED; } else { timeout = completion_rc; goto wait_again; @@ -2861,7 +2868,7 @@ wait_again: interrupt->interrupt_id); rc = -EINTR; } else { - *status = CS_WAIT_STATUS_BUSY; + *status = HL_WAIT_CS_STATUS_BUSY; } remove_pending_user_interrupt: @@ -2883,7 +2890,7 @@ static int hl_interrupt_wait_ioctl(struct hl_fpriv *hpriv, void *data) struct hl_device *hdev = hpriv->hdev; struct asic_fixed_properties *prop; union hl_wait_cs_args *args = data; - enum hl_cs_wait_status status; + u32 status = HL_WAIT_CS_STATUS_BUSY; u64 timestamp; int rc; @@ -2926,22 +2933,13 @@ static int hl_interrupt_wait_ioctl(struct hl_fpriv *hpriv, void *data) } memset(args, 0, sizeof(*args)); + args->out.status = status; if (timestamp) { args->out.timestamp_nsec = timestamp; args->out.flags |= HL_WAIT_CS_STATUS_FLAG_TIMESTAMP_VLD; } - switch (status) { - case CS_WAIT_STATUS_COMPLETED: - args->out.status = HL_WAIT_CS_STATUS_COMPLETED; - break; - case CS_WAIT_STATUS_BUSY: - default: - args->out.status = HL_WAIT_CS_STATUS_BUSY; - break; - } - return 0; } -- cgit v1.2.3 From e84e31a9123bda35a1e61f391e7c30e8b3a8ea5b Mon Sep 17 00:00:00 2001 From: Rajaravi Krishna Katta Date: Tue, 26 Oct 2021 14:11:06 +0300 Subject: habanalabs: add dedicated message towards f/w to set power CPUCP_PACKET_POWER_GET 
packet type was used for both hl_get_power() and hl_set_power(). To align with other sensor functions hl_set_power() should use CPUCP_PACKET_POWER_SET. This packet will only be used with newer ASICs, so we need to add a compatibility flag to the asic properties to indicate whether to use this packet or the GET packet. Signed-off-by: Rajaravi Krishna Katta Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/common/habanalabs.h | 3 +++ drivers/misc/habanalabs/common/hwmon.c | 8 +++++++- drivers/misc/habanalabs/gaudi/gaudi.c | 2 ++ drivers/misc/habanalabs/goya/goya.c | 2 ++ drivers/misc/habanalabs/include/common/cpucp_if.h | 4 ++++ 5 files changed, 18 insertions(+), 1 deletion(-) (limited to 'drivers/misc') diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h index 5fc9cfd892e8..dc61f7031c38 100644 --- a/drivers/misc/habanalabs/common/habanalabs.h +++ b/drivers/misc/habanalabs/common/habanalabs.h @@ -546,6 +546,8 @@ struct hl_hints_range { * @dynamic_fw_load: is dynamic FW load is supported. * @gic_interrupts_enable: true if FW is not blocking GIC controller, * false otherwise.
+ * @use_get_power_for_reset_history: To support backward compatibility for Goya + * and Gaudi */ struct asic_fixed_properties { struct hw_queue_properties *hw_queues_props; @@ -626,6 +628,7 @@ struct asic_fixed_properties { u8 iatu_done_by_fw; u8 dynamic_fw_load; u8 gic_interrupts_enable; + u8 use_get_power_for_reset_history; }; /** diff --git a/drivers/misc/habanalabs/common/hwmon.c b/drivers/misc/habanalabs/common/hwmon.c index e33f65be8a00..70182b42940d 100644 --- a/drivers/misc/habanalabs/common/hwmon.c +++ b/drivers/misc/habanalabs/common/hwmon.c @@ -677,12 +677,18 @@ int hl_set_power(struct hl_device *hdev, int sensor_index, u32 attr, long value) { struct cpucp_packet pkt; + struct asic_fixed_properties *prop = &hdev->asic_prop; int rc; memset(&pkt, 0, sizeof(pkt)); - pkt.ctl = cpu_to_le32(CPUCP_PACKET_POWER_GET << + if (prop->use_get_power_for_reset_history) + pkt.ctl = cpu_to_le32(CPUCP_PACKET_POWER_GET << CPUCP_PKT_CTL_OPCODE_SHIFT); + else + pkt.ctl = cpu_to_le32(CPUCP_PACKET_POWER_SET << + CPUCP_PKT_CTL_OPCODE_SHIFT); + pkt.sensor_index = __cpu_to_le16(sensor_index); pkt.type = __cpu_to_le16(attr); pkt.value = __cpu_to_le64(value); diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c index 1dcce1bc976f..738ad2498439 100644 --- a/drivers/misc/habanalabs/gaudi/gaudi.c +++ b/drivers/misc/habanalabs/gaudi/gaudi.c @@ -665,6 +665,8 @@ static int gaudi_set_fixed_properties(struct hl_device *hdev) prop->clk_pll_index = HL_GAUDI_MME_PLL; prop->max_freq_value = GAUDI_MAX_CLK_FREQ; + prop->use_get_power_for_reset_history = true; + return 0; } diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c index ce06103292a0..959eb21dcc69 100644 --- a/drivers/misc/habanalabs/goya/goya.c +++ b/drivers/misc/habanalabs/goya/goya.c @@ -475,6 +475,8 @@ int goya_set_fixed_properties(struct hl_device *hdev) prop->clk_pll_index = HL_GOYA_MME_PLL; + prop->use_get_power_for_reset_history = true; + return 0; } diff 
--git a/drivers/misc/habanalabs/include/common/cpucp_if.h b/drivers/misc/habanalabs/include/common/cpucp_if.h index ae13231fda94..17927968e19a 100644 --- a/drivers/misc/habanalabs/include/common/cpucp_if.h +++ b/drivers/misc/habanalabs/include/common/cpucp_if.h @@ -376,6 +376,9 @@ enum pq_init_status { * and QMANs. The f/w will return a bitmask where each bit represents * a different engine or QMAN according to enum cpucp_idle_mask. * The bit will be 1 if the engine is NOT idle. + * + * CPUCP_PACKET_POWER_SET - + * Resets power history of device to 0 */ enum cpucp_packet_id { @@ -421,6 +424,7 @@ enum cpucp_packet_id { CPUCP_PACKET_NIC_STAT_REGS_CLR, /* internal */ CPUCP_PACKET_NIC_STAT_REGS_ALL_GET, /* internal */ CPUCP_PACKET_IS_IDLE_CHECK, /* internal */ + CPUCP_PACKET_POWER_SET, /* internal */ }; #define CPUCP_PACKET_FENCE_VAL 0xFE8CE7A5 -- cgit v1.2.3 From 234caa52736b8d413892fb1b2471066dc4b46629 Mon Sep 17 00:00:00 2001 From: Bharat Jauhari Date: Thu, 16 Sep 2021 14:00:38 +0300 Subject: habanalabs: rename reset flags Rename reset flags for better readability as compared to HL_RESET_CAUSE* enum shared with the f/w. 
Signed-off-by: Bharat Jauhari Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- .../misc/habanalabs/common/command_submission.c | 2 +- drivers/misc/habanalabs/common/device.c | 40 +++++++++++----------- drivers/misc/habanalabs/common/habanalabs.h | 28 +++++++-------- drivers/misc/habanalabs/common/memory.c | 2 +- drivers/misc/habanalabs/common/sysfs.c | 2 +- drivers/misc/habanalabs/gaudi/gaudi.c | 14 ++++---- drivers/misc/habanalabs/goya/goya.c | 10 +++--- 7 files changed, 50 insertions(+), 48 deletions(-) (limited to 'drivers/misc') diff --git a/drivers/misc/habanalabs/common/command_submission.c b/drivers/misc/habanalabs/common/command_submission.c index 41b48929cd59..9ebcd9894d83 100644 --- a/drivers/misc/habanalabs/common/command_submission.c +++ b/drivers/misc/habanalabs/common/command_submission.c @@ -767,7 +767,7 @@ static void cs_timedout(struct work_struct *work) if (likely(!skip_reset_on_timeout)) { if (hdev->reset_on_lockup) - hl_device_reset(hdev, HL_RESET_TDR); + hl_device_reset(hdev, HL_DRV_RESET_TDR); else hdev->needs_reset = true; } diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c index 9674e2520532..eb5800b403b6 100644 --- a/drivers/misc/habanalabs/common/device.c +++ b/drivers/misc/habanalabs/common/device.c @@ -95,7 +95,7 @@ static void hpriv_release(struct kref *ref) if ((hdev->reset_if_device_not_idle && !device_is_idle) || hdev->reset_upon_device_release) - hl_device_reset(hdev, HL_RESET_DEVICE_RELEASE); + hl_device_reset(hdev, HL_DRV_RESET_DEV_RELEASE); /* Now we can mark the compute_ctx as empty. 
Even if a reset is running in a different * thread, we don't care because the in_reset is marked so if a user will try to open @@ -330,10 +330,10 @@ static void device_hard_reset_pending(struct work_struct *work) u32 flags; int rc; - flags = HL_RESET_HARD | HL_RESET_FROM_RESET_THREAD; + flags = HL_DRV_RESET_HARD | HL_DRV_RESET_FROM_RESET_THR; if (device_reset_work->fw_reset) - flags |= HL_RESET_FW; + flags |= HL_DRV_RESET_BYPASS_REQ_TO_FW; rc = hl_device_reset(hdev, flags); if ((rc == -EBUSY) && !hdev->device_fini_pending) { @@ -541,7 +541,7 @@ static void hl_device_heartbeat(struct work_struct *work) goto reschedule; dev_err(hdev->dev, "Device heartbeat failed!\n"); - hl_device_reset(hdev, HL_RESET_HARD | HL_RESET_HEARTBEAT); + hl_device_reset(hdev, HL_DRV_RESET_HARD | HL_DRV_RESET_HEARTBEAT); return; @@ -552,7 +552,7 @@ reschedule: * If control reached here, then at least one heartbeat work has been * scheduled since last reset/init cycle. * So if the device is not already in reset cycle, reset the flag - * prev_reset_trigger as no reset occurred with HL_RESET_FW_FATAL_ERR + * prev_reset_trigger as no reset occurred with HL_DRV_RESET_FW_FATAL_ERR * status for at least one heartbeat. From this point driver restarts * tracking future consecutive fatal errors. */ @@ -831,7 +831,7 @@ int hl_device_resume(struct hl_device *hdev) hdev->disabled = false; atomic_set(&hdev->in_reset, 0); - rc = hl_device_reset(hdev, HL_RESET_HARD); + rc = hl_device_reset(hdev, HL_DRV_RESET_HARD); if (rc) { dev_err(hdev->dev, "Failed to reset device during resume\n"); goto disable_device; @@ -948,15 +948,15 @@ static void handle_reset_trigger(struct hl_device *hdev, u32 flags) * ('in_reset' makes sure of it). This makes sure that * 'reset_cause' will continue holding its 1st recorded reason! 
*/ - if (flags & HL_RESET_HEARTBEAT) { + if (flags & HL_DRV_RESET_HEARTBEAT) { hdev->curr_reset_cause = HL_RESET_CAUSE_HEARTBEAT; - cur_reset_trigger = HL_RESET_HEARTBEAT; - } else if (flags & HL_RESET_TDR) { + cur_reset_trigger = HL_DRV_RESET_HEARTBEAT; + } else if (flags & HL_DRV_RESET_TDR) { hdev->curr_reset_cause = HL_RESET_CAUSE_TDR; - cur_reset_trigger = HL_RESET_TDR; - } else if (flags & HL_RESET_FW_FATAL_ERR) { + cur_reset_trigger = HL_DRV_RESET_TDR; + } else if (flags & HL_DRV_RESET_FW_FATAL_ERR) { hdev->curr_reset_cause = HL_RESET_CAUSE_UNKNOWN; - cur_reset_trigger = HL_RESET_FW_FATAL_ERR; + cur_reset_trigger = HL_DRV_RESET_FW_FATAL_ERR; } else { hdev->curr_reset_cause = HL_RESET_CAUSE_UNKNOWN; } @@ -979,8 +979,8 @@ static void handle_reset_trigger(struct hl_device *hdev, u32 flags) * If F/W is performing the reset, no need to send it a message to disable * PCI access */ - if ((flags & HL_RESET_HARD) && - !(flags & (HL_RESET_HEARTBEAT | HL_RESET_FW))) { + if ((flags & HL_DRV_RESET_HARD) && + !(flags & (HL_DRV_RESET_HEARTBEAT | HL_DRV_RESET_BYPASS_REQ_TO_FW))) { /* Disable PCI access from device F/W so he won't send * us additional interrupts. 
We disable MSI/MSI-X at * the halt_engines function and we can't have the F/W @@ -1025,9 +1025,9 @@ int hl_device_reset(struct hl_device *hdev, u32 flags) return 0; } - hard_reset = !!(flags & HL_RESET_HARD); - from_hard_reset_thread = !!(flags & HL_RESET_FROM_RESET_THREAD); - fw_reset = !!(flags & HL_RESET_FW); + hard_reset = !!(flags & HL_DRV_RESET_HARD); + from_hard_reset_thread = !!(flags & HL_DRV_RESET_FROM_RESET_THR); + fw_reset = !!(flags & HL_DRV_RESET_BYPASS_REQ_TO_FW); if (!hard_reset && !hdev->supports_soft_reset) { hard_instead_soft = true; @@ -1035,7 +1035,7 @@ int hl_device_reset(struct hl_device *hdev, u32 flags) } if (hdev->reset_upon_device_release && - (flags & HL_RESET_DEVICE_RELEASE)) { + (flags & HL_DRV_RESET_DEV_RELEASE)) { dev_dbg(hdev->dev, "Perform %s-reset upon device release\n", hard_reset ? "hard" : "soft"); @@ -1075,7 +1075,7 @@ do_reset: if (hard_reset) dev_info(hdev->dev, "Going to reset device\n"); - else if (flags & HL_RESET_DEVICE_RELEASE) + else if (flags & HL_DRV_RESET_DEV_RELEASE) dev_info(hdev->dev, "Going to reset device after it was released by user\n"); else @@ -1171,7 +1171,7 @@ kill_processes: hdev->hard_reset_pending = false; if (hdev->reset_trigger_repeated && - (hdev->prev_reset_trigger == HL_RESET_FW_FATAL_ERR)) { + (hdev->prev_reset_trigger == HL_DRV_RESET_FW_FATAL_ERR)) { /* if there 2 back to back resets from FW, * ensure driver puts the driver in a unusable state */ diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h index dc61f7031c38..92d12c8ba569 100644 --- a/drivers/misc/habanalabs/common/habanalabs.h +++ b/drivers/misc/habanalabs/common/habanalabs.h @@ -120,37 +120,37 @@ enum hl_mmu_page_table_location { /* * Reset Flags * - * - HL_RESET_HARD + * - HL_DRV_RESET_HARD * If set do hard reset to all engines. If not set reset just * compute/DMA engines. 
* - * - HL_RESET_FROM_RESET_THREAD + * - HL_DRV_RESET_FROM_RESET_THR * Set if the caller is the hard-reset thread * - * - HL_RESET_HEARTBEAT + * - HL_DRV_RESET_HEARTBEAT * Set if reset is due to heartbeat * - * - HL_RESET_TDR + * - HL_DRV_RESET_TDR * Set if reset is due to TDR * - * - HL_RESET_DEVICE_RELEASE + * - HL_DRV_RESET_DEV_RELEASE * Set if reset is due to device release * - * - HL_RESET_FW + * - HL_DRV_RESET_BYPASS_REQ_TO_FW * F/W will perform the reset. No need to ask it to reset the device. This is relevant * only when running with secured f/w * - * - HL_RESET_FW_FATAL_ERR + * - HL_DRV_RESET_FW_FATAL_ERR * Set if reset is due to a fatal error from FW */ -#define HL_RESET_HARD (1 << 0) -#define HL_RESET_FROM_RESET_THREAD (1 << 1) -#define HL_RESET_HEARTBEAT (1 << 2) -#define HL_RESET_TDR (1 << 3) -#define HL_RESET_DEVICE_RELEASE (1 << 4) -#define HL_RESET_FW (1 << 5) -#define HL_RESET_FW_FATAL_ERR (1 << 6) +#define HL_DRV_RESET_HARD (1 << 0) +#define HL_DRV_RESET_FROM_RESET_THR (1 << 1) +#define HL_DRV_RESET_HEARTBEAT (1 << 2) +#define HL_DRV_RESET_TDR (1 << 3) +#define HL_DRV_RESET_DEV_RELEASE (1 << 4) +#define HL_DRV_RESET_BYPASS_REQ_TO_FW (1 << 5) +#define HL_DRV_RESET_FW_FATAL_ERR (1 << 6) #define HL_MAX_SOBS_PER_MONITOR 8 diff --git a/drivers/misc/habanalabs/common/memory.c b/drivers/misc/habanalabs/common/memory.c index cd3640617d02..530f8b4fadd2 100644 --- a/drivers/misc/habanalabs/common/memory.c +++ b/drivers/misc/habanalabs/common/memory.c @@ -316,7 +316,7 @@ static int free_phys_pg_pack(struct hl_device *hdev, } if (rc && !hdev->disabled) - hl_device_reset(hdev, HL_RESET_HARD); + hl_device_reset(hdev, HL_DRV_RESET_HARD); end: kvfree(phys_pg_pack->pages); diff --git a/drivers/misc/habanalabs/common/sysfs.c b/drivers/misc/habanalabs/common/sysfs.c index 42c1769ad25d..aee0cc4d6155 100644 --- a/drivers/misc/habanalabs/common/sysfs.c +++ b/drivers/misc/habanalabs/common/sysfs.c @@ -236,7 +236,7 @@ static ssize_t hard_reset_store(struct device *dev, 
dev_warn(hdev->dev, "Hard-Reset requested through sysfs\n"); - hl_device_reset(hdev, HL_RESET_HARD); + hl_device_reset(hdev, HL_DRV_RESET_HARD); out: return count; diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c index 738ad2498439..2724ab3747f2 100644 --- a/drivers/misc/habanalabs/gaudi/gaudi.c +++ b/drivers/misc/habanalabs/gaudi/gaudi.c @@ -8003,7 +8003,7 @@ static void gaudi_handle_eqe(struct hl_device *hdev, case GAUDI_EVENT_NIC0_CS_DBG_DERR ... GAUDI_EVENT_NIC4_CS_DBG_DERR: gaudi_print_irq_info(hdev, event_type, true); gaudi_handle_ecc_event(hdev, event_type, &eq_entry->ecc_data); - fw_fatal_err_flag = HL_RESET_FW_FATAL_ERR; + fw_fatal_err_flag = HL_DRV_RESET_FW_FATAL_ERR; goto reset_device; case GAUDI_EVENT_GIC500: @@ -8011,7 +8011,7 @@ static void gaudi_handle_eqe(struct hl_device *hdev, case GAUDI_EVENT_L2_RAM_ECC: case GAUDI_EVENT_PLL0 ... GAUDI_EVENT_PLL17: gaudi_print_irq_info(hdev, event_type, false); - fw_fatal_err_flag = HL_RESET_FW_FATAL_ERR; + fw_fatal_err_flag = HL_DRV_RESET_FW_FATAL_ERR; goto reset_device; case GAUDI_EVENT_HBM0_SPI_0: @@ -8022,7 +8022,7 @@ static void gaudi_handle_eqe(struct hl_device *hdev, gaudi_hbm_read_interrupts(hdev, gaudi_hbm_event_to_dev(event_type), &eq_entry->hbm_ecc_data); - fw_fatal_err_flag = HL_RESET_FW_FATAL_ERR; + fw_fatal_err_flag = HL_DRV_RESET_FW_FATAL_ERR; goto reset_device; case GAUDI_EVENT_HBM0_SPI_1: @@ -8205,9 +8205,11 @@ static void gaudi_handle_eqe(struct hl_device *hdev, reset_device: if (hdev->asic_prop.fw_security_enabled) - hl_device_reset(hdev, HL_RESET_HARD | HL_RESET_FW | fw_fatal_err_flag); + hl_device_reset(hdev, HL_DRV_RESET_HARD + | HL_DRV_RESET_BYPASS_REQ_TO_FW + | fw_fatal_err_flag); else if (hdev->hard_reset_on_fw_events) - hl_device_reset(hdev, HL_RESET_HARD | fw_fatal_err_flag); + hl_device_reset(hdev, HL_DRV_RESET_HARD | fw_fatal_err_flag); else hl_fw_unmask_irq(hdev, event_type); } @@ -8260,7 +8262,7 @@ static int 
gaudi_mmu_invalidate_cache(struct hl_device *hdev, bool is_hard, if (rc) { dev_err_ratelimited(hdev->dev, "MMU cache invalidation timeout\n"); - hl_device_reset(hdev, HL_RESET_HARD); + hl_device_reset(hdev, HL_DRV_RESET_HARD); } return rc; diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c index 959eb21dcc69..3bbcab7da25e 100644 --- a/drivers/misc/habanalabs/goya/goya.c +++ b/drivers/misc/habanalabs/goya/goya.c @@ -4838,14 +4838,14 @@ void goya_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entry) case GOYA_ASYNC_EVENT_ID_L2_RAM_ECC: goya_print_irq_info(hdev, event_type, false); if (hdev->hard_reset_on_fw_events) - hl_device_reset(hdev, (HL_RESET_HARD | - HL_RESET_FW_FATAL_ERR)); + hl_device_reset(hdev, (HL_DRV_RESET_HARD | + HL_DRV_RESET_FW_FATAL_ERR)); break; case GOYA_ASYNC_EVENT_ID_PSOC_GPIO_05_SW_RESET: goya_print_irq_info(hdev, event_type, false); if (hdev->hard_reset_on_fw_events) - hl_device_reset(hdev, HL_RESET_HARD); + hl_device_reset(hdev, HL_DRV_RESET_HARD); break; case GOYA_ASYNC_EVENT_ID_PCIE_DEC: @@ -4905,7 +4905,7 @@ void goya_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entry) goya_print_irq_info(hdev, event_type, false); goya_print_out_of_sync_info(hdev, &eq_entry->pkt_sync_err); if (hdev->hard_reset_on_fw_events) - hl_device_reset(hdev, HL_RESET_HARD); + hl_device_reset(hdev, HL_DRV_RESET_HARD); else hl_fw_unmask_irq(hdev, event_type); break; @@ -5239,7 +5239,7 @@ static int goya_mmu_invalidate_cache(struct hl_device *hdev, bool is_hard, if (rc) { dev_err_ratelimited(hdev->dev, "MMU cache invalidation timeout\n"); - hl_device_reset(hdev, HL_RESET_HARD); + hl_device_reset(hdev, HL_DRV_RESET_HARD); } return rc; -- cgit v1.2.3 From 48f31169830f589e4c7ac475ccc7414951ded3f0 Mon Sep 17 00:00:00 2001 From: Dani Liberman Date: Thu, 14 Oct 2021 22:38:41 +0300 Subject: habanalabs: change wait for interrupt timeout to 64 bit In order to increase maximum wait-for-interrupt timeout, change it to 64 
bit variable. This wait is used only by newer ASICs, so no problem in changing this interface at this time. Signed-off-by: Dani Liberman Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- .../misc/habanalabs/common/command_submission.c | 22 +++++++++++++++++----- include/uapi/misc/habanalabs.h | 18 +++++++++++------- 2 files changed, 28 insertions(+), 12 deletions(-) (limited to 'drivers/misc') diff --git a/drivers/misc/habanalabs/common/command_submission.c b/drivers/misc/habanalabs/common/command_submission.c index 9ebcd9894d83..54a5425a77a0 100644 --- a/drivers/misc/habanalabs/common/command_submission.c +++ b/drivers/misc/habanalabs/common/command_submission.c @@ -2765,8 +2765,23 @@ static int hl_cs_wait_ioctl(struct hl_fpriv *hpriv, void *data) return 0; } +static inline unsigned long hl_usecs64_to_jiffies(const u64 usecs) +{ + if (usecs <= U32_MAX) + return usecs_to_jiffies(usecs); + + /* + * If the value in nanoseconds is larger than 64 bit, use the largest + * 64 bit value. 
+ */ + if (usecs >= ((u64)(U64_MAX / NSEC_PER_USEC))) + return nsecs_to_jiffies(U64_MAX); + + return nsecs_to_jiffies(usecs * NSEC_PER_USEC); +} + static int _hl_interrupt_wait_ioctl(struct hl_device *hdev, struct hl_ctx *ctx, - u32 timeout_us, u64 user_address, + u64 timeout_us, u64 user_address, u64 target_value, u16 interrupt_offset, u32 *status, u64 *timestamp) @@ -2778,10 +2793,7 @@ static int _hl_interrupt_wait_ioctl(struct hl_device *hdev, struct hl_ctx *ctx, long completion_rc; int rc = 0; - if (timeout_us == U32_MAX) - timeout = timeout_us; - else - timeout = usecs_to_jiffies(timeout_us); + timeout = hl_usecs64_to_jiffies(timeout_us); hl_ctx_get(hdev, ctx); diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h index 00b309590499..c5760acebdd1 100644 --- a/include/uapi/misc/habanalabs.h +++ b/include/uapi/misc/habanalabs.h @@ -911,14 +911,18 @@ struct hl_wait_cs_in { */ __u32 flags; - /* Multi CS API info- valid entries in multi-CS array */ - __u8 seq_arr_len; - __u8 pad[3]; + union { + struct { + /* Multi CS API info- valid entries in multi-CS array */ + __u8 seq_arr_len; + __u8 pad[7]; + }; - /* Absolute timeout to wait for an interrupt in microseconds. - * Relevant only when HL_WAIT_CS_FLAGS_INTERRUPT is set - */ - __u32 interrupt_timeout_us; + /* Absolute timeout to wait for an interrupt in microseconds. + * Relevant only when HL_WAIT_CS_FLAGS_INTERRUPT is set + */ + __u64 interrupt_timeout_us; + }; }; #define HL_WAIT_CS_STATUS_COMPLETED 0 -- cgit v1.2.3 From 1679c7ee580fdaa2a5df398a526b2eddc857f2a1 Mon Sep 17 00:00:00 2001 From: Ofir Bitton Date: Mon, 25 Oct 2021 09:47:04 +0300 Subject: habanalabs: expand clock throttling information uAPI In addition to the clock throttling reason, user should be able to obtain also the start time and the duration of the throttling event. 
Signed-off-by: Ofir Bitton Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/common/device.c | 3 +++ drivers/misc/habanalabs/common/habanalabs.h | 31 +++++++++++++++++++++-- drivers/misc/habanalabs/common/habanalabs_ioctl.c | 27 ++++++++++++++++++-- drivers/misc/habanalabs/gaudi/gaudi.c | 22 +++++++++++++--- drivers/misc/habanalabs/goya/goya.c | 25 +++++++++++++++--- include/uapi/misc/habanalabs.h | 16 ++++++++++-- 6 files changed, 110 insertions(+), 14 deletions(-) (limited to 'drivers/misc') diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c index eb5800b403b6..0da5a55490ff 100644 --- a/drivers/misc/habanalabs/common/device.c +++ b/drivers/misc/habanalabs/common/device.c @@ -455,6 +455,7 @@ static int device_early_init(struct hl_device *hdev) INIT_LIST_HEAD(&hdev->fpriv_list); mutex_init(&hdev->fpriv_list_lock); atomic_set(&hdev->in_reset, 0); + mutex_init(&hdev->clk_throttling.lock); return 0; @@ -495,6 +496,8 @@ static void device_early_fini(struct hl_device *hdev) mutex_destroy(&hdev->fpriv_list_lock); + mutex_destroy(&hdev->clk_throttling.lock); + hl_cb_mgr_fini(hdev, &hdev->kernel_cb_mgr); kfree(hdev->hl_chip_info); diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h index 92d12c8ba569..fc201537f7a9 100644 --- a/drivers/misc/habanalabs/common/habanalabs.h +++ b/drivers/misc/habanalabs/common/habanalabs.h @@ -2378,6 +2378,32 @@ struct multi_cs_data { u8 update_ts; }; +/** + * struct hl_clk_throttle_timestamp - current/last clock throttling timestamp + * @start: timestamp taken when 'start' event is received in driver + * @end: timestamp taken when 'end' event is received in driver + */ +struct hl_clk_throttle_timestamp { + ktime_t start; + ktime_t end; +}; + +/** + * struct hl_clk_throttle - keeps current/last clock throttling timestamps + * @timestamp: timestamp taken by driver and firmware, index 0 refers to POWER + * index 1 refers 
to THERMAL + * @lock: protects this structure as it can be accessed from both event queue + * context and info_ioctl context + * @current_reason: bitmask represents the current clk throttling reasons + * @aggregated_reason: bitmask represents aggregated clk throttling reasons since driver load + */ +struct hl_clk_throttle { + struct hl_clk_throttle_timestamp timestamp[HL_CLK_THROTTLE_TYPE_MAX]; + struct mutex lock; + u32 current_reason; + u32 aggregated_reason; +}; + /** * struct hl_device - habanalabs device structure. * @pdev: pointer to PCI device, can be NULL in case of simulator device. @@ -2445,6 +2471,7 @@ struct multi_cs_data { * @pci_mem_region: array of memory regions in the PCI * @state_dump_specs: constants and dictionaries needed to dump system state. * @multi_cs_completion: array of multi-CS completion. + * @clk_throttling: holds information about current/previous clock throttling events * @dram_used_mem: current DRAM memory consumption. * @timeout_jiffies: device CS timeout value. * @max_power: the max power of the device, as configured by the sysadmin. This @@ -2474,7 +2501,6 @@ struct multi_cs_data { * @high_pll: high PLL profile frequency. * @soft_reset_cnt: number of soft reset since the driver was loaded. * @hard_reset_cnt: number of hard reset since the driver was loaded. - * @clk_throttling_reason: bitmask represents the current clk throttling reasons * @id: device minor. 
* @id_control: minor of the control device * @cpu_pci_msb_addr: 50-bit extension bits for the device CPU's 40-bit @@ -2604,6 +2630,8 @@ struct hl_device { struct multi_cs_completion multi_cs_completion[ MULTI_CS_MAX_USER_CTX]; + struct hl_clk_throttle clk_throttling; + u32 *stream_master_qid_arr; atomic64_t dram_used_mem; u64 timeout_jiffies; @@ -2622,7 +2650,6 @@ struct hl_device { u32 high_pll; u32 soft_reset_cnt; u32 hard_reset_cnt; - u32 clk_throttling_reason; u16 id; u16 id_control; u16 cpu_pci_msb_addr; diff --git a/drivers/misc/habanalabs/common/habanalabs_ioctl.c b/drivers/misc/habanalabs/common/habanalabs_ioctl.c index 86c3257d9ae1..19726c6b642a 100644 --- a/drivers/misc/habanalabs/common/habanalabs_ioctl.c +++ b/drivers/misc/habanalabs/common/habanalabs_ioctl.c @@ -313,15 +313,38 @@ static int pci_counters_info(struct hl_fpriv *hpriv, struct hl_info_args *args) static int clk_throttle_info(struct hl_fpriv *hpriv, struct hl_info_args *args) { + void __user *out = (void __user *) (uintptr_t) args->return_pointer; struct hl_device *hdev = hpriv->hdev; struct hl_info_clk_throttle clk_throttle = {0}; + ktime_t end_time, zero_time = ktime_set(0, 0); u32 max_size = args->return_size; - void __user *out = (void __user *) (uintptr_t) args->return_pointer; + int i; if ((!max_size) || (!out)) return -EINVAL; - clk_throttle.clk_throttling_reason = hdev->clk_throttling_reason; + mutex_lock(&hdev->clk_throttling.lock); + + clk_throttle.clk_throttling_reason = hdev->clk_throttling.current_reason; + + for (i = 0 ; i < HL_CLK_THROTTLE_TYPE_MAX ; i++) { + if (!(hdev->clk_throttling.aggregated_reason & BIT(i))) + continue; + + clk_throttle.clk_throttling_timestamp_us[i] = + ktime_to_us(hdev->clk_throttling.timestamp[i].start); + + if (ktime_compare(hdev->clk_throttling.timestamp[i].end, zero_time)) + end_time = hdev->clk_throttling.timestamp[i].end; + else + end_time = ktime_get(); /* 'end' is zeroed on start: still throttling */ + + clk_throttle.clk_throttling_duration_ns[i] = + ktime_to_ns(ktime_sub(end_time, +
hdev->clk_throttling.timestamp[i].start)); + + } + mutex_unlock(&hdev->clk_throttling.lock); return copy_to_user(out, &clk_throttle, min((size_t) max_size, sizeof(clk_throttle))) ? -EFAULT : 0; diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c index 2724ab3747f2..b4814369062e 100644 --- a/drivers/misc/habanalabs/gaudi/gaudi.c +++ b/drivers/misc/habanalabs/gaudi/gaudi.c @@ -7925,27 +7925,39 @@ static int tpc_krn_event_to_tpc_id(u16 tpc_dec_event_type) static void gaudi_print_clk_change_info(struct hl_device *hdev, u16 event_type) { + ktime_t zero_time = ktime_set(0, 0); + + mutex_lock(&hdev->clk_throttling.lock); + switch (event_type) { case GAUDI_EVENT_FIX_POWER_ENV_S: - hdev->clk_throttling_reason |= HL_CLK_THROTTLE_POWER; + hdev->clk_throttling.current_reason |= HL_CLK_THROTTLE_POWER; + hdev->clk_throttling.aggregated_reason |= HL_CLK_THROTTLE_POWER; + hdev->clk_throttling.timestamp[HL_CLK_THROTTLE_TYPE_POWER].start = ktime_get(); + hdev->clk_throttling.timestamp[HL_CLK_THROTTLE_TYPE_POWER].end = zero_time; dev_info_ratelimited(hdev->dev, "Clock throttling due to power consumption\n"); break; case GAUDI_EVENT_FIX_POWER_ENV_E: - hdev->clk_throttling_reason &= ~HL_CLK_THROTTLE_POWER; + hdev->clk_throttling.current_reason &= ~HL_CLK_THROTTLE_POWER; + hdev->clk_throttling.timestamp[HL_CLK_THROTTLE_TYPE_POWER].end = ktime_get(); dev_info_ratelimited(hdev->dev, "Power envelop is safe, back to optimal clock\n"); break; case GAUDI_EVENT_FIX_THERMAL_ENV_S: - hdev->clk_throttling_reason |= HL_CLK_THROTTLE_THERMAL; + hdev->clk_throttling.current_reason |= HL_CLK_THROTTLE_THERMAL; + hdev->clk_throttling.aggregated_reason |= HL_CLK_THROTTLE_THERMAL; + hdev->clk_throttling.timestamp[HL_CLK_THROTTLE_TYPE_THERMAL].start = ktime_get(); + hdev->clk_throttling.timestamp[HL_CLK_THROTTLE_TYPE_THERMAL].end = zero_time; dev_info_ratelimited(hdev->dev, "Clock throttling due to overheating\n"); break; case GAUDI_EVENT_FIX_THERMAL_ENV_E: - 
hdev->clk_throttling_reason &= ~HL_CLK_THROTTLE_THERMAL; + hdev->clk_throttling.current_reason &= ~HL_CLK_THROTTLE_THERMAL; + hdev->clk_throttling.timestamp[HL_CLK_THROTTLE_TYPE_THERMAL].end = ktime_get(); dev_info_ratelimited(hdev->dev, "Thermal envelop is safe, back to optimal clock\n"); break; @@ -7955,6 +7967,8 @@ static void gaudi_print_clk_change_info(struct hl_device *hdev, event_type); break; } + + mutex_unlock(&hdev->clk_throttling.lock); } static void gaudi_handle_eqe(struct hl_device *hdev, diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c index 3bbcab7da25e..7b3683f2a6dc 100644 --- a/drivers/misc/habanalabs/goya/goya.c +++ b/drivers/misc/habanalabs/goya/goya.c @@ -4768,24 +4768,39 @@ static int goya_unmask_irq(struct hl_device *hdev, u16 event_type) static void goya_print_clk_change_info(struct hl_device *hdev, u16 event_type) { + ktime_t zero_time = ktime_set(0, 0); + + mutex_lock(&hdev->clk_throttling.lock); + switch (event_type) { case GOYA_ASYNC_EVENT_ID_FIX_POWER_ENV_S: - hdev->clk_throttling_reason |= HL_CLK_THROTTLE_POWER; + hdev->clk_throttling.current_reason |= HL_CLK_THROTTLE_POWER; + hdev->clk_throttling.aggregated_reason |= HL_CLK_THROTTLE_POWER; + hdev->clk_throttling.timestamp[HL_CLK_THROTTLE_TYPE_POWER].start = ktime_get(); + hdev->clk_throttling.timestamp[HL_CLK_THROTTLE_TYPE_POWER].end = zero_time; dev_info_ratelimited(hdev->dev, "Clock throttling due to power consumption\n"); break; + case GOYA_ASYNC_EVENT_ID_FIX_POWER_ENV_E: - hdev->clk_throttling_reason &= ~HL_CLK_THROTTLE_POWER; + hdev->clk_throttling.current_reason &= ~HL_CLK_THROTTLE_POWER; + hdev->clk_throttling.timestamp[HL_CLK_THROTTLE_TYPE_POWER].end = ktime_get(); dev_info_ratelimited(hdev->dev, "Power envelop is safe, back to optimal clock\n"); break; + case GOYA_ASYNC_EVENT_ID_FIX_THERMAL_ENV_S: - hdev->clk_throttling_reason |= HL_CLK_THROTTLE_THERMAL; + hdev->clk_throttling.current_reason |= HL_CLK_THROTTLE_THERMAL; + 
hdev->clk_throttling.aggregated_reason |= HL_CLK_THROTTLE_THERMAL; + hdev->clk_throttling.timestamp[HL_CLK_THROTTLE_TYPE_THERMAL].start = ktime_get(); + hdev->clk_throttling.timestamp[HL_CLK_THROTTLE_TYPE_THERMAL].end = zero_time; dev_info_ratelimited(hdev->dev, "Clock throttling due to overheating\n"); break; + case GOYA_ASYNC_EVENT_ID_FIX_THERMAL_ENV_E: - hdev->clk_throttling_reason &= ~HL_CLK_THROTTLE_THERMAL; + hdev->clk_throttling.current_reason &= ~HL_CLK_THROTTLE_THERMAL; + hdev->clk_throttling.timestamp[HL_CLK_THROTTLE_TYPE_THERMAL].end = ktime_get(); dev_info_ratelimited(hdev->dev, "Thermal envelop is safe, back to optimal clock\n"); break; @@ -4795,6 +4810,8 @@ static void goya_print_clk_change_info(struct hl_device *hdev, u16 event_type) event_type); break; } + + mutex_unlock(&hdev->clk_throttling.lock); } void goya_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entry) diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h index c5760acebdd1..257b9630773e 100644 --- a/include/uapi/misc/habanalabs.h +++ b/include/uapi/misc/habanalabs.h @@ -473,15 +473,27 @@ struct hl_info_pci_counters { __u64 replay_cnt; }; -#define HL_CLK_THROTTLE_POWER 0x1 -#define HL_CLK_THROTTLE_THERMAL 0x2 +enum hl_clk_throttling_type { + HL_CLK_THROTTLE_TYPE_POWER, + HL_CLK_THROTTLE_TYPE_THERMAL, + HL_CLK_THROTTLE_TYPE_MAX +}; + +/* clk_throttling_reason masks */ +#define HL_CLK_THROTTLE_POWER (1 << HL_CLK_THROTTLE_TYPE_POWER) +#define HL_CLK_THROTTLE_THERMAL (1 << HL_CLK_THROTTLE_TYPE_THERMAL) /** * struct hl_info_clk_throttle - clock throttling reason * @clk_throttling_reason: each bit represents a clk throttling reason + * @clk_throttling_timestamp_us: represents CPU timestamp in microseconds of the start-event + * @clk_throttling_duration_ns: the clock throttle time in nanosec */ struct hl_info_clk_throttle { __u32 clk_throttling_reason; + __u32 pad; + __u64 clk_throttling_timestamp_us[HL_CLK_THROTTLE_TYPE_MAX]; + __u64 
clk_throttling_duration_ns[HL_CLK_THROTTLE_TYPE_MAX]; }; /** -- cgit v1.2.3 From 792512459fb2a62a5ea08264a0cdfb7e46a391a9 Mon Sep 17 00:00:00 2001 From: farah kassabri Date: Wed, 3 Nov 2021 13:15:55 +0200 Subject: habanalabs/gaudi: Fix collective wait bug In Signaling-From-Graph case, the driver didn't set the hw_sob pointer at the right place, which is needed for the cs completion check prior to start sending all the master/slaves jobs to device. Signed-off-by: farah kassabri Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/gaudi/gaudi.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'drivers/misc') diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c index b4814369062e..a9e279bfebae 100644 --- a/drivers/misc/habanalabs/gaudi/gaudi.c +++ b/drivers/misc/habanalabs/gaudi/gaudi.c @@ -1276,6 +1276,7 @@ static int gaudi_collective_wait_init_cs(struct hl_cs *cs) container_of(cs->signal_fence, struct hl_cs_compl, base_fence); struct hl_cs_compl *cs_cmpl = container_of(cs->fence, struct hl_cs_compl, base_fence); + struct hl_cs_encaps_sig_handle *handle = cs->encaps_sig_hdl; struct gaudi_collective_properties *cprop; u32 stream, queue_id, sob_group_offset; struct gaudi_device *gaudi; @@ -1288,10 +1289,16 @@ static int gaudi_collective_wait_init_cs(struct hl_cs *cs) gaudi = hdev->asic_specific; cprop = &gaudi->collective_props; - /* In encaps signals case the SOB info will be retrieved from - * the handle in gaudi_collective_slave_init_job. - */ - if (!cs->encaps_signals) { + if (cs->encaps_signals) { + cs_cmpl->hw_sob = handle->hw_sob; + /* at this checkpoint we only need the hw_sob pointer + * for the completion check before start going over the jobs + * of the master/slaves, the sob_value will be taken later on + * in gaudi_collective_slave_init_job depends on each + * job wait offset value. 
+ */ + cs_cmpl->sob_val = 0; + } else { /* copy the SOB id and value of the signal CS */ cs_cmpl->hw_sob = signal_cs_cmpl->hw_sob; cs_cmpl->sob_val = signal_cs_cmpl->sob_val; -- cgit v1.2.3 From d4194f21400e9b2caef2d48c63ec5ef102eead22 Mon Sep 17 00:00:00 2001 From: Bharat Jauhari Date: Wed, 8 Sep 2021 17:32:54 +0300 Subject: habanalabs: refactor wait-for-user-interrupt function Refactor the wait-for-user-interrupt routine to make it more generic for re-use for other user exposed h/w interfaces in future ASICs. Signed-off-by: Bharat Jauhari Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- .../misc/habanalabs/common/command_submission.c | 22 ++++++++-------------- 1 file changed, 8 insertions(+), 14 deletions(-) (limited to 'drivers/misc') diff --git a/drivers/misc/habanalabs/common/command_submission.c b/drivers/misc/habanalabs/common/command_submission.c index 54a5425a77a0..e97b21988dea 100644 --- a/drivers/misc/habanalabs/common/command_submission.c +++ b/drivers/misc/habanalabs/common/command_submission.c @@ -2782,12 +2782,12 @@ static inline unsigned long hl_usecs64_to_jiffies(const u64 usecs) static int _hl_interrupt_wait_ioctl(struct hl_device *hdev, struct hl_ctx *ctx, u64 timeout_us, u64 user_address, - u64 target_value, u16 interrupt_offset, + u64 target_value, struct hl_user_interrupt *interrupt, + u32 *status, u64 *timestamp) { struct hl_user_pending_interrupt *pend; - struct hl_user_interrupt *interrupt; unsigned long timeout, flags; u64 completion_value; long completion_rc; @@ -2805,11 +2805,6 @@ static int _hl_interrupt_wait_ioctl(struct hl_device *hdev, struct hl_ctx *ctx, hl_fence_init(&pend->fence, ULONG_MAX); - if (interrupt_offset == HL_COMMON_USER_INTERRUPT_ID) - interrupt = &hdev->common_user_interrupt; - else - interrupt = &hdev->user_interrupt[interrupt_offset]; - /* Add pending user interrupt to relevant list for the interrupt * handler to monitor */ @@ -2898,9 +2893,10 @@ remove_pending_user_interrupt: static int 
hl_interrupt_wait_ioctl(struct hl_fpriv *hpriv, void *data) { - u16 interrupt_id, interrupt_offset, first_interrupt, last_interrupt; + u16 interrupt_id, first_interrupt, last_interrupt; struct hl_device *hdev = hpriv->hdev; struct asic_fixed_properties *prop; + struct hl_user_interrupt *interrupt; union hl_wait_cs_args *args = data; u32 status = HL_WAIT_CS_STATUS_BUSY; u64 timestamp; @@ -2913,8 +2909,7 @@ static int hl_interrupt_wait_ioctl(struct hl_fpriv *hpriv, void *data) return -EPERM; } - interrupt_id = - FIELD_GET(HL_WAIT_CS_FLAGS_INTERRUPT_MASK, args->in.flags); + interrupt_id = FIELD_GET(HL_WAIT_CS_FLAGS_INTERRUPT_MASK, args->in.flags); first_interrupt = prop->first_available_user_msix_interrupt; last_interrupt = prop->first_available_user_msix_interrupt + @@ -2927,15 +2922,14 @@ static int hl_interrupt_wait_ioctl(struct hl_fpriv *hpriv, void *data) } if (interrupt_id == HL_COMMON_USER_INTERRUPT_ID) - interrupt_offset = HL_COMMON_USER_INTERRUPT_ID; + interrupt = &hdev->common_user_interrupt; else - interrupt_offset = interrupt_id - first_interrupt; + interrupt = &hdev->user_interrupt[interrupt_id - first_interrupt]; rc = _hl_interrupt_wait_ioctl(hdev, hpriv->ctx, args->in.interrupt_timeout_us, args->in.addr, - args->in.target, interrupt_offset, &status, + args->in.target, interrupt, &status, &timestamp); - if (rc) { if (rc != -EINTR) dev_err_ratelimited(hdev->dev, -- cgit v1.2.3 From 49c052dad691ba1a3dc3559b74e99f2ec2fa0319 Mon Sep 17 00:00:00 2001 From: farah kassabri Date: Sun, 24 Oct 2021 19:02:32 +0300 Subject: habanalabs: add new opcodes for INFO IOCTL Add implementation for new opcodes in the INFO IOCTL: 1. Retrieve the replaced DRAM rows from f/w. 2. Retrieve the pending DRAM rows from f/w.
Signed-off-by: farah kassabri Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/common/firmware_if.c | 66 +++++++++++++++++++++++ drivers/misc/habanalabs/common/habanalabs.h | 3 ++ drivers/misc/habanalabs/common/habanalabs_ioctl.c | 43 +++++++++++++++ drivers/misc/habanalabs/include/common/cpucp_if.h | 33 +++++++++++- include/uapi/misc/habanalabs.h | 4 ++ 5 files changed, 148 insertions(+), 1 deletion(-) (limited to 'drivers/misc') diff --git a/drivers/misc/habanalabs/common/firmware_if.c b/drivers/misc/habanalabs/common/firmware_if.c index 9addcfba6a8b..70e992bdbde7 100644 --- a/drivers/misc/habanalabs/common/firmware_if.c +++ b/drivers/misc/habanalabs/common/firmware_if.c @@ -972,6 +972,72 @@ int hl_fw_cpucp_power_get(struct hl_device *hdev, u64 *power) return rc; } +int hl_fw_dram_replaced_row_get(struct hl_device *hdev, + struct cpucp_hbm_row_info *info) +{ + struct cpucp_hbm_row_info *cpucp_repl_rows_info_cpu_addr; + dma_addr_t cpucp_repl_rows_info_dma_addr; + struct cpucp_packet pkt = {}; + u64 result; + int rc; + + cpucp_repl_rows_info_cpu_addr = + hdev->asic_funcs->cpu_accessible_dma_pool_alloc(hdev, + sizeof(struct cpucp_hbm_row_info), + &cpucp_repl_rows_info_dma_addr); + if (!cpucp_repl_rows_info_cpu_addr) { + dev_err(hdev->dev, + "Failed to allocate DMA memory for CPU-CP replaced rows info packet\n"); + return -ENOMEM; + } + + memset(cpucp_repl_rows_info_cpu_addr, 0, sizeof(struct cpucp_hbm_row_info)); + + pkt.ctl = cpu_to_le32(CPUCP_PACKET_HBM_REPLACED_ROWS_INFO_GET << + CPUCP_PKT_CTL_OPCODE_SHIFT); + pkt.addr = cpu_to_le64(cpucp_repl_rows_info_dma_addr); + pkt.data_max_size = cpu_to_le32(sizeof(struct cpucp_hbm_row_info)); + + rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt), + HL_CPUCP_INFO_TIMEOUT_USEC, &result); + if (rc) { + dev_err(hdev->dev, + "Failed to handle CPU-CP replaced rows info pkt, error %d\n", rc); + goto out; + } + + memcpy(info, cpucp_repl_rows_info_cpu_addr, sizeof(*info)); + 
+out: + hdev->asic_funcs->cpu_accessible_dma_pool_free(hdev, + sizeof(struct cpucp_hbm_row_info), + cpucp_repl_rows_info_cpu_addr); + + return rc; +} + +int hl_fw_dram_pending_row_get(struct hl_device *hdev, u32 *pend_rows_num) +{ + struct cpucp_packet pkt; + u64 result; + int rc; + + memset(&pkt, 0, sizeof(pkt)); + + pkt.ctl = cpu_to_le32(CPUCP_PACKET_HBM_PENDING_ROWS_STATUS << CPUCP_PKT_CTL_OPCODE_SHIFT); + + rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt), 0, &result); + if (rc) { + dev_err(hdev->dev, + "Failed to handle CPU-CP pending rows info pkt, error %d\n", rc); + goto out; + } + + *pend_rows_num = (u32) result; +out: + return rc; +} + void hl_fw_ask_hard_reset_without_linux(struct hl_device *hdev) { struct static_fw_load_mgr *static_loader = diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h index fc201537f7a9..a19563c416ac 100644 --- a/drivers/misc/habanalabs/common/habanalabs.h +++ b/drivers/misc/habanalabs/common/habanalabs.h @@ -3012,6 +3012,9 @@ int hl_fw_dynamic_send_protocol_cmd(struct hl_device *hdev, struct fw_load_mgr *fw_loader, enum comms_cmd cmd, unsigned int size, bool wait_ok, u32 timeout); +int hl_fw_dram_replaced_row_get(struct hl_device *hdev, + struct cpucp_hbm_row_info *info); +int hl_fw_dram_pending_row_get(struct hl_device *hdev, u32 *pend_rows_num); int hl_pci_bars_map(struct hl_device *hdev, const char * const name[3], bool is_wc[3]); int hl_pci_elbi_read(struct hl_device *hdev, u64 addr, u32 *data); diff --git a/drivers/misc/habanalabs/common/habanalabs_ioctl.c b/drivers/misc/habanalabs/common/habanalabs_ioctl.c index 19726c6b642a..68c655acdec8 100644 --- a/drivers/misc/habanalabs/common/habanalabs_ioctl.c +++ b/drivers/misc/habanalabs/common/habanalabs_ioctl.c @@ -503,6 +503,43 @@ static int open_stats_info(struct hl_fpriv *hpriv, struct hl_info_args *args) min((size_t) max_size, sizeof(open_stats_info))) ? 
-EFAULT : 0; } +static int dram_pending_rows_info(struct hl_fpriv *hpriv, struct hl_info_args *args) +{ + struct hl_device *hdev = hpriv->hdev; + u32 max_size = args->return_size; + u32 pend_rows_num = 0; + void __user *out = (void __user *) (uintptr_t) args->return_pointer; + int rc; + + if ((!max_size) || (!out)) + return -EINVAL; + + rc = hl_fw_dram_pending_row_get(hdev, &pend_rows_num); + if (rc) + return rc; + + return copy_to_user(out, &pend_rows_num, + min_t(size_t, max_size, sizeof(pend_rows_num))) ? -EFAULT : 0; +} + +static int dram_replaced_rows_info(struct hl_fpriv *hpriv, struct hl_info_args *args) +{ + struct hl_device *hdev = hpriv->hdev; + u32 max_size = args->return_size; + struct cpucp_hbm_row_info info = {0}; + void __user *out = (void __user *) (uintptr_t) args->return_pointer; + int rc; + + if ((!max_size) || (!out)) + return -EINVAL; + + rc = hl_fw_dram_replaced_row_get(hdev, &info); + if (rc) + return rc; + + return copy_to_user(out, &info, min_t(size_t, max_size, sizeof(info))) ? -EFAULT : 0; +} + static int _hl_info_ioctl(struct hl_fpriv *hpriv, void *data, struct device *dev) { @@ -589,6 +626,12 @@ static int _hl_info_ioctl(struct hl_fpriv *hpriv, void *data, case HL_INFO_OPEN_STATS: return open_stats_info(hpriv, args); + case HL_INFO_DRAM_REPLACED_ROWS: + return dram_replaced_rows_info(hpriv, args); + + case HL_INFO_DRAM_PENDING_ROWS: + return dram_pending_rows_info(hpriv, args); + default: dev_err(dev, "Invalid request %d\n", args->op); rc = -ENOTTY; diff --git a/drivers/misc/habanalabs/include/common/cpucp_if.h b/drivers/misc/habanalabs/include/common/cpucp_if.h index 17927968e19a..5e19c763f3f0 100644 --- a/drivers/misc/habanalabs/include/common/cpucp_if.h +++ b/drivers/misc/habanalabs/include/common/cpucp_if.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 * - * Copyright 2020 HabanaLabs, Ltd. + * Copyright 2021 HabanaLabs, Ltd. * All Rights Reserved. 
* */ @@ -377,6 +377,13 @@ enum pq_init_status { * a different engine or QMAN according to enum cpucp_idle_mask. * The bit will be 1 if the engine is NOT idle. * + * CPUCP_PACKET_HBM_REPLACED_ROWS_INFO_GET - + * Fetch all HBM replaced-rows and prending to be replaced rows data. + * + * CPUCP_PACKET_HBM_PENDING_ROWS_STATUS - + * Fetch status of HBM rows pending replacement and need a reboot to + * be replaced. + * * CPUCP_PACKET_POWER_SET - * Resets power history of device to 0 */ @@ -424,6 +431,8 @@ enum cpucp_packet_id { CPUCP_PACKET_NIC_STAT_REGS_CLR, /* internal */ CPUCP_PACKET_NIC_STAT_REGS_ALL_GET, /* internal */ CPUCP_PACKET_IS_IDLE_CHECK, /* internal */ + CPUCP_PACKET_HBM_REPLACED_ROWS_INFO_GET,/* internal */ + CPUCP_PACKET_HBM_PENDING_ROWS_STATUS, /* internal */ CPUCP_PACKET_POWER_SET, /* internal */ }; @@ -692,6 +701,7 @@ struct eq_generic_event { #define CPUCP_MAX_NIC_LANES (CPUCP_MAX_NICS * CPUCP_LANES_PER_NIC) #define CPUCP_NIC_MASK_ARR_LEN ((CPUCP_MAX_NICS + 63) / 64) #define CPUCP_NIC_POLARITY_ARR_LEN ((CPUCP_MAX_NIC_LANES + 63) / 64) +#define CPUCP_HBM_ROW_REPLACE_MAX 32 struct cpucp_sensor { __le32 type; @@ -837,4 +847,25 @@ struct cpucp_nic_status { __le32 high_ber_cnt; }; +enum cpucp_hbm_row_replace_cause { + REPLACE_CAUSE_DOUBLE_ECC_ERR, + REPLACE_CAUSE_MULTI_SINGLE_ECC_ERR, +}; + +struct cpucp_hbm_row_info { + __u8 hbm_idx; + __u8 pc; + __u8 sid; + __u8 bank_idx; + __le16 row_addr; + __u8 replaced_row_cause; /* enum cpucp_hbm_row_replace_cause */ + __u8 pad; +}; + +struct cpucp_hbm_row_replaced_rows_info { + __le16 num_replaced_rows; + __u8 pad[6]; + struct cpucp_hbm_row_info replaced_rows[CPUCP_HBM_ROW_REPLACE_MAX]; +}; + #endif /* CPUCP_IF_H */ diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h index 257b9630773e..9b4d72897061 100644 --- a/include/uapi/misc/habanalabs.h +++ b/include/uapi/misc/habanalabs.h @@ -334,6 +334,8 @@ enum hl_server_type { * HL_INFO_TOTAL_ENERGY - Retrieve total energy consumption * 
HL_INFO_PLL_FREQUENCY - Retrieve PLL frequency * HL_INFO_OPEN_STATS - Retrieve info regarding recent device open calls + * HL_INFO_DRAM_REPLACED_ROWS - Retrieve DRAM replaced rows info + * HL_INFO_DRAM_PENDING_ROWS - Retrieve DRAM pending rows num */ #define HL_INFO_HW_IP_INFO 0 #define HL_INFO_HW_EVENTS 1 @@ -353,6 +355,8 @@ enum hl_server_type { #define HL_INFO_PLL_FREQUENCY 16 #define HL_INFO_POWER 17 #define HL_INFO_OPEN_STATS 18 +#define HL_INFO_DRAM_REPLACED_ROWS 21 +#define HL_INFO_DRAM_PENDING_ROWS 22 #define HL_INFO_VERSION_MAX_LEN 128 #define HL_INFO_CARD_NAME_MAX_LEN 16 -- cgit v1.2.3 From e617f5f4c144c3f185da67292dff09dc6cbb3296 Mon Sep 17 00:00:00 2001 From: Oded Gabbay Date: Thu, 4 Nov 2021 09:48:22 +0200 Subject: habanalabs: make hdev creation code more readable Divide the code into 3 different parts: - Copy kernel parameters - Setting device behavior per asic - Fixup of various device parameters according to the device behavior. In addition, remove non-relevant code for upstream (simulator support). Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/common/habanalabs.h | 6 +- drivers/misc/habanalabs/common/habanalabs_drv.c | 123 ++++++++++++------------ 2 files changed, 61 insertions(+), 68 deletions(-) (limited to 'drivers/misc') diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h index a19563c416ac..6b33fbd72fd8 100644 --- a/drivers/misc/habanalabs/common/habanalabs.h +++ b/drivers/misc/habanalabs/common/habanalabs.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 * - * Copyright 2016-2019 HabanaLabs, Ltd. + * Copyright 2016-2021 HabanaLabs, Ltd. * All Rights Reserved. 
* */ @@ -62,7 +62,6 @@ #define HL_CPUCP_EEPROM_TIMEOUT_USEC 10000000 /* 10s */ #define HL_FW_STATUS_POLL_INTERVAL_USEC 10000 /* 10ms */ -#define HL_FW_STATUS_PLDM_POLL_INTERVAL_USEC 300000000 /* 300s */ #define HL_PCI_ELBI_TIMEOUT_MSEC 10 /* 10ms */ @@ -2823,9 +2822,6 @@ bool hl_device_operational(struct hl_device *hdev, enum hl_device_status *status); enum hl_device_status hl_device_status(struct hl_device *hdev); int hl_device_set_debug_mode(struct hl_device *hdev, bool enable); -int create_hdev(struct hl_device **dev, struct pci_dev *pdev, - enum hl_asic_type asic_type, int minor); -void destroy_hdev(struct hl_device *hdev); int hl_hw_queues_create(struct hl_device *hdev); void hl_hw_queues_destroy(struct hl_device *hdev); int hl_hw_queue_send_cb_no_cmpl(struct hl_device *hdev, u32 hw_queue_id, diff --git a/drivers/misc/habanalabs/common/habanalabs_drv.c b/drivers/misc/habanalabs/common/habanalabs_drv.c index 5989826701bc..85034f2f2e89 100644 --- a/drivers/misc/habanalabs/common/habanalabs_drv.c +++ b/drivers/misc/habanalabs/common/habanalabs_drv.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Copyright 2016-2019 HabanaLabs, Ltd. + * Copyright 2016-2021 HabanaLabs, Ltd. * All Rights Reserved. 
* */ @@ -263,6 +263,7 @@ out_err: static void set_driver_behavior_per_device(struct hl_device *hdev) { + hdev->pldm = 0; hdev->fw_components = FW_TYPE_ALL_TYPES; hdev->cpu_queues_enable = 1; hdev->heartbeat = 1; @@ -279,23 +280,53 @@ static void set_driver_behavior_per_device(struct hl_device *hdev) hdev->axi_drain = 0; } -/* +static void copy_kernel_module_params_to_device(struct hl_device *hdev) +{ + hdev->major = hl_major; + hdev->memory_scrub = memory_scrub; + hdev->reset_on_lockup = reset_on_lockup; + hdev->boot_error_status_mask = boot_error_status_mask; + + if (timeout_locked) + hdev->timeout_jiffies = msecs_to_jiffies(timeout_locked * 1000); + else + hdev->timeout_jiffies = MAX_SCHEDULE_TIMEOUT; + +} + +static int fixup_device_params(struct hl_device *hdev) +{ + hdev->asic_prop.fw_security_enabled = is_asic_secured(hdev->asic_type); + + hdev->fw_poll_interval_usec = HL_FW_STATUS_POLL_INTERVAL_USEC; + + hdev->stop_on_err = true; + hdev->curr_reset_cause = HL_RESET_CAUSE_UNKNOWN; + hdev->prev_reset_trigger = HL_RESET_TRIGGER_DEFAULT; + + /* Enable only after the initialization of the device */ + hdev->disabled = true; + + /* Set default DMA mask to 32 bits */ + hdev->dma_mask = 32; + + return 0; +} + +/** * create_hdev - create habanalabs device instance * * @dev: will hold the pointer to the new habanalabs device structure * @pdev: pointer to the pci device - * @asic_type: in case of simulator device, which device is it - * @minor: in case of simulator device, the minor of the device * * Allocate memory for habanalabs device and initialize basic fields * Identify the ASIC type * Allocate ID (minor) for the device (only for real devices) */ -int create_hdev(struct hl_device **dev, struct pci_dev *pdev, - enum hl_asic_type asic_type, int minor) +static int create_hdev(struct hl_device **dev, struct pci_dev *pdev) { + int main_id, ctrl_id = 0, rc = 0; struct hl_device *hdev; - int rc, main_id, ctrl_id = 0; *dev = NULL; @@ -303,72 +334,39 @@ int 
create_hdev(struct hl_device **dev, struct pci_dev *pdev, if (!hdev) return -ENOMEM; - /* First, we must find out which ASIC are we handling. This is needed - * to configure the behavior of the driver (kernel parameters) - */ - if (pdev) { - hdev->asic_type = get_asic_type(pdev->device); - if (hdev->asic_type == ASIC_INVALID) { - dev_err(&pdev->dev, "Unsupported ASIC\n"); - rc = -ENODEV; - goto free_hdev; - } - } else { - hdev->asic_type = asic_type; - } - - if (pdev) - hdev->asic_prop.fw_security_enabled = - is_asic_secured(hdev->asic_type); - else - hdev->asic_prop.fw_security_enabled = false; + /* can be NULL in case of simulator device */ + hdev->pdev = pdev; /* Assign status description string */ - strncpy(hdev->status[HL_DEVICE_STATUS_OPERATIONAL], - "operational", HL_STR_MAX); - strncpy(hdev->status[HL_DEVICE_STATUS_IN_RESET], - "in reset", HL_STR_MAX); - strncpy(hdev->status[HL_DEVICE_STATUS_MALFUNCTION], - "disabled", HL_STR_MAX); - strncpy(hdev->status[HL_DEVICE_STATUS_NEEDS_RESET], - "needs reset", HL_STR_MAX); + strncpy(hdev->status[HL_DEVICE_STATUS_OPERATIONAL], "operational", HL_STR_MAX); + strncpy(hdev->status[HL_DEVICE_STATUS_IN_RESET], "in reset", HL_STR_MAX); + strncpy(hdev->status[HL_DEVICE_STATUS_MALFUNCTION], "disabled", HL_STR_MAX); + strncpy(hdev->status[HL_DEVICE_STATUS_NEEDS_RESET], "needs reset", HL_STR_MAX); strncpy(hdev->status[HL_DEVICE_STATUS_IN_DEVICE_CREATION], "in device creation", HL_STR_MAX); - hdev->major = hl_major; - hdev->reset_on_lockup = reset_on_lockup; - hdev->memory_scrub = memory_scrub; - hdev->boot_error_status_mask = boot_error_status_mask; - hdev->stop_on_err = true; + /* First, we must find out which ASIC are we handling. 
This is needed + * to configure the behavior of the driver (kernel parameters) + */ + hdev->asic_type = get_asic_type(pdev->device); + if (hdev->asic_type == ASIC_INVALID) { + dev_err(&pdev->dev, "Unsupported ASIC\n"); + rc = -ENODEV; + goto free_hdev; + } - hdev->pldm = 0; + copy_kernel_module_params_to_device(hdev); set_driver_behavior_per_device(hdev); - hdev->fw_poll_interval_usec = hdev->pldm ? HL_FW_STATUS_PLDM_POLL_INTERVAL_USEC : - HL_FW_STATUS_POLL_INTERVAL_USEC; - - hdev->curr_reset_cause = HL_RESET_CAUSE_UNKNOWN; - hdev->prev_reset_trigger = HL_RESET_TRIGGER_DEFAULT; - - if (timeout_locked) - hdev->timeout_jiffies = msecs_to_jiffies(timeout_locked * 1000); - else - hdev->timeout_jiffies = MAX_SCHEDULE_TIMEOUT; - - hdev->disabled = true; - hdev->pdev = pdev; /* can be NULL in case of simulator device */ - - /* Set default DMA mask to 32 bits */ - hdev->dma_mask = 32; + fixup_device_params(hdev); mutex_lock(&hl_devs_idr_lock); /* Always save 2 numbers, 1 for main device and 1 for control. 
* They must be consecutive */ - main_id = idr_alloc(&hl_devs_idr, hdev, 0, HL_MAX_MINORS, - GFP_KERNEL); + main_id = idr_alloc(&hl_devs_idr, hdev, 0, HL_MAX_MINORS, GFP_KERNEL); if (main_id >= 0) ctrl_id = idr_alloc(&hl_devs_idr, hdev, main_id + 1, @@ -408,7 +406,7 @@ free_hdev: * @dev: pointer to the habanalabs device structure * */ -void destroy_hdev(struct hl_device *hdev) +static void destroy_hdev(struct hl_device *hdev) { /* Remove device from the device list */ mutex_lock(&hl_devs_idr_lock); @@ -447,7 +445,7 @@ static int hl_pmops_resume(struct device *dev) return hl_device_resume(hdev); } -/* +/** * hl_pci_probe - probe PCI habanalabs devices * * @pdev: pointer to pci device @@ -457,8 +455,7 @@ static int hl_pmops_resume(struct device *dev) * Create a new habanalabs device and initialize it according to the * device's type */ -static int hl_pci_probe(struct pci_dev *pdev, - const struct pci_device_id *id) +static int hl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) { struct hl_device *hdev; int rc; @@ -467,7 +464,7 @@ static int hl_pci_probe(struct pci_dev *pdev, " device found [%04x:%04x] (rev %x)\n", (int)pdev->vendor, (int)pdev->device, (int)pdev->revision); - rc = create_hdev(&hdev, pdev, ASIC_INVALID, -1); + rc = create_hdev(&hdev, pdev); if (rc) return rc; -- cgit v1.2.3 From 3eb7754ff43827294bebcb2760969e9dc2283027 Mon Sep 17 00:00:00 2001 From: Ofir Bitton Date: Tue, 12 Oct 2021 20:52:46 +0300 Subject: habanalabs: debugfs support for larger I2C transactions I2C debugfs support is limited to 1 byte. We extend functionality to more than 1 byte by using one of the pad fields as a length. No backward compatibility issues as new F/W versions will treat 0 length as a 1 byte length transaction. 
Signed-off-by: Ofir Bitton Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- .../ABI/testing/debugfs-driver-habanalabs | 7 +++ drivers/misc/habanalabs/common/debugfs.c | 50 ++++++++++++++-------- drivers/misc/habanalabs/common/habanalabs.h | 2 + drivers/misc/habanalabs/include/common/cpucp_if.h | 9 +++- 4 files changed, 50 insertions(+), 18 deletions(-) (limited to 'drivers/misc') diff --git a/Documentation/ABI/testing/debugfs-driver-habanalabs b/Documentation/ABI/testing/debugfs-driver-habanalabs index 63c46d9d538f..6085ee506135 100644 --- a/Documentation/ABI/testing/debugfs-driver-habanalabs +++ b/Documentation/ABI/testing/debugfs-driver-habanalabs @@ -155,6 +155,13 @@ Description: Triggers an I2C transaction that is generated by the device's CPU. Writing to this file generates a write transaction while reading from the file generates a read transaction +What: /sys/kernel/debug/habanalabs/hl/i2c_len +Date: Dec 2021 +KernelVersion: 5.17 +Contact: obitton@habana.ai +Description: Sets I2C length in bytes for I2C transaction that is generated by + the device's CPU + What: /sys/kernel/debug/habanalabs/hl/i2c_reg Date: Jan 2019 KernelVersion: 5.1 diff --git a/drivers/misc/habanalabs/common/debugfs.c b/drivers/misc/habanalabs/common/debugfs.c index a239c5679f95..9727d82b121f 100644 --- a/drivers/misc/habanalabs/common/debugfs.c +++ b/drivers/misc/habanalabs/common/debugfs.c @@ -15,19 +15,25 @@ #define MMU_ADDR_BUF_SIZE 40 #define MMU_ASID_BUF_SIZE 10 #define MMU_KBUF_SIZE (MMU_ADDR_BUF_SIZE + MMU_ASID_BUF_SIZE) +#define I2C_MAX_TRANSACTION_LEN 8 static struct dentry *hl_debug_root; static int hl_debugfs_i2c_read(struct hl_device *hdev, u8 i2c_bus, u8 i2c_addr, - u8 i2c_reg, long *val) + u8 i2c_reg, u8 i2c_len, u64 *val) { struct cpucp_packet pkt; - u64 result; int rc; if (!hl_device_operational(hdev, NULL)) return -EBUSY; + if (i2c_len > I2C_MAX_TRANSACTION_LEN) { + dev_err(hdev->dev, "I2C transaction length %u, exceeds maximum of %u\n", + i2c_len, 
I2C_MAX_TRANSACTION_LEN); + return -EINVAL; + } + memset(&pkt, 0, sizeof(pkt)); pkt.ctl = cpu_to_le32(CPUCP_PACKET_I2C_RD << @@ -35,12 +41,10 @@ static int hl_debugfs_i2c_read(struct hl_device *hdev, u8 i2c_bus, u8 i2c_addr, pkt.i2c_bus = i2c_bus; pkt.i2c_addr = i2c_addr; pkt.i2c_reg = i2c_reg; + pkt.i2c_len = i2c_len; rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt), - 0, &result); - - *val = (long) result; - + 0, val); if (rc) dev_err(hdev->dev, "Failed to read from I2C, error %d\n", rc); @@ -48,7 +52,7 @@ static int hl_debugfs_i2c_read(struct hl_device *hdev, u8 i2c_bus, u8 i2c_addr, } static int hl_debugfs_i2c_write(struct hl_device *hdev, u8 i2c_bus, u8 i2c_addr, - u8 i2c_reg, u32 val) + u8 i2c_reg, u8 i2c_len, u64 val) { struct cpucp_packet pkt; int rc; @@ -56,6 +60,12 @@ static int hl_debugfs_i2c_write(struct hl_device *hdev, u8 i2c_bus, u8 i2c_addr, if (!hl_device_operational(hdev, NULL)) return -EBUSY; + if (i2c_len > I2C_MAX_TRANSACTION_LEN) { + dev_err(hdev->dev, "I2C transaction length %u, exceeds maximum of %u\n", + i2c_len, I2C_MAX_TRANSACTION_LEN); + return -EINVAL; + } + memset(&pkt, 0, sizeof(pkt)); pkt.ctl = cpu_to_le32(CPUCP_PACKET_I2C_WR << @@ -63,6 +73,7 @@ static int hl_debugfs_i2c_write(struct hl_device *hdev, u8 i2c_bus, u8 i2c_addr, pkt.i2c_bus = i2c_bus; pkt.i2c_addr = i2c_addr; pkt.i2c_reg = i2c_reg; + pkt.i2c_len = i2c_len; pkt.value = cpu_to_le64(val); rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt), @@ -899,22 +910,22 @@ static ssize_t hl_i2c_data_read(struct file *f, char __user *buf, struct hl_dbg_device_entry *entry = file_inode(f)->i_private; struct hl_device *hdev = entry->hdev; char tmp_buf[32]; - long val; + u64 val; ssize_t rc; if (*ppos) return 0; rc = hl_debugfs_i2c_read(hdev, entry->i2c_bus, entry->i2c_addr, - entry->i2c_reg, &val); + entry->i2c_reg, entry->i2c_len, &val); if (rc) { dev_err(hdev->dev, - "Failed to read from I2C bus %d, addr %d, reg %d\n", - entry->i2c_bus, 
entry->i2c_addr, entry->i2c_reg); + "Failed to read from I2C bus %d, addr %d, reg %d, len %d\n", + entry->i2c_bus, entry->i2c_addr, entry->i2c_reg, entry->i2c_len); return rc; } - sprintf(tmp_buf, "0x%02lx\n", val); + sprintf(tmp_buf, "%#02llx\n", val); rc = simple_read_from_buffer(buf, count, ppos, tmp_buf, strlen(tmp_buf)); @@ -926,19 +937,19 @@ static ssize_t hl_i2c_data_write(struct file *f, const char __user *buf, { struct hl_dbg_device_entry *entry = file_inode(f)->i_private; struct hl_device *hdev = entry->hdev; - u32 value; + u64 value; ssize_t rc; - rc = kstrtouint_from_user(buf, count, 16, &value); + rc = kstrtou64_from_user(buf, count, 16, &value); if (rc) return rc; rc = hl_debugfs_i2c_write(hdev, entry->i2c_bus, entry->i2c_addr, - entry->i2c_reg, value); + entry->i2c_reg, entry->i2c_len, value); if (rc) { dev_err(hdev->dev, - "Failed to write 0x%02x to I2C bus %d, addr %d, reg %d\n", - value, entry->i2c_bus, entry->i2c_addr, entry->i2c_reg); + "Failed to write %#02llx to I2C bus %d, addr %d, reg %d, len %d\n", + value, entry->i2c_bus, entry->i2c_addr, entry->i2c_reg, entry->i2c_len); return rc; } @@ -1421,6 +1432,11 @@ void hl_debugfs_add_device(struct hl_device *hdev) dev_entry->root, &dev_entry->i2c_reg); + debugfs_create_u8("i2c_len", + 0644, + dev_entry->root, + &dev_entry->i2c_len); + debugfs_create_file("i2c_data", 0644, dev_entry->root, diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h index 6b33fbd72fd8..9aa144d2fe40 100644 --- a/drivers/misc/habanalabs/common/habanalabs.h +++ b/drivers/misc/habanalabs/common/habanalabs.h @@ -1889,6 +1889,7 @@ struct hl_debugfs_entry { * @i2c_bus: generic u8 debugfs file for bus value to use in i2c_data_read. * @i2c_addr: generic u8 debugfs file for address value to use in i2c_data_read. * @i2c_reg: generic u8 debugfs file for register value to use in i2c_data_read. + * @i2c_len: generic u8 debugfs file for length value to use in i2c_data_read. 
*/ struct hl_dbg_device_entry { struct dentry *root; @@ -1917,6 +1918,7 @@ struct hl_dbg_device_entry { u8 i2c_bus; u8 i2c_addr; u8 i2c_reg; + u8 i2c_len; }; /** diff --git a/drivers/misc/habanalabs/include/common/cpucp_if.h b/drivers/misc/habanalabs/include/common/cpucp_if.h index 5e19c763f3f0..078fb4bd0316 100644 --- a/drivers/misc/habanalabs/include/common/cpucp_if.h +++ b/drivers/misc/habanalabs/include/common/cpucp_if.h @@ -493,7 +493,14 @@ struct cpucp_packet { __u8 i2c_bus; __u8 i2c_addr; __u8 i2c_reg; - __u8 pad; /* unused */ + /* + * In legacy implementations, i2c_len was not present, + * was unused and just added as pad. + * So if i2c_len is 0, it is treated as legacy + * and r/w 1 Byte, else if i2c_len is specified, + * it's treated as new multibyte r/w support. + */ + __u8 i2c_len; }; struct {/* For PLL info fetch */ -- cgit v1.2.3 From e2637fdca70aa5357b26c57e44fcec0ed673eb22 Mon Sep 17 00:00:00 2001 From: Ofir Bitton Date: Wed, 10 Nov 2021 11:41:43 +0200 Subject: habanalabs: handle device TPM boot error as warning As TPM error indication is not fatal, driver should dump a warning and continue booting. 
Signed-off-by: Ofir Bitton Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/common/firmware_if.c | 9 +++++++++ drivers/misc/habanalabs/include/common/hl_boot_if.h | 4 ++++ 2 files changed, 13 insertions(+) (limited to 'drivers/misc') diff --git a/drivers/misc/habanalabs/common/firmware_if.c b/drivers/misc/habanalabs/common/firmware_if.c index 70e992bdbde7..aea5904332fd 100644 --- a/drivers/misc/habanalabs/common/firmware_if.c +++ b/drivers/misc/habanalabs/common/firmware_if.c @@ -529,6 +529,15 @@ static bool fw_report_boot_dev0(struct hl_device *hdev, u32 err_val, err_exists = true; } + if (err_val & CPU_BOOT_ERR0_TPM_FAIL) { + dev_warn(hdev->dev, + "Device boot warning - TPM failure\n"); + /* This is a warning so we don't want it to disable the + * device + */ + err_val &= ~CPU_BOOT_ERR0_TPM_FAIL; + } + /* return error only if it's in the predefined mask */ if (err_exists && ((err_val & ~CPU_BOOT_ERR0_ENABLED) & lower_32_bits(hdev->boot_error_status_mask))) diff --git a/drivers/misc/habanalabs/include/common/hl_boot_if.h b/drivers/misc/habanalabs/include/common/hl_boot_if.h index 2626df6ef3ef..135e21d6edc9 100644 --- a/drivers/misc/habanalabs/include/common/hl_boot_if.h +++ b/drivers/misc/habanalabs/include/common/hl_boot_if.h @@ -32,6 +32,7 @@ enum cpu_boot_err { CPU_BOOT_ERR_DEVICE_UNUSABLE_FAIL = 13, CPU_BOOT_ERR_BOOT_FW_CRIT_ERR = 18, CPU_BOOT_ERR_BINNING_FAIL = 19, + CPU_BOOT_ERR_TPM_FAIL = 20, CPU_BOOT_ERR_ENABLED = 31, CPU_BOOT_ERR_SCND_EN = 63, CPU_BOOT_ERR_LAST = 64 /* we have 2 registers of 32 bits */ @@ -108,6 +109,8 @@ enum cpu_boot_err { * malfunctioning components might still be * in use. * + * CPU_BOOT_ERR0_TPM_FAIL TPM verification flow failed. + * * CPU_BOOT_ERR0_ENABLED Error registers enabled. 
* This is a main indication that the * running FW populates the error @@ -130,6 +133,7 @@ enum cpu_boot_err { #define CPU_BOOT_ERR0_DEVICE_UNUSABLE_FAIL (1 << CPU_BOOT_ERR_DEVICE_UNUSABLE_FAIL) #define CPU_BOOT_ERR0_BOOT_FW_CRIT_ERR (1 << CPU_BOOT_ERR_BOOT_FW_CRIT_ERR) #define CPU_BOOT_ERR0_BINNING_FAIL (1 << CPU_BOOT_ERR_BINNING_FAIL) +#define CPU_BOOT_ERR0_TPM_FAIL (1 << CPU_BOOT_ERR_TPM_FAIL) #define CPU_BOOT_ERR0_ENABLED (1 << CPU_BOOT_ERR_ENABLED) #define CPU_BOOT_ERR1_ENABLED (1 << CPU_BOOT_ERR_ENABLED) -- cgit v1.2.3 From 3e55b5dbf929a40966b8eb7d4de94fad3bb404bd Mon Sep 17 00:00:00 2001 From: Dani Liberman Date: Wed, 3 Nov 2021 10:09:59 +0200 Subject: habanalabs: add support for fetching historic errors A new uAPI is added for debug purposes of the user-space to retrieve error-related data from the previous session (before device reset was performed). Information is filled when a razwi or CS timeout happens and can contain one of the following: 1. Retrieve timestamp of last time the device was opened and razwi or CS timeout happened. 2. Retrieve information about last CS timeout. 3. Retrieve information about last razwi error. This information doesn't contain user data, so no danger of data leakage between users. 
Signed-off-by: Dani Liberman Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- .../misc/habanalabs/common/command_submission.c | 8 + drivers/misc/habanalabs/common/habanalabs.h | 37 +++++ drivers/misc/habanalabs/common/habanalabs_drv.c | 4 + drivers/misc/habanalabs/common/habanalabs_ioctl.c | 60 ++++++++ drivers/misc/habanalabs/gaudi/gaudi.c | 167 +++++++++++++++------ include/uapi/misc/habanalabs.h | 58 ++++++- 6 files changed, 290 insertions(+), 44 deletions(-) (limited to 'drivers/misc') diff --git a/drivers/misc/habanalabs/common/command_submission.c b/drivers/misc/habanalabs/common/command_submission.c index e97b21988dea..c1fd4ba14c60 100644 --- a/drivers/misc/habanalabs/common/command_submission.c +++ b/drivers/misc/habanalabs/common/command_submission.c @@ -733,6 +733,14 @@ static void cs_timedout(struct work_struct *work) hdev = cs->ctx->hdev; + /* Save only the first CS timeout parameters */ + rc = atomic_cmpxchg(&hdev->last_error.cs_write_disable, 0, 1); + if (!rc) { + hdev->last_error.open_dev_timestamp = hdev->last_successful_open_ktime; + hdev->last_error.cs_timeout_timestamp = ktime_get(); + hdev->last_error.cs_timeout_seq = cs->sequence; + } + switch (cs->type) { case CS_TYPE_SIGNAL: dev_err(hdev->dev, diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h index 9aa144d2fe40..612a9f461b38 100644 --- a/drivers/misc/habanalabs/common/habanalabs.h +++ b/drivers/misc/habanalabs/common/habanalabs.h @@ -2405,6 +2405,40 @@ struct hl_clk_throttle { u32 aggregated_reason; }; +/** + * struct last_error_session_info - info about last session in which CS timeout or + * razwi error occurred. + * @open_dev_timestamp: device open timestamp. + * @cs_timeout_timestamp: CS timeout timestamp. + * @razwi_timestamp: razwi timestamp. + * @cs_write_disable: if set writing to CS parameters in the structure is disabled so the + * first (root cause) CS timeout will not be overwritten. 
+ * @razwi_write_disable: if set writing to razwi parameters in the structure is disabled so the + * first (root cause) razwi will not be overwritten. + * @cs_timeout_seq: CS timeout sequence number. + * @razwi_addr: address that caused razwi. + * @razwi_engine_id_1: engine id of the razwi initiator, if it was initiated by engine that does + * not have engine id it will be set to U16_MAX. + * @razwi_engine_id_2: second engine id of razwi initiator. Might happen that razwi have 2 possible + * engines which one them caused the razwi. In that case, it will contain the + * second possible engine id, otherwise it will be set to U16_MAX. + * @razwi_non_engine_initiator: in case the initiator of the razwi does not have engine id. + * @razwi_type: cause of razwi, page fault or access error, otherwise it will be set to U8_MAX. + */ +struct last_error_session_info { + ktime_t open_dev_timestamp; + ktime_t cs_timeout_timestamp; + ktime_t razwi_timestamp; + atomic_t cs_write_disable; + atomic_t razwi_write_disable; + u64 cs_timeout_seq; + u64 razwi_addr; + u16 razwi_engine_id_1; + u16 razwi_engine_id_2; + u8 razwi_non_engine_initiator; + u8 razwi_type; +}; + /** * struct hl_device - habanalabs device structure. * @pdev: pointer to PCI device, can be NULL in case of simulator device. @@ -2488,6 +2522,7 @@ struct hl_clk_throttle { * device initialization. Mainly used to debug and * workaround firmware bugs * @dram_pci_bar_start: start bus address of PCIe bar towards DRAM. + * @last_successful_open_ktime: timestamp (ktime) of the last successful device open. * @last_successful_open_jif: timestamp (jiffies) of the last successful * device open. 
* @last_open_session_duration_jif: duration (jiffies) of the last device open @@ -2632,6 +2667,7 @@ struct hl_device { struct multi_cs_completion multi_cs_completion[ MULTI_CS_MAX_USER_CTX]; struct hl_clk_throttle clk_throttling; + struct last_error_session_info last_error; u32 *stream_master_qid_arr; atomic64_t dram_used_mem; @@ -2645,6 +2681,7 @@ struct hl_device { u64 open_counter; u64 fw_poll_interval_usec; atomic_t in_reset; + ktime_t last_successful_open_ktime; enum hl_pll_frequency curr_pll_profile; enum cpucp_card_types card_type; u32 major; diff --git a/drivers/misc/habanalabs/common/habanalabs_drv.c b/drivers/misc/habanalabs/common/habanalabs_drv.c index 85034f2f2e89..1070c80d739c 100644 --- a/drivers/misc/habanalabs/common/habanalabs_drv.c +++ b/drivers/misc/habanalabs/common/habanalabs_drv.c @@ -187,8 +187,12 @@ int hl_device_open(struct inode *inode, struct file *filp) hl_debugfs_add_file(hpriv); + atomic_set(&hdev->last_error.cs_write_disable, 0); + atomic_set(&hdev->last_error.razwi_write_disable, 0); + hdev->open_counter++; hdev->last_successful_open_jif = jiffies; + hdev->last_successful_open_ktime = ktime_get(); return 0; diff --git a/drivers/misc/habanalabs/common/habanalabs_ioctl.c b/drivers/misc/habanalabs/common/habanalabs_ioctl.c index 68c655acdec8..360a1e9bbd5d 100644 --- a/drivers/misc/habanalabs/common/habanalabs_ioctl.c +++ b/drivers/misc/habanalabs/common/habanalabs_ioctl.c @@ -540,6 +540,57 @@ static int dram_replaced_rows_info(struct hl_fpriv *hpriv, struct hl_info_args * return copy_to_user(out, &info, min_t(size_t, max_size, sizeof(info))) ? 
-EFAULT : 0; } +static int last_err_open_dev_info(struct hl_fpriv *hpriv, struct hl_info_args *args) +{ + struct hl_info_last_err_open_dev_time info = {0}; + struct hl_device *hdev = hpriv->hdev; + u32 max_size = args->return_size; + void __user *out = (void __user *) (uintptr_t) args->return_pointer; + + if ((!max_size) || (!out)) + return -EINVAL; + + info.timestamp = ktime_to_ns(hdev->last_error.open_dev_timestamp); + + return copy_to_user(out, &info, min_t(size_t, max_size, sizeof(info))) ? -EFAULT : 0; +} + +static int cs_timeout_info(struct hl_fpriv *hpriv, struct hl_info_args *args) +{ + struct hl_info_cs_timeout_event info = {0}; + struct hl_device *hdev = hpriv->hdev; + u32 max_size = args->return_size; + void __user *out = (void __user *) (uintptr_t) args->return_pointer; + + if ((!max_size) || (!out)) + return -EINVAL; + + info.seq = hdev->last_error.cs_timeout_seq; + info.timestamp = ktime_to_ns(hdev->last_error.cs_timeout_timestamp); + + return copy_to_user(out, &info, min_t(size_t, max_size, sizeof(info))) ? -EFAULT : 0; +} + +static int razwi_info(struct hl_fpriv *hpriv, struct hl_info_args *args) +{ + struct hl_device *hdev = hpriv->hdev; + u32 max_size = args->return_size; + struct hl_info_razwi_event info = {0}; + void __user *out = (void __user *) (uintptr_t) args->return_pointer; + + if ((!max_size) || (!out)) + return -EINVAL; + + info.timestamp = ktime_to_ns(hdev->last_error.razwi_timestamp); + info.addr = hdev->last_error.razwi_addr; + info.engine_id_1 = hdev->last_error.razwi_engine_id_1; + info.engine_id_2 = hdev->last_error.razwi_engine_id_2; + info.no_engine_id = hdev->last_error.razwi_non_engine_initiator; + info.error_type = hdev->last_error.razwi_type; + + return copy_to_user(out, &info, min_t(size_t, max_size, sizeof(info))) ? 
-EFAULT : 0; +} + static int _hl_info_ioctl(struct hl_fpriv *hpriv, void *data, struct device *dev) { @@ -632,6 +683,15 @@ static int _hl_info_ioctl(struct hl_fpriv *hpriv, void *data, case HL_INFO_DRAM_PENDING_ROWS: return dram_pending_rows_info(hpriv, args); + case HL_INFO_LAST_ERR_OPEN_DEV_TIME: + return last_err_open_dev_info(hpriv, args); + + case HL_INFO_CS_TIMEOUT_EVENT: + return cs_timeout_info(hpriv, args); + + case HL_INFO_RAZWI_EVENT: + return razwi_info(hpriv, args); + default: dev_err(dev, "Invalid request %d\n", args->op); rc = -ENOTTY; diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c index a9e279bfebae..aed55db368d7 100644 --- a/drivers/misc/habanalabs/gaudi/gaudi.c +++ b/drivers/misc/habanalabs/gaudi/gaudi.c @@ -6970,8 +6970,9 @@ event_not_supported: snprintf(desc, size, "N/A"); } -static const char *gaudi_get_razwi_initiator_dma_name(struct hl_device *hdev, - u32 x_y, bool is_write) +static const char *gaudi_get_razwi_initiator_dma_name(struct hl_device *hdev, u32 x_y, + bool is_write, s32 *engine_id_1, + s32 *engine_id_2) { u32 dma_id[2], dma_offset, err_cause[2], mask, i; @@ -7011,44 +7012,64 @@ static const char *gaudi_get_razwi_initiator_dma_name(struct hl_device *hdev, switch (x_y) { case RAZWI_INITIATOR_ID_X_Y_DMA_IF_W_S_0: case RAZWI_INITIATOR_ID_X_Y_DMA_IF_W_S_1: - if ((err_cause[0] & mask) && !(err_cause[1] & mask)) + if ((err_cause[0] & mask) && !(err_cause[1] & mask)) { + *engine_id_1 = GAUDI_ENGINE_ID_DMA_0; return "DMA0"; - else if (!(err_cause[0] & mask) && (err_cause[1] & mask)) + } else if (!(err_cause[0] & mask) && (err_cause[1] & mask)) { + *engine_id_1 = GAUDI_ENGINE_ID_DMA_2; return "DMA2"; - else + } else { + *engine_id_1 = GAUDI_ENGINE_ID_DMA_0; + *engine_id_2 = GAUDI_ENGINE_ID_DMA_2; return "DMA0 or DMA2"; + } case RAZWI_INITIATOR_ID_X_Y_DMA_IF_E_S_0: case RAZWI_INITIATOR_ID_X_Y_DMA_IF_E_S_1: - if ((err_cause[0] & mask) && !(err_cause[1] & mask)) + if ((err_cause[0] & mask) && 
!(err_cause[1] & mask)) { + *engine_id_1 = GAUDI_ENGINE_ID_DMA_1; return "DMA1"; - else if (!(err_cause[0] & mask) && (err_cause[1] & mask)) + } else if (!(err_cause[0] & mask) && (err_cause[1] & mask)) { + *engine_id_1 = GAUDI_ENGINE_ID_DMA_3; return "DMA3"; - else + } else { + *engine_id_1 = GAUDI_ENGINE_ID_DMA_1; + *engine_id_2 = GAUDI_ENGINE_ID_DMA_3; return "DMA1 or DMA3"; + } case RAZWI_INITIATOR_ID_X_Y_DMA_IF_W_N_0: case RAZWI_INITIATOR_ID_X_Y_DMA_IF_W_N_1: - if ((err_cause[0] & mask) && !(err_cause[1] & mask)) + if ((err_cause[0] & mask) && !(err_cause[1] & mask)) { + *engine_id_1 = GAUDI_ENGINE_ID_DMA_4; return "DMA4"; - else if (!(err_cause[0] & mask) && (err_cause[1] & mask)) + } else if (!(err_cause[0] & mask) && (err_cause[1] & mask)) { + *engine_id_1 = GAUDI_ENGINE_ID_DMA_6; return "DMA6"; - else + } else { + *engine_id_1 = GAUDI_ENGINE_ID_DMA_4; + *engine_id_2 = GAUDI_ENGINE_ID_DMA_6; return "DMA4 or DMA6"; + } case RAZWI_INITIATOR_ID_X_Y_DMA_IF_E_N_0: case RAZWI_INITIATOR_ID_X_Y_DMA_IF_E_N_1: - if ((err_cause[0] & mask) && !(err_cause[1] & mask)) + if ((err_cause[0] & mask) && !(err_cause[1] & mask)) { + *engine_id_1 = GAUDI_ENGINE_ID_DMA_5; return "DMA5"; - else if (!(err_cause[0] & mask) && (err_cause[1] & mask)) + } else if (!(err_cause[0] & mask) && (err_cause[1] & mask)) { + *engine_id_1 = GAUDI_ENGINE_ID_DMA_7; return "DMA7"; - else + } else { + *engine_id_1 = GAUDI_ENGINE_ID_DMA_5; + *engine_id_2 = GAUDI_ENGINE_ID_DMA_7; return "DMA5 or DMA7"; + } } unknown_initiator: return "unknown initiator"; } -static const char *gaudi_get_razwi_initiator_name(struct hl_device *hdev, - bool is_write) +static const char *gaudi_get_razwi_initiator_name(struct hl_device *hdev, bool is_write, + u32 *engine_id_1, u32 *engine_id_2) { u32 val, x_y, axi_id; @@ -7061,24 +7082,35 @@ static const char *gaudi_get_razwi_initiator_name(struct hl_device *hdev, switch (x_y) { case RAZWI_INITIATOR_ID_X_Y_TPC0_NIC0: - if (axi_id == RAZWI_INITIATOR_ID_AXI_ID(AXI_ID_TPC)) + 
if (axi_id == RAZWI_INITIATOR_ID_AXI_ID(AXI_ID_TPC)) { + *engine_id_1 = GAUDI_ENGINE_ID_TPC_0; return "TPC0"; - if (axi_id == RAZWI_INITIATOR_ID_AXI_ID(AXI_ID_NIC)) + } + if (axi_id == RAZWI_INITIATOR_ID_AXI_ID(AXI_ID_NIC)) { + *engine_id_1 = GAUDI_ENGINE_ID_NIC_0; return "NIC0"; + } break; case RAZWI_INITIATOR_ID_X_Y_TPC1: + *engine_id_1 = GAUDI_ENGINE_ID_TPC_1; return "TPC1"; case RAZWI_INITIATOR_ID_X_Y_MME0_0: case RAZWI_INITIATOR_ID_X_Y_MME0_1: + *engine_id_1 = GAUDI_ENGINE_ID_MME_0; return "MME0"; case RAZWI_INITIATOR_ID_X_Y_MME1_0: case RAZWI_INITIATOR_ID_X_Y_MME1_1: + *engine_id_1 = GAUDI_ENGINE_ID_MME_1; return "MME1"; case RAZWI_INITIATOR_ID_X_Y_TPC2: + *engine_id_1 = GAUDI_ENGINE_ID_TPC_2; return "TPC2"; case RAZWI_INITIATOR_ID_X_Y_TPC3_PCI_CPU_PSOC: - if (axi_id == RAZWI_INITIATOR_ID_AXI_ID(AXI_ID_TPC)) + if (axi_id == RAZWI_INITIATOR_ID_AXI_ID(AXI_ID_TPC)) { + *engine_id_1 = GAUDI_ENGINE_ID_TPC_3; return "TPC3"; + } + /* PCI, CPU or PSOC does not have engine id*/ if (axi_id == RAZWI_INITIATOR_ID_AXI_ID(AXI_ID_PCI)) return "PCI"; if (axi_id == RAZWI_INITIATOR_ID_AXI_ID(AXI_ID_CPU)) @@ -7094,32 +7126,49 @@ static const char *gaudi_get_razwi_initiator_name(struct hl_device *hdev, case RAZWI_INITIATOR_ID_X_Y_DMA_IF_W_N_1: case RAZWI_INITIATOR_ID_X_Y_DMA_IF_E_N_0: case RAZWI_INITIATOR_ID_X_Y_DMA_IF_E_N_1: - return gaudi_get_razwi_initiator_dma_name(hdev, x_y, is_write); + return gaudi_get_razwi_initiator_dma_name(hdev, x_y, is_write, + engine_id_1, engine_id_2); case RAZWI_INITIATOR_ID_X_Y_TPC4_NIC1_NIC2: - if (axi_id == RAZWI_INITIATOR_ID_AXI_ID(AXI_ID_TPC)) + if (axi_id == RAZWI_INITIATOR_ID_AXI_ID(AXI_ID_TPC)) { + *engine_id_1 = GAUDI_ENGINE_ID_TPC_4; return "TPC4"; - if (axi_id == RAZWI_INITIATOR_ID_AXI_ID(AXI_ID_NIC)) + } + if (axi_id == RAZWI_INITIATOR_ID_AXI_ID(AXI_ID_NIC)) { + *engine_id_1 = GAUDI_ENGINE_ID_NIC_1; return "NIC1"; - if (axi_id == RAZWI_INITIATOR_ID_AXI_ID(AXI_ID_NIC_FT)) + } + if (axi_id == RAZWI_INITIATOR_ID_AXI_ID(AXI_ID_NIC_FT)) { + 
*engine_id_1 = GAUDI_ENGINE_ID_NIC_2; return "NIC2"; + } break; case RAZWI_INITIATOR_ID_X_Y_TPC5: + *engine_id_1 = GAUDI_ENGINE_ID_TPC_5; return "TPC5"; case RAZWI_INITIATOR_ID_X_Y_MME2_0: case RAZWI_INITIATOR_ID_X_Y_MME2_1: + *engine_id_1 = GAUDI_ENGINE_ID_MME_2; return "MME2"; case RAZWI_INITIATOR_ID_X_Y_MME3_0: case RAZWI_INITIATOR_ID_X_Y_MME3_1: + *engine_id_1 = GAUDI_ENGINE_ID_MME_3; return "MME3"; case RAZWI_INITIATOR_ID_X_Y_TPC6: + *engine_id_1 = GAUDI_ENGINE_ID_TPC_6; return "TPC6"; case RAZWI_INITIATOR_ID_X_Y_TPC7_NIC4_NIC5: - if (axi_id == RAZWI_INITIATOR_ID_AXI_ID(AXI_ID_TPC)) + if (axi_id == RAZWI_INITIATOR_ID_AXI_ID(AXI_ID_TPC)) { + *engine_id_1 = GAUDI_ENGINE_ID_TPC_7; return "TPC7"; - if (axi_id == RAZWI_INITIATOR_ID_AXI_ID(AXI_ID_NIC)) + } + if (axi_id == RAZWI_INITIATOR_ID_AXI_ID(AXI_ID_NIC)) { + *engine_id_1 = GAUDI_ENGINE_ID_NIC_4; return "NIC4"; - if (axi_id == RAZWI_INITIATOR_ID_AXI_ID(AXI_ID_NIC_FT)) + } + if (axi_id == RAZWI_INITIATOR_ID_AXI_ID(AXI_ID_NIC_FT)) { + *engine_id_1 = GAUDI_ENGINE_ID_NIC_5; return "NIC5"; + } break; default: break; @@ -7136,27 +7185,28 @@ static const char *gaudi_get_razwi_initiator_name(struct hl_device *hdev, return "unknown initiator"; } -static void gaudi_print_razwi_info(struct hl_device *hdev) +static void gaudi_print_and_get_razwi_info(struct hl_device *hdev, u32 *engine_id_1, + u32 *engine_id_2) { + if (RREG32(mmMMU_UP_RAZWI_WRITE_VLD)) { dev_err_ratelimited(hdev->dev, "RAZWI event caused by illegal write of %s\n", - gaudi_get_razwi_initiator_name(hdev, true)); + gaudi_get_razwi_initiator_name(hdev, true, engine_id_1, engine_id_2)); WREG32(mmMMU_UP_RAZWI_WRITE_VLD, 0); } if (RREG32(mmMMU_UP_RAZWI_READ_VLD)) { dev_err_ratelimited(hdev->dev, "RAZWI event caused by illegal read of %s\n", - gaudi_get_razwi_initiator_name(hdev, false)); + gaudi_get_razwi_initiator_name(hdev, false, engine_id_1, engine_id_2)); WREG32(mmMMU_UP_RAZWI_READ_VLD, 0); } } -static void gaudi_print_mmu_error_info(struct hl_device *hdev) 
+static void gaudi_print_and_get_mmu_error_info(struct hl_device *hdev, u64 *addr, u8 *type) { struct gaudi_device *gaudi = hdev->asic_specific; - u64 addr; u32 val; if (!(gaudi->hw_cap_initialized & HW_CAP_MMU)) @@ -7164,24 +7214,24 @@ static void gaudi_print_mmu_error_info(struct hl_device *hdev) val = RREG32(mmMMU_UP_PAGE_ERROR_CAPTURE); if (val & MMU_UP_PAGE_ERROR_CAPTURE_ENTRY_VALID_MASK) { - addr = val & MMU_UP_PAGE_ERROR_CAPTURE_VA_49_32_MASK; - addr <<= 32; - addr |= RREG32(mmMMU_UP_PAGE_ERROR_CAPTURE_VA); + *addr = val & MMU_UP_PAGE_ERROR_CAPTURE_VA_49_32_MASK; + *addr <<= 32; + *addr |= RREG32(mmMMU_UP_PAGE_ERROR_CAPTURE_VA); - dev_err_ratelimited(hdev->dev, "MMU page fault on va 0x%llx\n", - addr); + dev_err_ratelimited(hdev->dev, "MMU page fault on va 0x%llx\n", *addr); + *type = HL_RAZWI_PAGE_FAULT; WREG32(mmMMU_UP_PAGE_ERROR_CAPTURE, 0); } val = RREG32(mmMMU_UP_ACCESS_ERROR_CAPTURE); if (val & MMU_UP_ACCESS_ERROR_CAPTURE_ENTRY_VALID_MASK) { - addr = val & MMU_UP_ACCESS_ERROR_CAPTURE_VA_49_32_MASK; - addr <<= 32; - addr |= RREG32(mmMMU_UP_ACCESS_ERROR_CAPTURE_VA); + *addr = val & MMU_UP_ACCESS_ERROR_CAPTURE_VA_49_32_MASK; + *addr <<= 32; + *addr |= RREG32(mmMMU_UP_ACCESS_ERROR_CAPTURE_VA); - dev_err_ratelimited(hdev->dev, - "MMU access error on va 0x%llx\n", addr); + dev_err_ratelimited(hdev->dev, "MMU access error on va 0x%llx\n", *addr); + *type = HL_RAZWI_MMU_ACCESS_ERROR; WREG32(mmMMU_UP_ACCESS_ERROR_CAPTURE, 0); } @@ -7700,15 +7750,46 @@ static void gaudi_handle_qman_err(struct hl_device *hdev, u16 event_type) static void gaudi_print_irq_info(struct hl_device *hdev, u16 event_type, bool razwi) { + u32 engine_id_1, engine_id_2; char desc[64] = ""; + u64 razwi_addr = 0; + u8 razwi_type; + int rc; + + /* + * Init engine id by default as not valid and only if razwi initiated from engine with + * engine id it will get valid value. 
+ * Init razwi type to default, will be changed only if razwi caused by page fault of + * MMU access error + */ + engine_id_1 = U16_MAX; + engine_id_2 = U16_MAX; + razwi_type = U8_MAX; gaudi_get_event_desc(event_type, desc, sizeof(desc)); dev_err_ratelimited(hdev->dev, "Received H/W interrupt %d [\"%s\"]\n", event_type, desc); if (razwi) { - gaudi_print_razwi_info(hdev); - gaudi_print_mmu_error_info(hdev); + gaudi_print_and_get_razwi_info(hdev, &engine_id_1, &engine_id_2); + gaudi_print_and_get_mmu_error_info(hdev, &razwi_addr, &razwi_type); + + /* In case it's the first razwi, save its parameters*/ + rc = atomic_cmpxchg(&hdev->last_error.razwi_write_disable, 0, 1); + if (!rc) { + hdev->last_error.open_dev_timestamp = hdev->last_successful_open_ktime; + hdev->last_error.razwi_timestamp = ktime_get(); + hdev->last_error.razwi_addr = razwi_addr; + hdev->last_error.razwi_engine_id_1 = engine_id_1; + hdev->last_error.razwi_engine_id_2 = engine_id_2; + /* + * If first engine id holds non valid value the razwi initiator + * does not have engine id + */ + hdev->last_error.razwi_non_engine_initiator = (engine_id_1 == U16_MAX); + hdev->last_error.razwi_type = razwi_type; + + } } } diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h index 9b4d72897061..eb8565fdae70 100644 --- a/include/uapi/misc/habanalabs.h +++ b/include/uapi/misc/habanalabs.h @@ -336,6 +336,14 @@ enum hl_server_type { * HL_INFO_OPEN_STATS - Retrieve info regarding recent device open calls * HL_INFO_DRAM_REPLACED_ROWS - Retrieve DRAM replaced rows info * HL_INFO_DRAM_PENDING_ROWS - Retrieve DRAM pending rows num + * HL_INFO_LAST_ERR_OPEN_DEV_TIME - Retrieve timestamp of the last time the device was opened + * and CS timeout or razwi error occurred. + * HL_INFO_CS_TIMEOUT_EVENT - Retrieve CS timeout timestamp and its related CS sequence number. + * HL_INFO_RAZWI_EVENT - Retrieve parameters of razwi: + * Timestamp of razwi. + * The address which accessing it caused the razwi. 
+ * Razwi initiator. + * Razwi cause, was it a page fault or MMU access error. */ #define HL_INFO_HW_IP_INFO 0 #define HL_INFO_HW_EVENTS 1 @@ -357,8 +365,11 @@ enum hl_server_type { #define HL_INFO_OPEN_STATS 18 #define HL_INFO_DRAM_REPLACED_ROWS 21 #define HL_INFO_DRAM_PENDING_ROWS 22 +#define HL_INFO_LAST_ERR_OPEN_DEV_TIME 23 +#define HL_INFO_CS_TIMEOUT_EVENT 24 +#define HL_INFO_RAZWI_EVENT 25 -#define HL_INFO_VERSION_MAX_LEN 128 +#define HL_INFO_VERSION_MAX_LEN 128 #define HL_INFO_CARD_NAME_MAX_LEN 16 /** @@ -575,6 +586,51 @@ struct hl_info_cs_counters { __u64 ctx_validation_drop_cnt; }; +/** + * struct hl_info_last_err_open_dev_time - last error boot information. + * @timestamp: timestamp of last time the device was opened and error occurred. + */ +struct hl_info_last_err_open_dev_time { + __s64 timestamp; +}; + +/** + * struct hl_info_cs_timeout_event - last CS timeout information. + * @timestamp: timestamp when last CS timeout event occurred. + * @seq: sequence number of last CS timeout event. + */ +struct hl_info_cs_timeout_event { + __s64 timestamp; + __u64 seq; +}; + +#define HL_RAZWI_PAGE_FAULT 0 +#define HL_RAZWI_MMU_ACCESS_ERROR 1 + +/** + * struct hl_info_razwi_event - razwi information. + * @timestamp: timestamp of razwi. + * @addr: address which accessing it caused razwi. + * @engine_id_1: engine id of the razwi initiator, if it was initiated by engine that does not + * have engine id it will be set to U16_MAX. + * @engine_id_2: second engine id of razwi initiator. Might happen that razwi have 2 possible + * engines which one them caused the razwi. In that case, it will contain the + * second possible engine id, otherwise it will be set to U16_MAX. + * @no_engine_id: if razwi initiator does not have engine id, this field will be set to 1, + * otherwise 0. + * @error_type: cause of razwi, page fault or access error, otherwise it will be set to U8_MAX. + * @pad: padding to 64 bit. 
+ */ +struct hl_info_razwi_event { + __s64 timestamp; + __u64 addr; + __u16 engine_id_1; + __u16 engine_id_2; + __u8 no_engine_id; + __u8 error_type; + __u8 pad[2]; +}; + enum gaudi_dcores { HL_GAUDI_WS_DCORE, HL_GAUDI_WN_DCORE, -- cgit v1.2.3 From fe8d70873c4919086d5929c49e1c6cd6bb7d1de3 Mon Sep 17 00:00:00 2001 From: Oded Gabbay Date: Sat, 13 Nov 2021 17:58:43 +0200 Subject: habanalabs: prevent false heartbeat message If a device reset has started, there is a chance that the heartbeat function will fail because the device is disabled at the beginning of the reset function. In that case, we don't want the error message to appear in the log. Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/common/device.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'drivers/misc') diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c index 0da5a55490ff..ca74d7815a67 100644 --- a/drivers/misc/habanalabs/common/device.c +++ b/drivers/misc/habanalabs/common/device.c @@ -543,7 +543,9 @@ static void hl_device_heartbeat(struct work_struct *work) if (!hdev->asic_funcs->send_heartbeat(hdev)) goto reschedule; - dev_err(hdev->dev, "Device heartbeat failed!\n"); + if (hl_device_operational(hdev, NULL)) + dev_err(hdev->dev, "Device heartbeat failed!\n"); + hl_device_reset(hdev, HL_DRV_RESET_HARD | HL_DRV_RESET_HEARTBEAT); return; -- cgit v1.2.3 From 6f61e47a68b403f8aa7956b4b6502511fcf19bb7 Mon Sep 17 00:00:00 2001 From: Ohad Sharabi Date: Sun, 14 Nov 2021 09:37:33 +0200 Subject: habanalabs: skip PLL freq fetch Getting the used PLL index with which to send the CPUCP packet relies on the CPUCP info packet. In case CPU queues are not enabled getting the PLL index will issue an error and in some ASICs will also fail the driver load.
Signed-off-by: Ohad Sharabi Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/gaudi/gaudi.c | 5 +++++ drivers/misc/habanalabs/goya/goya.c | 5 +++++ 2 files changed, 10 insertions(+) (limited to 'drivers/misc') diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c index aed55db368d7..465540d064b6 100644 --- a/drivers/misc/habanalabs/gaudi/gaudi.c +++ b/drivers/misc/habanalabs/gaudi/gaudi.c @@ -881,6 +881,11 @@ static int gaudi_fetch_psoc_frequency(struct hl_device *hdev) int rc; if (hdev->asic_prop.fw_security_enabled) { + struct gaudi_device *gaudi = hdev->asic_specific; + + if (!(gaudi->hw_cap_initialized & HW_CAP_CPU_Q)) + return 0; + rc = hl_fw_cpucp_pll_info_get(hdev, HL_GAUDI_CPU_PLL, pll_freq_arr); if (rc) diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c index 7b3683f2a6dc..2347de2f426a 100644 --- a/drivers/misc/habanalabs/goya/goya.c +++ b/drivers/misc/habanalabs/goya/goya.c @@ -739,6 +739,11 @@ static void goya_fetch_psoc_frequency(struct hl_device *hdev) int rc; if (hdev->asic_prop.fw_security_enabled) { + struct goya_device *goya = hdev->asic_specific; + + if (!(goya->hw_cap_initialized & HW_CAP_CPU_Q)) + return; + rc = hl_fw_cpucp_pll_info_get(hdev, HL_GOYA_PCI_PLL, pll_freq_arr); -- cgit v1.2.3 From a1b838adb080ee4320f257a8280821e47bfb9a1f Mon Sep 17 00:00:00 2001 From: Ofir Bitton Date: Tue, 9 Nov 2021 13:12:38 +0200 Subject: habanalabs: fix possible deadlock in cache invl failure Currently there is a deadlock in driver in scenarios where MMU cache invalidation fails. The issue is basically device reset being performed without releasing the MMU mutex. The solution is to skip device reset as it is not necessary. In addition we introduce a slight code refactor that prints the invalidation error from a single location. 
Signed-off-by: Ofir Bitton Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/common/command_buffer.c | 9 ++++----- drivers/misc/habanalabs/common/habanalabs.h | 3 +++ drivers/misc/habanalabs/common/memory.c | 25 +++++++------------------ drivers/misc/habanalabs/common/mmu/mmu.c | 25 +++++++++++++++++++++++++ drivers/misc/habanalabs/gaudi/gaudi.c | 6 ------ drivers/misc/habanalabs/goya/goya.c | 6 ------ 6 files changed, 39 insertions(+), 35 deletions(-) (limited to 'drivers/misc') diff --git a/drivers/misc/habanalabs/common/command_buffer.c b/drivers/misc/habanalabs/common/command_buffer.c index 71910f7809bd..c591f0487272 100644 --- a/drivers/misc/habanalabs/common/command_buffer.c +++ b/drivers/misc/habanalabs/common/command_buffer.c @@ -80,14 +80,13 @@ static int cb_map_mem(struct hl_ctx *ctx, struct hl_cb *cb) offset += va_block->size; } - hdev->asic_funcs->mmu_invalidate_cache(hdev, false, - MMU_OP_USERPTR | MMU_OP_SKIP_LOW_CACHE_INV); + rc = hl_mmu_invalidate_cache(hdev, false, MMU_OP_USERPTR | MMU_OP_SKIP_LOW_CACHE_INV); mutex_unlock(&ctx->mmu_lock); cb->is_mmu_mapped = true; - return 0; + return rc; err_va_umap: list_for_each_entry(va_block, &cb->va_block_list, node) { @@ -98,7 +97,7 @@ err_va_umap: offset -= va_block->size; } - hdev->asic_funcs->mmu_invalidate_cache(hdev, true, MMU_OP_USERPTR); + rc = hl_mmu_invalidate_cache(hdev, true, MMU_OP_USERPTR); mutex_unlock(&ctx->mmu_lock); @@ -127,7 +126,7 @@ static void cb_unmap_mem(struct hl_ctx *ctx, struct hl_cb *cb) "Failed to unmap CB's va 0x%llx\n", va_block->start); - hdev->asic_funcs->mmu_invalidate_cache(hdev, true, MMU_OP_USERPTR); + hl_mmu_invalidate_cache(hdev, true, MMU_OP_USERPTR); mutex_unlock(&ctx->mmu_lock); diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h index 612a9f461b38..406ca50f192a 100644 --- a/drivers/misc/habanalabs/common/habanalabs.h +++ b/drivers/misc/habanalabs/common/habanalabs.h @@ -2993,6 +2993,9 @@ 
int hl_mmu_unmap_page(struct hl_ctx *ctx, u64 virt_addr, u32 page_size, int hl_mmu_map_contiguous(struct hl_ctx *ctx, u64 virt_addr, u64 phys_addr, u32 size); int hl_mmu_unmap_contiguous(struct hl_ctx *ctx, u64 virt_addr, u32 size); +int hl_mmu_invalidate_cache(struct hl_device *hdev, bool is_hard, u32 flags); +int hl_mmu_invalidate_cache_range(struct hl_device *hdev, bool is_hard, + u32 flags, u32 asid, u64 va, u64 size); void hl_mmu_swap_out(struct hl_ctx *ctx); void hl_mmu_swap_in(struct hl_ctx *ctx); int hl_mmu_if_set_funcs(struct hl_device *hdev); diff --git a/drivers/misc/habanalabs/common/memory.c b/drivers/misc/habanalabs/common/memory.c index 530f8b4fadd2..315594e96dcd 100644 --- a/drivers/misc/habanalabs/common/memory.c +++ b/drivers/misc/habanalabs/common/memory.c @@ -1201,18 +1201,13 @@ static int map_device_va(struct hl_ctx *ctx, struct hl_mem_in *args, goto map_err; } - rc = hdev->asic_funcs->mmu_invalidate_cache_range(hdev, false, - *vm_type | MMU_OP_SKIP_LOW_CACHE_INV, - ctx->asid, ret_vaddr, phys_pg_pack->total_size); + rc = hl_mmu_invalidate_cache_range(hdev, false, *vm_type | MMU_OP_SKIP_LOW_CACHE_INV, + ctx->asid, ret_vaddr, phys_pg_pack->total_size); mutex_unlock(&ctx->mmu_lock); - if (rc) { - dev_err(hdev->dev, - "mapping handle %u failed due to MMU cache invalidation\n", - handle); + if (rc) goto map_err; - } ret_vaddr += phys_pg_pack->offset; @@ -1350,9 +1345,8 @@ static int unmap_device_va(struct hl_ctx *ctx, struct hl_mem_in *args, * at the loop end rather than for each iteration */ if (!ctx_free) - rc = hdev->asic_funcs->mmu_invalidate_cache_range(hdev, true, - *vm_type, ctx->asid, vaddr, - phys_pg_pack->total_size); + rc = hl_mmu_invalidate_cache_range(hdev, true, *vm_type, ctx->asid, vaddr, + phys_pg_pack->total_size); mutex_unlock(&ctx->mmu_lock); @@ -1365,11 +1359,6 @@ static int unmap_device_va(struct hl_ctx *ctx, struct hl_mem_in *args, if (!ctx_free) { int tmp_rc; - if (rc) - dev_err(hdev->dev, - "unmapping vaddr 0x%llx failed due 
to MMU cache invalidation\n", - vaddr); - tmp_rc = add_va_block(hdev, va_range, vaddr, vaddr + phys_pg_pack->total_size - 1); if (tmp_rc) { @@ -2640,8 +2629,8 @@ void hl_vm_ctx_fini(struct hl_ctx *ctx) mutex_lock(&ctx->mmu_lock); /* invalidate the cache once after the unmapping loop */ - hdev->asic_funcs->mmu_invalidate_cache(hdev, true, MMU_OP_USERPTR); - hdev->asic_funcs->mmu_invalidate_cache(hdev, true, MMU_OP_PHYS_PACK); + hl_mmu_invalidate_cache(hdev, true, MMU_OP_USERPTR); + hl_mmu_invalidate_cache(hdev, true, MMU_OP_PHYS_PACK); mutex_unlock(&ctx->mmu_lock); diff --git a/drivers/misc/habanalabs/common/mmu/mmu.c b/drivers/misc/habanalabs/common/mmu/mmu.c index aa96917f62e5..9153a1f55175 100644 --- a/drivers/misc/habanalabs/common/mmu/mmu.c +++ b/drivers/misc/habanalabs/common/mmu/mmu.c @@ -637,3 +637,28 @@ u64 hl_mmu_descramble_addr(struct hl_device *hdev, u64 addr) { return addr; } + +int hl_mmu_invalidate_cache(struct hl_device *hdev, bool is_hard, u32 flags) +{ + int rc; + + rc = hdev->asic_funcs->mmu_invalidate_cache(hdev, is_hard, flags); + if (rc) + dev_err_ratelimited(hdev->dev, "MMU cache invalidation failed\n"); + + return rc; +} + +int hl_mmu_invalidate_cache_range(struct hl_device *hdev, bool is_hard, + u32 flags, u32 asid, u64 va, u64 size) +{ + int rc; + + rc = hdev->asic_funcs->mmu_invalidate_cache_range(hdev, is_hard, flags, + asid, va, size); + if (rc) + dev_err_ratelimited(hdev->dev, "MMU cache range invalidation failed\n"); + + return rc; +} + diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c index 465540d064b6..b101a46076b8 100644 --- a/drivers/misc/habanalabs/gaudi/gaudi.c +++ b/drivers/misc/habanalabs/gaudi/gaudi.c @@ -8366,12 +8366,6 @@ static int gaudi_mmu_invalidate_cache(struct hl_device *hdev, bool is_hard, WREG32(mmSTLB_INV_SET, 0); - if (rc) { - dev_err_ratelimited(hdev->dev, - "MMU cache invalidation timeout\n"); - hl_device_reset(hdev, HL_DRV_RESET_HARD); - } - return rc; } diff --git 
a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c index 2347de2f426a..5e6998d21adb 100644 --- a/drivers/misc/habanalabs/goya/goya.c +++ b/drivers/misc/habanalabs/goya/goya.c @@ -5258,12 +5258,6 @@ static int goya_mmu_invalidate_cache(struct hl_device *hdev, bool is_hard, 1000, timeout_usec); - if (rc) { - dev_err_ratelimited(hdev->dev, - "MMU cache invalidation timeout\n"); - hl_device_reset(hdev, HL_DRV_RESET_HARD); - } - return rc; } -- cgit v1.2.3 From ab440d3e39f61018b1f4c1c6bed6ab037f69a82e Mon Sep 17 00:00:00 2001 From: Oded Gabbay Date: Mon, 15 Nov 2021 17:13:37 +0200 Subject: habanalabs: abort reset on invalid request Hard-reset is mutually exclusive with reset-on-device-release. Therefore, if such a request arrives to the reset function, abort the reset and return an error to the caller. Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/common/device.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'drivers/misc') diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c index ca74d7815a67..a3d5617da64c 100644 --- a/drivers/misc/habanalabs/common/device.c +++ b/drivers/misc/habanalabs/common/device.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Copyright 2016-2019 HabanaLabs, Ltd. + * Copyright 2016-2021 HabanaLabs, Ltd. * All Rights Reserved.
*/ @@ -1020,8 +1020,8 @@ static void handle_reset_trigger(struct hl_device *hdev, u32 flags) */ int hl_device_reset(struct hl_device *hdev, u32 flags) { - u64 idle_mask[HL_BUSY_ENGINES_MASK_EXT_SIZE] = {0}; bool hard_reset, from_hard_reset_thread, fw_reset, hard_instead_soft = false; + u64 idle_mask[HL_BUSY_ENGINES_MASK_EXT_SIZE] = {0}; int i, rc; if (!hdev->init_done) { @@ -1039,11 +1039,13 @@ int hl_device_reset(struct hl_device *hdev, u32 flags) hard_reset = true; } - if (hdev->reset_upon_device_release && - (flags & HL_DRV_RESET_DEV_RELEASE)) { - dev_dbg(hdev->dev, - "Perform %s-reset upon device release\n", - hard_reset ? "hard" : "soft"); + if (hdev->reset_upon_device_release && (flags & HL_DRV_RESET_DEV_RELEASE)) { + if (hard_reset) { + dev_crit(hdev->dev, + "Aborting reset because hard-reset is mutually exclusive with reset-on-device-release\n"); + return -EINVAL; + } + goto do_reset; } -- cgit v1.2.3 From d8eb50f31cc7b0f01e610327376a49ac3f0865a2 Mon Sep 17 00:00:00 2001 From: Rajaravi Krishna Katta Date: Thu, 5 Aug 2021 10:24:16 +0300 Subject: habanalabs: Move frequency change thread to goya_late_init Changing the frequency automatically is only done in Goya. In future ASICs this is done inside the firmware. Therefore, move the common code into the Goya specific files. Main changes as part of the commit are: 1. The thread for setting frequency is moved from device_late_init to goya_late_init 2. hl_device_set_frequency is removed from hl_device_open as it is not relevant for other ASICs and for Goya it is taken care by the thread 3. 
hl_device_set_frequency is renamed as goya_set_frequency Signed-off-by: Rajaravi Krishna Katta Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/common/device.c | 58 -------------------- drivers/misc/habanalabs/common/habanalabs.h | 7 --- drivers/misc/habanalabs/common/habanalabs_drv.c | 7 --- drivers/misc/habanalabs/common/sysfs.c | 5 -- drivers/misc/habanalabs/gaudi/gaudi.c | 2 + drivers/misc/habanalabs/goya/goya.c | 73 +++++++++++++++++++++++++ drivers/misc/habanalabs/goya/goyaP.h | 10 ++++ drivers/misc/habanalabs/goya/goya_hwmgr.c | 27 +++++---- 8 files changed, 100 insertions(+), 89 deletions(-) (limited to 'drivers/misc') diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c index a3d5617da64c..484e0446381e 100644 --- a/drivers/misc/habanalabs/common/device.c +++ b/drivers/misc/habanalabs/common/device.c @@ -516,22 +516,6 @@ static void device_early_fini(struct hl_device *hdev) hdev->asic_funcs->early_fini(hdev); } -static void set_freq_to_low_job(struct work_struct *work) -{ - struct hl_device *hdev = container_of(work, struct hl_device, - work_freq.work); - - mutex_lock(&hdev->fpriv_list_lock); - - if (!hdev->compute_ctx) - hl_device_set_frequency(hdev, PLL_LOW); - - mutex_unlock(&hdev->fpriv_list_lock); - - schedule_delayed_work(&hdev->work_freq, - usecs_to_jiffies(HL_PLL_LOW_JOB_FREQ_USEC)); -} - static void hl_device_heartbeat(struct work_struct *work) { struct hl_device *hdev = container_of(work, struct hl_device, @@ -591,18 +575,6 @@ static int device_late_init(struct hl_device *hdev) hdev->high_pll = hdev->asic_prop.high_pll; - /* force setting to low frequency */ - hdev->curr_pll_profile = PLL_LOW; - - if (hdev->pm_mng_profile == PM_AUTO) - hdev->asic_funcs->set_pll_profile(hdev, PLL_LOW); - else - hdev->asic_funcs->set_pll_profile(hdev, PLL_LAST); - - INIT_DELAYED_WORK(&hdev->work_freq, set_freq_to_low_job); - schedule_delayed_work(&hdev->work_freq, - 
usecs_to_jiffies(HL_PLL_LOW_JOB_FREQ_USEC)); - if (hdev->heartbeat) { INIT_DELAYED_WORK(&hdev->work_heartbeat, hl_device_heartbeat); schedule_delayed_work(&hdev->work_heartbeat, @@ -625,7 +597,6 @@ static void device_late_fini(struct hl_device *hdev) if (!hdev->late_init_done) return; - cancel_delayed_work_sync(&hdev->work_freq); if (hdev->heartbeat) cancel_delayed_work_sync(&hdev->work_heartbeat); @@ -655,35 +626,6 @@ int hl_device_utilization(struct hl_device *hdev, u32 *utilization) return 0; } -/* - * hl_device_set_frequency - set the frequency of the device - * - * @hdev: pointer to habanalabs device structure - * @freq: the new frequency value - * - * Change the frequency if needed. This function has no protection against - * concurrency, therefore it is assumed that the calling function has protected - * itself against the case of calling this function from multiple threads with - * different values - * - * Returns 0 if no change was done, otherwise returns 1 - */ -int hl_device_set_frequency(struct hl_device *hdev, enum hl_pll_frequency freq) -{ - if ((hdev->pm_mng_profile == PM_MANUAL) || - (hdev->curr_pll_profile == freq)) - return 0; - - dev_dbg(hdev->dev, "Changing device frequency to %s\n", - freq == PLL_HIGH ? "high" : "low"); - - hdev->asic_funcs->set_pll_profile(hdev, freq); - - hdev->curr_pll_profile = freq; - - return 1; -} - int hl_device_set_debug_mode(struct hl_device *hdev, bool enable) { int rc = 0; diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h index 406ca50f192a..1a7f8d37f684 100644 --- a/drivers/misc/habanalabs/common/habanalabs.h +++ b/drivers/misc/habanalabs/common/habanalabs.h @@ -2450,7 +2450,6 @@ struct last_error_session_info { * @cdev_ctrl: char device for control operations only (INFO IOCTL) * @dev: related kernel basic device structure. * @dev_ctrl: related kernel device structure for the control device - * @work_freq: delayed work to lower device frequency if possible. 
* @work_heartbeat: delayed work for CPU-CP is-alive check. * @device_reset_work: delayed work which performs hard reset * @asic_name: ASIC specific name. @@ -2485,7 +2484,6 @@ struct last_error_session_info { * @asic_specific: ASIC specific information to use only from ASIC files. * @vm: virtual memory manager for MMU. * @hwmon_dev: H/W monitor device. - * @pm_mng_profile: current power management profile. * @hl_chip_info: ASIC's sensors information. * @device_status_description: device status description. * @hl_debugfs: device's debugfs manager. @@ -2530,7 +2528,6 @@ struct last_error_session_info { * @open_counter: number of successful device open operations. * @fw_poll_interval_usec: FW status poll interval in usec. * @in_reset: is device in reset flow. - * @curr_pll_profile: current PLL profile. * @card_type: Various ASICs have several card types. This indicates the card * type of the current device. * @major: habanalabs kernel driver major. @@ -2604,7 +2601,6 @@ struct hl_device { struct cdev cdev_ctrl; struct device *dev; struct device *dev_ctrl; - struct delayed_work work_freq; struct delayed_work work_heartbeat; struct hl_device_reset_work device_reset_work; char asic_name[HL_STR_MAX]; @@ -2635,7 +2631,6 @@ struct hl_device { void *asic_specific; struct hl_vm vm; struct device *hwmon_dev; - enum hl_pm_mng_profile pm_mng_profile; struct hwmon_chip_info *hl_chip_info; struct hl_dbg_device_entry hl_debugfs; @@ -2682,7 +2677,6 @@ struct hl_device { u64 fw_poll_interval_usec; atomic_t in_reset; ktime_t last_successful_open_ktime; - enum hl_pll_frequency curr_pll_profile; enum cpucp_card_types card_type; u32 major; u32 high_pll; @@ -2912,7 +2906,6 @@ int hl_device_resume(struct hl_device *hdev); int hl_device_reset(struct hl_device *hdev, u32 flags); void hl_hpriv_get(struct hl_fpriv *hpriv); int hl_hpriv_put(struct hl_fpriv *hpriv); -int hl_device_set_frequency(struct hl_device *hdev, enum hl_pll_frequency freq); int hl_device_utilization(struct hl_device *hdev, 
u32 *utilization); int hl_build_hwmon_channel_info(struct hl_device *hdev, diff --git a/drivers/misc/habanalabs/common/habanalabs_drv.c b/drivers/misc/habanalabs/common/habanalabs_drv.c index 1070c80d739c..d4ef99952d15 100644 --- a/drivers/misc/habanalabs/common/habanalabs_drv.c +++ b/drivers/misc/habanalabs/common/habanalabs_drv.c @@ -175,13 +175,6 @@ int hl_device_open(struct inode *inode, struct file *filp) goto out_err; } - /* Device is IDLE at this point so it is legal to change PLLs. - * There is no need to check anything because if the PLL is - * already HIGH, the set function will return without doing - * anything - */ - hl_device_set_frequency(hdev, PLL_HIGH); - list_add(&hpriv->dev_node, &hdev->fpriv_list); mutex_unlock(&hdev->fpriv_list_lock); diff --git a/drivers/misc/habanalabs/common/sysfs.c b/drivers/misc/habanalabs/common/sysfs.c index aee0cc4d6155..15e4ae65e515 100644 --- a/drivers/misc/habanalabs/common/sysfs.c +++ b/drivers/misc/habanalabs/common/sysfs.c @@ -449,11 +449,6 @@ int hl_sysfs_init(struct hl_device *hdev) { int rc; - if (hdev->asic_type == ASIC_GOYA) - hdev->pm_mng_profile = PM_AUTO; - else - hdev->pm_mng_profile = PM_MANUAL; - hdev->max_power = hdev->asic_prop.max_power_default; hdev->asic_funcs->add_device_attr(hdev, &hl_dev_clks_attr_group); diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c index b101a46076b8..f29afcca74fc 100644 --- a/drivers/misc/habanalabs/gaudi/gaudi.c +++ b/drivers/misc/habanalabs/gaudi/gaudi.c @@ -1636,6 +1636,8 @@ static int gaudi_late_init(struct hl_device *hdev) */ gaudi_mmu_prepare(hdev, 1); + hdev->asic_funcs->set_pll_profile(hdev, PLL_LAST); + return 0; disable_pci_access: diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c index 5e6998d21adb..bbee6739ce87 100644 --- a/drivers/misc/habanalabs/goya/goya.c +++ b/drivers/misc/habanalabs/goya/goya.c @@ -787,9 +787,59 @@ static void goya_fetch_psoc_frequency(struct hl_device *hdev) 
prop->psoc_pci_pll_div_factor = div_fctr; } +/* + * goya_set_frequency - set the frequency of the device + * + * @hdev: pointer to habanalabs device structure + * @freq: the new frequency value + * + * Change the frequency if needed. This function has no protection against + * concurrency, therefore it is assumed that the calling function has protected + * itself against the case of calling this function from multiple threads with + * different values + * + * Returns 0 if no change was done, otherwise returns 1 + */ +int goya_set_frequency(struct hl_device *hdev, enum hl_pll_frequency freq) +{ + struct goya_device *goya = hdev->asic_specific; + + if ((goya->pm_mng_profile == PM_MANUAL) || + (goya->curr_pll_profile == freq)) + return 0; + + dev_dbg(hdev->dev, "Changing device frequency to %s\n", + freq == PLL_HIGH ? "high" : "low"); + + goya_set_pll_profile(hdev, freq); + + goya->curr_pll_profile = freq; + + return 1; +} + +static void goya_set_freq_to_low_job(struct work_struct *work) +{ + struct goya_work_freq *goya_work = container_of(work, + struct goya_work_freq, + work_freq.work); + struct hl_device *hdev = goya_work->hdev; + + mutex_lock(&hdev->fpriv_list_lock); + + if (!hdev->compute_ctx) + goya_set_frequency(hdev, PLL_LOW); + + mutex_unlock(&hdev->fpriv_list_lock); + + schedule_delayed_work(&goya_work->work_freq, + usecs_to_jiffies(HL_PLL_LOW_JOB_FREQ_USEC)); +} + int goya_late_init(struct hl_device *hdev) { struct asic_fixed_properties *prop = &hdev->asic_prop; + struct goya_device *goya = hdev->asic_specific; int rc; goya_fetch_psoc_frequency(hdev); @@ -838,6 +888,16 @@ int goya_late_init(struct hl_device *hdev) return rc; } + /* force setting to low frequency */ + goya->curr_pll_profile = PLL_LOW; + + goya->pm_mng_profile = PM_AUTO; + + hdev->asic_funcs->set_pll_profile(hdev, PLL_LOW); + + schedule_delayed_work(&goya->goya_work->work_freq, + usecs_to_jiffies(HL_PLL_LOW_JOB_FREQ_USEC)); + return 0; } @@ -851,8 +911,11 @@ int goya_late_init(struct 
hl_device *hdev) void goya_late_fini(struct hl_device *hdev) { const struct hwmon_channel_info **channel_info_arr; + struct goya_device *goya = hdev->asic_specific; int i = 0; + cancel_delayed_work_sync(&goya->goya_work->work_freq); + if (!hdev->hl_chip_info->info) return; @@ -976,6 +1039,15 @@ static int goya_sw_init(struct hl_device *hdev) hdev->asic_funcs->set_pci_memory_regions(hdev); + goya->goya_work = kmalloc(sizeof(struct goya_work_freq), GFP_KERNEL); + if (!goya->goya_work) { + rc = -ENOMEM; + goto free_cpu_accessible_dma_pool; + } + + goya->goya_work->hdev = hdev; + INIT_DELAYED_WORK(&goya->goya_work->work_freq, goya_set_freq_to_low_job); + return 0; free_cpu_accessible_dma_pool: @@ -1012,6 +1084,7 @@ static int goya_sw_fini(struct hl_device *hdev) dma_pool_destroy(hdev->dma_pool); + kfree(goya->goya_work); kfree(goya); return 0; diff --git a/drivers/misc/habanalabs/goya/goyaP.h b/drivers/misc/habanalabs/goya/goyaP.h index 97add7b04f82..f0c3c6df04d5 100644 --- a/drivers/misc/habanalabs/goya/goyaP.h +++ b/drivers/misc/habanalabs/goya/goyaP.h @@ -153,9 +153,15 @@ #define HW_CAP_GOLDEN 0x00000400 #define HW_CAP_TPC 0x00000800 +struct goya_work_freq { + struct hl_device *hdev; + struct delayed_work work_freq; +}; + struct goya_device { /* TODO: remove hw_queues_lock after moving to scheduler code */ spinlock_t hw_queues_lock; + struct goya_work_freq *goya_work; u64 mme_clk; u64 tpc_clk; @@ -166,6 +172,9 @@ struct goya_device { u32 events_stat_aggregate[GOYA_ASYNC_EVENT_ID_SIZE]; u32 hw_cap_initialized; u8 device_cpu_mmu_mappings_done; + + enum hl_pll_frequency curr_pll_profile; + enum hl_pm_mng_profile pm_mng_profile; }; int goya_set_fixed_properties(struct hl_device *hdev); @@ -237,5 +246,6 @@ void goya_mmu_remove_device_cpu_mappings(struct hl_device *hdev); u32 goya_get_queue_id_for_cq(struct hl_device *hdev, u32 cq_idx); u64 goya_get_device_time(struct hl_device *hdev); +int goya_set_frequency(struct hl_device *hdev, enum hl_pll_frequency freq); #endif /* 
GOYAP_H_ */ diff --git a/drivers/misc/habanalabs/goya/goya_hwmgr.c b/drivers/misc/habanalabs/goya/goya_hwmgr.c index 59b2624ff81a..42985a85b625 100644 --- a/drivers/misc/habanalabs/goya/goya_hwmgr.c +++ b/drivers/misc/habanalabs/goya/goya_hwmgr.c @@ -62,7 +62,7 @@ static ssize_t mme_clk_store(struct device *dev, struct device_attribute *attr, goto fail; } - if (hdev->pm_mng_profile == PM_AUTO) { + if (goya->pm_mng_profile == PM_AUTO) { count = -EPERM; goto fail; } @@ -111,7 +111,7 @@ static ssize_t tpc_clk_store(struct device *dev, struct device_attribute *attr, goto fail; } - if (hdev->pm_mng_profile == PM_AUTO) { + if (goya->pm_mng_profile == PM_AUTO) { count = -EPERM; goto fail; } @@ -160,7 +160,7 @@ static ssize_t ic_clk_store(struct device *dev, struct device_attribute *attr, goto fail; } - if (hdev->pm_mng_profile == PM_AUTO) { + if (goya->pm_mng_profile == PM_AUTO) { count = -EPERM; goto fail; } @@ -234,13 +234,14 @@ static ssize_t pm_mng_profile_show(struct device *dev, struct device_attribute *attr, char *buf) { struct hl_device *hdev = dev_get_drvdata(dev); + struct goya_device *goya = hdev->asic_specific; if (!hl_device_operational(hdev, NULL)) return -ENODEV; return sprintf(buf, "%s\n", - (hdev->pm_mng_profile == PM_AUTO) ? "auto" : - (hdev->pm_mng_profile == PM_MANUAL) ? "manual" : + (goya->pm_mng_profile == PM_AUTO) ? "auto" : + (goya->pm_mng_profile == PM_MANUAL) ? 
"manual" : "unknown"); } @@ -248,6 +249,7 @@ static ssize_t pm_mng_profile_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct hl_device *hdev = dev_get_drvdata(dev); + struct goya_device *goya = hdev->asic_specific; if (!hl_device_operational(hdev, NULL)) { count = -ENODEV; @@ -265,26 +267,27 @@ static ssize_t pm_mng_profile_store(struct device *dev, if (strncmp("auto", buf, strlen("auto")) == 0) { /* Make sure we are in LOW PLL when changing modes */ - if (hdev->pm_mng_profile == PM_MANUAL) { - hdev->curr_pll_profile = PLL_HIGH; - hdev->pm_mng_profile = PM_AUTO; - hl_device_set_frequency(hdev, PLL_LOW); + if (goya->pm_mng_profile == PM_MANUAL) { + goya->curr_pll_profile = PLL_HIGH; + goya->pm_mng_profile = PM_AUTO; + goya_set_frequency(hdev, PLL_LOW); } } else if (strncmp("manual", buf, strlen("manual")) == 0) { - if (hdev->pm_mng_profile == PM_AUTO) { + if (goya->pm_mng_profile == PM_AUTO) { /* Must release the lock because the work thread also * takes this lock. But before we release it, set * the mode to manual so nothing will change if a user * suddenly opens the device */ - hdev->pm_mng_profile = PM_MANUAL; + goya->pm_mng_profile = PM_MANUAL; mutex_unlock(&hdev->fpriv_list_lock); /* Flush the current work so we can return to the user * knowing that he is the only one changing frequencies */ - flush_delayed_work(&hdev->work_freq); + if (goya->goya_work) + flush_delayed_work(&goya->goya_work->work_freq); return count; } -- cgit v1.2.3 From 60e0431f41fff930537b4292c711200da87b195f Mon Sep 17 00:00:00 2001 From: Oded Gabbay Date: Tue, 16 Nov 2021 09:46:02 +0200 Subject: habanalabs: fix soft reset accounting Reset upon device release is not a soft-reset from user/system point of view. As such, we shouldn't count that reset in the statistics we gather and expose to the monitoring applications. We also shouldn't print soft-reset when doing the reset upon device release. 
Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/common/device.c | 50 ++++++++++++++++----------------- 1 file changed, 25 insertions(+), 25 deletions(-) (limited to 'drivers/misc') diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c index 484e0446381e..2b208007c26f 100644 --- a/drivers/misc/habanalabs/common/device.c +++ b/drivers/misc/habanalabs/common/device.c @@ -962,13 +962,13 @@ static void handle_reset_trigger(struct hl_device *hdev, u32 flags) */ int hl_device_reset(struct hl_device *hdev, u32 flags) { - bool hard_reset, from_hard_reset_thread, fw_reset, hard_instead_soft = false; + bool hard_reset, from_hard_reset_thread, fw_reset, hard_instead_soft = false, + reset_upon_device_release = false; u64 idle_mask[HL_BUSY_ENGINES_MASK_EXT_SIZE] = {0}; int i, rc; if (!hdev->init_done) { - dev_err(hdev->dev, - "Can't reset before initialization is done\n"); + dev_err(hdev->dev, "Can't reset before initialization is done\n"); return 0; } @@ -988,6 +988,8 @@ int hl_device_reset(struct hl_device *hdev, u32 flags) return -EINVAL; } + reset_upon_device_release = true; + goto do_reset; } @@ -1024,12 +1026,10 @@ do_reset: if (hard_reset) dev_info(hdev->dev, "Going to reset device\n"); - else if (flags & HL_DRV_RESET_DEV_RELEASE) - dev_info(hdev->dev, - "Going to reset device after it was released by user\n"); + else if (reset_upon_device_release) + dev_info(hdev->dev, "Going to reset device after release by user\n"); else - dev_info(hdev->dev, - "Going to reset compute engines of inference device\n"); + dev_info(hdev->dev, "Going to reset engines of inference device\n"); } again: @@ -1174,16 +1174,14 @@ kill_processes: rc = hdev->asic_funcs->hw_init(hdev); if (rc) { - dev_err(hdev->dev, - "failed to initialize the H/W after reset\n"); + dev_err(hdev->dev, "failed to initialize the H/W after reset\n"); goto out_err; } /* If device is not idle fail the reset process */ if (!hdev->asic_funcs->is_device_idle(hdev, idle_mask, 
HL_BUSY_ENGINES_MASK_EXT_SIZE, NULL)) { - dev_err(hdev->dev, - "device is not idle (mask 0x%llx_%llx) after reset\n", + dev_err(hdev->dev, "device is not idle (mask 0x%llx_%llx) after reset\n", idle_mask[1], idle_mask[0]); rc = -EIO; goto out_err; @@ -1192,23 +1190,20 @@ kill_processes: /* Check that the communication with the device is working */ rc = hdev->asic_funcs->test_queues(hdev); if (rc) { - dev_err(hdev->dev, - "Failed to detect if device is alive after reset\n"); + dev_err(hdev->dev, "Failed to detect if device is alive after reset\n"); goto out_err; } if (hard_reset) { rc = device_late_init(hdev); if (rc) { - dev_err(hdev->dev, - "Failed late init after hard reset\n"); + dev_err(hdev->dev, "Failed late init after hard reset\n"); goto out_err; } rc = hl_vm_init(hdev); if (rc) { - dev_err(hdev->dev, - "Failed to init memory module after hard reset\n"); + dev_err(hdev->dev, "Failed to init memory module after hard reset\n"); goto out_err; } @@ -1216,8 +1211,11 @@ kill_processes: } else { rc = hdev->asic_funcs->soft_reset_late_init(hdev); if (rc) { - dev_err(hdev->dev, - "Failed late init after soft reset\n"); + if (reset_upon_device_release) + dev_err(hdev->dev, + "Failed late init in reset after device release\n"); + else + dev_err(hdev->dev, "Failed late init after soft reset\n"); goto out_err; } } @@ -1236,7 +1234,7 @@ kill_processes: * the device will be operational although it shouldn't be */ hdev->asic_funcs->enable_events_from_fw(hdev); - } else { + } else if (!reset_upon_device_release) { hdev->soft_reset_cnt++; } @@ -1246,12 +1244,14 @@ out_err: hdev->disabled = true; if (hard_reset) { - dev_err(hdev->dev, - "Failed to reset! Device is NOT usable\n"); + dev_err(hdev->dev, "Failed to reset! 
Device is NOT usable\n"); hdev->hard_reset_cnt++; + } else if (reset_upon_device_release) { + dev_err(hdev->dev, "Failed to reset device after user release\n"); + hard_reset = true; + goto again; } else { - dev_err(hdev->dev, - "Failed to do soft-reset, trying hard reset\n"); + dev_err(hdev->dev, "Failed to do soft-reset\n"); hdev->soft_reset_cnt++; hard_reset = true; goto again; -- cgit v1.2.3 From 6c1bad35e691d908785e20258027d29c8b8beb08 Mon Sep 17 00:00:00 2001 From: Oded Gabbay Date: Tue, 16 Nov 2021 09:59:32 +0200 Subject: habanalabs: rename late init after reset function The ASIC-specific soft_reset_late_init() is now called after either soft-reset or reset-upon-device-release. Therefore, it needs a more appropriate name. No need to split it to two functions, as an ASIC either supports soft-reset or reset-upon-device-release. Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/common/device.c | 2 +- drivers/misc/habanalabs/common/habanalabs.h | 4 ++-- drivers/misc/habanalabs/gaudi/gaudi.c | 4 ++-- drivers/misc/habanalabs/goya/goya.c | 4 ++-- 4 files changed, 7 insertions(+), 7 deletions(-) (limited to 'drivers/misc') diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c index 2b208007c26f..822d9cec5aaf 100644 --- a/drivers/misc/habanalabs/common/device.c +++ b/drivers/misc/habanalabs/common/device.c @@ -1209,7 +1209,7 @@ kill_processes: hl_set_max_power(hdev); } else { - rc = hdev->asic_funcs->soft_reset_late_init(hdev); + rc = hdev->asic_funcs->non_hard_reset_late_init(hdev); if (rc) { if (reset_upon_device_release) dev_err(hdev->dev, diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h index 1a7f8d37f684..a465b4a5f31d 100644 --- a/drivers/misc/habanalabs/common/habanalabs.h +++ b/drivers/misc/habanalabs/common/habanalabs.h @@ -1153,7 +1153,7 @@ struct fw_load_mgr { * @disable_clock_gating: disable clock gating completely * @debug_coresight: perform certain actions 
on Coresight for debugging. * @is_device_idle: return true if device is idle, false otherwise. - * @soft_reset_late_init: perform certain actions needed after soft reset. + * @non_hard_reset_late_init: perform certain actions needed after a reset which is not hard-reset * @hw_queues_lock: acquire H/W queues lock. * @hw_queues_unlock: release H/W queues lock. * @get_pci_id: retrieve PCI ID. @@ -1289,7 +1289,7 @@ struct hl_asic_funcs { int (*debug_coresight)(struct hl_device *hdev, void *data); bool (*is_device_idle)(struct hl_device *hdev, u64 *mask_arr, u8 mask_len, struct seq_file *s); - int (*soft_reset_late_init)(struct hl_device *hdev); + int (*non_hard_reset_late_init)(struct hl_device *hdev); void (*hw_queues_lock)(struct hl_device *hdev); void (*hw_queues_unlock)(struct hl_device *hdev); u32 (*get_pci_id)(struct hl_device *hdev); diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c index f29afcca74fc..464d205a26ed 100644 --- a/drivers/misc/habanalabs/gaudi/gaudi.c +++ b/drivers/misc/habanalabs/gaudi/gaudi.c @@ -7819,7 +7819,7 @@ static void gaudi_print_fw_alive_info(struct hl_device *hdev, fw_alive->thread_id, fw_alive->uptime_seconds); } -static int gaudi_soft_reset_late_init(struct hl_device *hdev) +static int gaudi_non_hard_reset_late_init(struct hl_device *hdev) { struct gaudi_device *gaudi = hdev->asic_specific; @@ -9591,7 +9591,7 @@ static const struct hl_asic_funcs gaudi_funcs = { .disable_clock_gating = gaudi_disable_clock_gating, .debug_coresight = gaudi_debug_coresight, .is_device_idle = gaudi_is_device_idle, - .soft_reset_late_init = gaudi_soft_reset_late_init, + .non_hard_reset_late_init = gaudi_non_hard_reset_late_init, .hw_queues_lock = gaudi_hw_queues_lock, .hw_queues_unlock = gaudi_hw_queues_unlock, .get_pci_id = gaudi_get_pci_id, diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c index bbee6739ce87..e54d60e75854 100644 --- a/drivers/misc/habanalabs/goya/goya.c +++ 
b/drivers/misc/habanalabs/goya/goya.c @@ -4813,7 +4813,7 @@ static int goya_unmask_irq_arr(struct hl_device *hdev, u32 *irq_arr, return rc; } -static int goya_soft_reset_late_init(struct hl_device *hdev) +static int goya_non_hard_reset_late_init(struct hl_device *hdev) { /* * Unmask all IRQs since some could have been received @@ -5738,7 +5738,7 @@ static const struct hl_asic_funcs goya_funcs = { .disable_clock_gating = goya_disable_clock_gating, .debug_coresight = goya_debug_coresight, .is_device_idle = goya_is_device_idle, - .soft_reset_late_init = goya_soft_reset_late_init, + .non_hard_reset_late_init = goya_non_hard_reset_late_init, .hw_queues_lock = goya_hw_queues_lock, .hw_queues_unlock = goya_hw_queues_unlock, .get_pci_id = goya_get_pci_id, -- cgit v1.2.3 From 9eade72e7246a25b8a13678d52a947033d6de710 Mon Sep 17 00:00:00 2001 From: Oded Gabbay Date: Tue, 16 Nov 2021 10:30:26 +0200 Subject: habanalabs/gaudi: return EPERM on non hard-reset GAUDI supports only hard-reset. Therefore, this function should return an error of operation not permitted. 
Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/gaudi/gaudi.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'drivers/misc') diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c index 464d205a26ed..07e03d44930e 100644 --- a/drivers/misc/habanalabs/gaudi/gaudi.c +++ b/drivers/misc/habanalabs/gaudi/gaudi.c @@ -7821,12 +7821,8 @@ static void gaudi_print_fw_alive_info(struct hl_device *hdev, static int gaudi_non_hard_reset_late_init(struct hl_device *hdev) { - struct gaudi_device *gaudi = hdev->asic_specific; - - /* Unmask all IRQs since some could have been received - * during the soft reset - */ - return hl_fw_unmask_irq_arr(hdev, gaudi->events, sizeof(gaudi->events)); + /* GAUDI doesn't support any reset except hard-reset */ + return -EPERM; } static int gaudi_hbm_read_interrupts(struct hl_device *hdev, int device, -- cgit v1.2.3 From cad9eb4a8d9f745c2548f905534f981758e2afec Mon Sep 17 00:00:00 2001 From: Ofir Bitton Date: Tue, 16 Nov 2021 15:48:42 +0200 Subject: habanalabs: move device boot warnings to the correct location As device boot warnings clears the indication from the error mask, they must be located together before the unknown error validation. 
Signed-off-by: Ofir Bitton Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/common/firmware_if.c | 45 ++++++++++++++-------------- 1 file changed, 23 insertions(+), 22 deletions(-) (limited to 'drivers/misc') diff --git a/drivers/misc/habanalabs/common/firmware_if.c b/drivers/misc/habanalabs/common/firmware_if.c index aea5904332fd..cf67800f2b47 100644 --- a/drivers/misc/habanalabs/common/firmware_if.c +++ b/drivers/misc/habanalabs/common/firmware_if.c @@ -443,15 +443,6 @@ static bool fw_report_boot_dev0(struct hl_device *hdev, u32 err_val, err_exists = true; } - if (err_val & CPU_BOOT_ERR0_DRAM_SKIPPED) { - dev_warn(hdev->dev, - "Device boot warning - Skipped DRAM initialization\n"); - /* This is a warning so we don't want it to disable the - * device - */ - err_val &= ~CPU_BOOT_ERR0_DRAM_SKIPPED; - } - if (err_val & CPU_BOOT_ERR0_BMC_WAIT_SKIPPED) { if (hdev->bmc_enable) { dev_err(hdev->dev, @@ -495,15 +486,6 @@ static bool fw_report_boot_dev0(struct hl_device *hdev, u32 err_val, err_exists = true; } - if (err_val & CPU_BOOT_ERR0_PRI_IMG_VER_FAIL) { - dev_warn(hdev->dev, - "Device boot warning - Failed to load preboot primary image\n"); - /* This is a warning so we don't want it to disable the - * device as we have a secondary preboot image - */ - err_val &= ~CPU_BOOT_ERR0_PRI_IMG_VER_FAIL; - } - if (err_val & CPU_BOOT_ERR0_SEC_IMG_VER_FAIL) { dev_err(hdev->dev, "Device boot error - Failed to load preboot secondary image\n"); err_exists = true; @@ -523,10 +505,23 @@ static bool fw_report_boot_dev0(struct hl_device *hdev, u32 err_val, if (sts_val & CPU_BOOT_DEV_STS0_ENABLED) dev_dbg(hdev->dev, "Device status0 %#x\n", sts_val); - if (!err_exists && (err_val & ~CPU_BOOT_ERR0_ENABLED)) { - dev_err(hdev->dev, - "Device boot error - unknown ERR0 error 0x%08x\n", err_val); - err_exists = true; + /* All warnings should go here in order not to reach the unknown error validation */ + if (err_val & CPU_BOOT_ERR0_DRAM_SKIPPED) { + 
dev_warn(hdev->dev, + "Device boot warning - Skipped DRAM initialization\n"); + /* This is a warning so we don't want it to disable the + * device + */ + err_val &= ~CPU_BOOT_ERR0_DRAM_SKIPPED; + } + + if (err_val & CPU_BOOT_ERR0_PRI_IMG_VER_FAIL) { + dev_warn(hdev->dev, + "Device boot warning - Failed to load preboot primary image\n"); + /* This is a warning so we don't want it to disable the + * device as we have a secondary preboot image + */ + err_val &= ~CPU_BOOT_ERR0_PRI_IMG_VER_FAIL; } if (err_val & CPU_BOOT_ERR0_TPM_FAIL) { @@ -538,6 +533,12 @@ static bool fw_report_boot_dev0(struct hl_device *hdev, u32 err_val, err_val &= ~CPU_BOOT_ERR0_TPM_FAIL; } + if (!err_exists && (err_val & ~CPU_BOOT_ERR0_ENABLED)) { + dev_err(hdev->dev, + "Device boot error - unknown ERR0 error 0x%08x\n", err_val); + err_exists = true; + } + /* return error only if it's in the predefined mask */ if (err_exists && ((err_val & ~CPU_BOOT_ERR0_ENABLED) & lower_32_bits(hdev->boot_error_status_mask))) -- cgit v1.2.3 From 3beaf903a3a07dc5c6500691b0b465d36292e3f8 Mon Sep 17 00:00:00 2001 From: Dani Liberman Date: Wed, 17 Nov 2021 09:59:10 +0200 Subject: habanalabs: fix race condition in multi CS completion Race example scenario: 1. User has 2 threads that wait on multi CS: - thread_0 waits on QID 0 and uses multi CS context 0. - thread_1 waits on QID 1 and uses multi CS context 1. 2. thread_1 gets completion and releases multi CS context 1. 3. CS related to multi CS of thread_0 starts executing complete_multi_cs function, the first iteration of the loop completes the multi CS of thread_0, hence multi CS context 0 is released. 4. thread_1 waits on QID 1 and uses multi CS context 0. 5. thread_0 waits on QID 0 and uses multi CS context 1. 6. The second iteration of the loop (from step 3) starts, which means, start checking multi CS context 1: - multi CS context is being used by thread_0 waiting on QID 0.
- The fence of the CS (still CS from step 3) has QID map the same as the multi CS context 1. - multi CS context 1 (thread_0) gets completion on CS that triggered already thread_0 (with multi CS context 0) and is no longer being waited on. Fixed by exiting the loop in complete_multi_cs after getting completion Signed-off-by: Dani Liberman Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/common/command_submission.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'drivers/misc') diff --git a/drivers/misc/habanalabs/common/command_submission.c b/drivers/misc/habanalabs/common/command_submission.c index c1fd4ba14c60..4e893364a3cc 100644 --- a/drivers/misc/habanalabs/common/command_submission.c +++ b/drivers/misc/habanalabs/common/command_submission.c @@ -545,6 +545,13 @@ static void complete_multi_cs(struct hl_device *hdev, struct hl_cs *cs) * mcs fences. */ fence->mcs_handling_done = true; + /* + * Since CS (and its related fence) can be associated with only one + * multi CS context, once it triggered multi CS completion no need to + * continue checking other multi CS contexts. + */ + spin_unlock(&mcs_compl->lock); + break; } spin_unlock(&mcs_compl->lock); -- cgit v1.2.3 From 411943344599d1a3340b4f720157cd24f4768c92 Mon Sep 17 00:00:00 2001 From: Ofir Bitton Date: Mon, 15 Nov 2021 19:36:25 +0200 Subject: habanalabs: add more info ioctls support during reset Some info ioctls can be served even if the device is disabled or in reset. Hence, we enable more info ioctls during reset, as these ioctls do not require any H/W nor F/W communication. 
Signed-off-by: Ofir Bitton Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/common/habanalabs_ioctl.c | 55 +++++++++++------------ 1 file changed, 27 insertions(+), 28 deletions(-) (limited to 'drivers/misc') diff --git a/drivers/misc/habanalabs/common/habanalabs_ioctl.c b/drivers/misc/habanalabs/common/habanalabs_ioctl.c index 360a1e9bbd5d..15797d55b4e8 100644 --- a/drivers/misc/habanalabs/common/habanalabs_ioctl.c +++ b/drivers/misc/habanalabs/common/habanalabs_ioctl.c @@ -614,6 +614,33 @@ static int _hl_info_ioctl(struct hl_fpriv *hpriv, void *data, case HL_INFO_RESET_COUNT: return get_reset_count(hdev, args); + case HL_INFO_HW_EVENTS: + return hw_events_info(hdev, false, args); + + case HL_INFO_HW_EVENTS_AGGREGATE: + return hw_events_info(hdev, true, args); + + case HL_INFO_CS_COUNTERS: + return cs_counters_info(hpriv, args); + + case HL_INFO_CLK_THROTTLE_REASON: + return clk_throttle_info(hpriv, args); + + case HL_INFO_SYNC_MANAGER: + return sync_manager_info(hpriv, args); + + case HL_INFO_OPEN_STATS: + return open_stats_info(hpriv, args); + + case HL_INFO_LAST_ERR_OPEN_DEV_TIME: + return last_err_open_dev_info(hpriv, args); + + case HL_INFO_CS_TIMEOUT_EVENT: + return cs_timeout_info(hpriv, args); + + case HL_INFO_RAZWI_EVENT: + return razwi_info(hpriv, args); + default: break; } @@ -626,10 +653,6 @@ static int _hl_info_ioctl(struct hl_fpriv *hpriv, void *data, } switch (args->op) { - case HL_INFO_HW_EVENTS: - rc = hw_events_info(hdev, false, args); - break; - case HL_INFO_DRAM_USAGE: rc = dram_usage_info(hpriv, args); break; @@ -642,10 +665,6 @@ static int _hl_info_ioctl(struct hl_fpriv *hpriv, void *data, rc = device_utilization(hdev, args); break; - case HL_INFO_HW_EVENTS_AGGREGATE: - rc = hw_events_info(hdev, true, args); - break; - case HL_INFO_CLK_RATE: rc = get_clk_rate(hdev, args); break; @@ -653,18 +672,9 @@ static int _hl_info_ioctl(struct hl_fpriv *hpriv, void *data, case HL_INFO_TIME_SYNC: return time_sync_info(hdev, 
args); - case HL_INFO_CS_COUNTERS: - return cs_counters_info(hpriv, args); - case HL_INFO_PCI_COUNTERS: return pci_counters_info(hpriv, args); - case HL_INFO_CLK_THROTTLE_REASON: - return clk_throttle_info(hpriv, args); - - case HL_INFO_SYNC_MANAGER: - return sync_manager_info(hpriv, args); - case HL_INFO_TOTAL_ENERGY: return total_energy_consumption_info(hpriv, args); @@ -674,8 +684,6 @@ static int _hl_info_ioctl(struct hl_fpriv *hpriv, void *data, case HL_INFO_POWER: return power_info(hpriv, args); - case HL_INFO_OPEN_STATS: - return open_stats_info(hpriv, args); case HL_INFO_DRAM_REPLACED_ROWS: return dram_replaced_rows_info(hpriv, args); @@ -683,15 +691,6 @@ static int _hl_info_ioctl(struct hl_fpriv *hpriv, void *data, case HL_INFO_DRAM_PENDING_ROWS: return dram_pending_rows_info(hpriv, args); - case HL_INFO_LAST_ERR_OPEN_DEV_TIME: - return last_err_open_dev_info(hpriv, args); - - case HL_INFO_CS_TIMEOUT_EVENT: - return cs_timeout_info(hpriv, args); - - case HL_INFO_RAZWI_EVENT: - return razwi_info(hpriv, args); - default: dev_err(dev, "Invalid request %d\n", args->op); rc = -ENOTTY; -- cgit v1.2.3 From 75a5c44d143bc1818e8004a8bee6993aba3a75cf Mon Sep 17 00:00:00 2001 From: Tomer Tayar Date: Thu, 18 Nov 2021 10:44:05 +0200 Subject: habanalabs: add power information type to POWER_GET packet In new f/w versions, it is required to explicitly indicate the power information type when querying the F/W for power info. When getting the current power level it should be set to power_input. 
Signed-off-by: Tomer Tayar Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/common/firmware_if.c | 1 + include/uapi/misc/habanalabs.h | 1 + 2 files changed, 2 insertions(+) (limited to 'drivers/misc') diff --git a/drivers/misc/habanalabs/common/firmware_if.c b/drivers/misc/habanalabs/common/firmware_if.c index cf67800f2b47..ac5bd017d294 100644 --- a/drivers/misc/habanalabs/common/firmware_if.c +++ b/drivers/misc/habanalabs/common/firmware_if.c @@ -969,6 +969,7 @@ int hl_fw_cpucp_power_get(struct hl_device *hdev, u64 *power) pkt.ctl = cpu_to_le32(CPUCP_PACKET_POWER_GET << CPUCP_PKT_CTL_OPCODE_SHIFT); + pkt.type = cpu_to_le16(CPUCP_POWER_INPUT); rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt), HL_CPUCP_INFO_TIMEOUT_USEC, &result); diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h index eb8565fdae70..cd86937c572d 100644 --- a/include/uapi/misc/habanalabs.h +++ b/include/uapi/misc/habanalabs.h @@ -333,6 +333,7 @@ enum hl_server_type { * HL_INFO_SYNC_MANAGER - Retrieve sync manager info per dcore * HL_INFO_TOTAL_ENERGY - Retrieve total energy consumption * HL_INFO_PLL_FREQUENCY - Retrieve PLL frequency + * HL_INFO_POWER - Retrieve power information * HL_INFO_OPEN_STATS - Retrieve info regarding recent device open calls * HL_INFO_DRAM_REPLACED_ROWS - Retrieve DRAM replaced rows info * HL_INFO_DRAM_PENDING_ROWS - Retrieve DRAM pending rows num -- cgit v1.2.3 From b13bef204158e0c9d8a9149d134b260cec7ff6a9 Mon Sep 17 00:00:00 2001 From: Ofir Bitton Date: Thu, 18 Nov 2021 08:46:15 +0200 Subject: habanalabs: change misleading IRQ warning during reset Currently we dump the physical IRQ line index in host if an event is received during reset. This ID is confusing as it means nothing to the user. 
Signed-off-by: Ofir Bitton Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/common/irq.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'drivers/misc') diff --git a/drivers/misc/habanalabs/common/irq.c b/drivers/misc/habanalabs/common/irq.c index 96d82b682674..9fd4c18e274e 100644 --- a/drivers/misc/habanalabs/common/irq.c +++ b/drivers/misc/habanalabs/common/irq.c @@ -246,9 +246,7 @@ irqreturn_t hl_irq_handler_eq(int irq, void *arg) dma_rmb(); if (hdev->disabled) { - dev_warn(hdev->dev, - "Device disabled but received IRQ %d for EQ\n", - irq); + dev_warn(hdev->dev, "Device disabled but received an EQ event\n"); goto skip_irq; } -- cgit v1.2.3 From 3416d4b59b8fbf0ad360353da4fa0f7293831230 Mon Sep 17 00:00:00 2001 From: Ofir Bitton Date: Sun, 21 Nov 2021 16:02:32 +0200 Subject: habanalabs: handle events during soft-reset Driver should handle events during soft-reset as F/W is not going through reset and it keeps sending events towards host. 
Signed-off-by: Ofir Bitton Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/common/device.c | 4 ++++ drivers/misc/habanalabs/common/habanalabs.h | 2 ++ drivers/misc/habanalabs/common/irq.c | 2 +- 3 files changed, 7 insertions(+), 1 deletion(-) (limited to 'drivers/misc') diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c index 822d9cec5aaf..720eea0b7e9c 100644 --- a/drivers/misc/habanalabs/common/device.c +++ b/drivers/misc/habanalabs/common/device.c @@ -1019,6 +1019,8 @@ do_reset: handle_reset_trigger(hdev, flags); + hdev->is_in_soft_reset = !hard_reset; + /* This also blocks future CS/VM/JOB completion operations */ hdev->disabled = true; @@ -1171,6 +1173,7 @@ kill_processes: * is required for the initialization itself */ hdev->disabled = false; + hdev->is_in_soft_reset = false; rc = hdev->asic_funcs->hw_init(hdev); if (rc) { @@ -1242,6 +1245,7 @@ kill_processes: out_err: hdev->disabled = true; + hdev->is_in_soft_reset = false; if (hard_reset) { dev_err(hdev->dev, "Failed to reset! Device is NOT usable\n"); diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h index a465b4a5f31d..c2129c9fe9e4 100644 --- a/drivers/misc/habanalabs/common/habanalabs.h +++ b/drivers/misc/habanalabs/common/habanalabs.h @@ -2591,6 +2591,7 @@ struct last_error_session_info { * protocol will throw an error. Relevant only for * cases where Linux was not loaded to device CPU * @supports_wait_for_multi_cs: true if wait for multi CS is supported + * @is_in_soft_reset: Device is currently in soft reset process. 
*/ struct hl_device { struct pci_dev *pdev; @@ -2719,6 +2720,7 @@ struct hl_device { u8 device_cpu_is_halted; u8 supports_wait_for_multi_cs; u8 stream_master_qid_arr_size; + u8 is_in_soft_reset; /* Parameters for bring-up */ u64 nic_ports_mask; diff --git a/drivers/misc/habanalabs/common/irq.c b/drivers/misc/habanalabs/common/irq.c index 9fd4c18e274e..64e0d9de21bd 100644 --- a/drivers/misc/habanalabs/common/irq.c +++ b/drivers/misc/habanalabs/common/irq.c @@ -245,7 +245,7 @@ irqreturn_t hl_irq_handler_eq(int irq, void *arg) */ dma_rmb(); - if (hdev->disabled) { + if (hdev->disabled && !hdev->is_in_soft_reset) { dev_warn(hdev->dev, "Device disabled but received an EQ event\n"); goto skip_irq; } -- cgit v1.2.3 From 4fac990f604e6c10538026835a8a30f3c1b6fcf5 Mon Sep 17 00:00:00 2001 From: Ohad Sharabi Date: Mon, 22 Nov 2021 12:23:51 +0200 Subject: habanalabs: skip read fw errors if dynamic descriptor invalid Reporting FW errors involves reading of the error registers. In case we have a corrupted FW descriptor we cannot do that since the dynamic scratchpad is potentially corrupted as well and may cause a kernel crash when attempting to access a corrupted register offset.
Signed-off-by: Ohad Sharabi Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/common/firmware_if.c | 17 +++++++++++++++-- drivers/misc/habanalabs/common/habanalabs.h | 2 ++ 2 files changed, 17 insertions(+), 2 deletions(-) (limited to 'drivers/misc') diff --git a/drivers/misc/habanalabs/common/firmware_if.c b/drivers/misc/habanalabs/common/firmware_if.c index ac5bd017d294..76741898d922 100644 --- a/drivers/misc/habanalabs/common/firmware_if.c +++ b/drivers/misc/habanalabs/common/firmware_if.c @@ -1772,6 +1772,9 @@ static int hl_fw_dynamic_validate_descriptor(struct hl_device *hdev, return rc; } + /* here we can mark the descriptor as valid as the content has been validated */ + fw_loader->dynamic_loader.fw_desc_valid = true; + return 0; } @@ -1828,7 +1831,13 @@ static int hl_fw_dynamic_read_and_validate_descriptor(struct hl_device *hdev, return rc; } - /* extract address copy the descriptor from */ + /* + * extract address to copy the descriptor from + * in addition, as the descriptor value is going to be over-ridden by new data- we mark it + * as invalid. + * it will be marked again as valid once validated + */ + fw_loader->dynamic_loader.fw_desc_valid = false; src = hdev->pcie_bar[region->bar_id] + region->offset_in_bar + response->ram_offset; memcpy_fromio(fw_desc, src, sizeof(struct lkd_fw_comms_desc)); @@ -2317,6 +2326,9 @@ static int hl_fw_dynamic_init_cpu(struct hl_device *hdev, dev_info(hdev->dev, "Loading firmware to device, may take some time...\n"); + /* initialize FW descriptor as invalid */ + fw_loader->dynamic_loader.fw_desc_valid = false; + /* * In this stage, "cpu_dyn_regs" contains only LKD's hard coded values! * It will be updated from FW after hl_fw_dynamic_request_descriptor(). 
@@ -2412,7 +2424,8 @@ static int hl_fw_dynamic_init_cpu(struct hl_device *hdev, return 0; protocol_err: - fw_read_errors(hdev, le32_to_cpu(dyn_regs->cpu_boot_err0), + if (fw_loader->dynamic_loader.fw_desc_valid) + fw_read_errors(hdev, le32_to_cpu(dyn_regs->cpu_boot_err0), le32_to_cpu(dyn_regs->cpu_boot_err1), le32_to_cpu(dyn_regs->cpu_boot_dev_sts0), le32_to_cpu(dyn_regs->cpu_boot_dev_sts1)); diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h index c2129c9fe9e4..77ac4bb98137 100644 --- a/drivers/misc/habanalabs/common/habanalabs.h +++ b/drivers/misc/habanalabs/common/habanalabs.h @@ -1034,6 +1034,7 @@ struct fw_response { * @image_region: region to copy the FW image to * @fw_image_size: size of FW image to load * @wait_for_bl_timeout: timeout for waiting for boot loader to respond + * @fw_desc_valid: true if FW descriptor has been validated and hence the data can be used */ struct dynamic_fw_load_mgr { struct fw_response response; @@ -1041,6 +1042,7 @@ struct dynamic_fw_load_mgr { struct pci_mem_region *image_region; size_t fw_image_size; u32 wait_for_bl_timeout; + bool fw_desc_valid; }; /** -- cgit v1.2.3 From 1880f7acd7e0edacbd46385036253801ddc4273f Mon Sep 17 00:00:00 2001 From: Dani Liberman Date: Tue, 9 Nov 2021 11:33:28 +0200 Subject: habanalabs: add SOB information to signal submission uAPI For debug purposes, add SOB address and SOB initial counter value before current submission to uAPI output. Using SOB address and initial counter, user can calculate how much of the submission has been completed.
Signed-off-by: Dani Liberman Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- .../misc/habanalabs/common/command_submission.c | 37 ++++++++++++++++++---- drivers/misc/habanalabs/common/habanalabs.h | 5 +++ drivers/misc/habanalabs/common/hw_queue.c | 3 ++ include/uapi/misc/habanalabs.h | 10 +++++- 4 files changed, 47 insertions(+), 8 deletions(-) (limited to 'drivers/misc') diff --git a/drivers/misc/habanalabs/common/command_submission.c b/drivers/misc/habanalabs/common/command_submission.c index 4e893364a3cc..7a277f442207 100644 --- a/drivers/misc/habanalabs/common/command_submission.c +++ b/drivers/misc/habanalabs/common/command_submission.c @@ -1277,7 +1277,8 @@ static u32 get_stream_master_qid_mask(struct hl_device *hdev, u32 qid) static int cs_ioctl_default(struct hl_fpriv *hpriv, void __user *chunks, u32 num_chunks, u64 *cs_seq, u32 flags, - u32 encaps_signals_handle, u32 timeout) + u32 encaps_signals_handle, u32 timeout, + u16 *signal_initial_sob_count) { bool staged_mid, int_queues_only = true; struct hl_device *hdev = hpriv->hdev; @@ -1444,6 +1445,8 @@ static int cs_ioctl_default(struct hl_fpriv *hpriv, void __user *chunks, goto free_cs_object; } + *signal_initial_sob_count = cs->initial_sob_count; + rc = HL_CS_STATUS_SUCCESS; goto put_cs; @@ -1472,6 +1475,7 @@ static int hl_cs_ctx_switch(struct hl_fpriv *hpriv, union hl_cs_args *args, int rc = 0, do_ctx_switch; void __user *chunks; u32 num_chunks, tmp; + u16 sob_count; int ret; do_ctx_switch = atomic_cmpxchg(&ctx->thread_ctx_switch_token, 1, 0); @@ -1512,7 +1516,7 @@ static int hl_cs_ctx_switch(struct hl_fpriv *hpriv, union hl_cs_args *args, rc = 0; } else { rc = cs_ioctl_default(hpriv, chunks, num_chunks, - cs_seq, 0, 0, hdev->timeout_jiffies); + cs_seq, 0, 0, hdev->timeout_jiffies, &sob_count); } mutex_unlock(&hpriv->restore_phase_mutex); @@ -1963,7 +1967,8 @@ out: static int cs_ioctl_signal_wait(struct hl_fpriv *hpriv, enum hl_cs_type cs_type, void __user *chunks, u32 num_chunks, - u64 *cs_seq, 
u32 flags, u32 timeout) + u64 *cs_seq, u32 flags, u32 timeout, + u32 *signal_sob_addr_offset, u16 *signal_initial_sob_count) { struct hl_cs_encaps_sig_handle *encaps_sig_hdl = NULL; bool handle_found = false, is_wait_cs = false, @@ -2195,6 +2200,9 @@ static int cs_ioctl_signal_wait(struct hl_fpriv *hpriv, enum hl_cs_type cs_type, goto free_cs_object; } + *signal_sob_addr_offset = cs->sob_addr_offset; + *signal_initial_sob_count = cs->initial_sob_count; + rc = HL_CS_STATUS_SUCCESS; if (is_wait_cs) wait_cs_submitted = true; @@ -2225,6 +2233,7 @@ int hl_cs_ioctl(struct hl_fpriv *hpriv, void *data) void __user *chunks; u32 num_chunks, flags, timeout, signals_count = 0, sob_addr = 0, handle_id = 0; + u16 sob_initial_count = 0; int rc; rc = hl_cs_sanity_checks(hpriv, args); @@ -2255,7 +2264,8 @@ int hl_cs_ioctl(struct hl_fpriv *hpriv, void *data) case CS_TYPE_WAIT: case CS_TYPE_COLLECTIVE_WAIT: rc = cs_ioctl_signal_wait(hpriv, cs_type, chunks, num_chunks, - &cs_seq, args->in.cs_flags, timeout); + &cs_seq, args->in.cs_flags, timeout, + &sob_addr, &sob_initial_count); break; case CS_RESERVE_SIGNALS: rc = cs_ioctl_reserve_signals(hpriv, @@ -2271,20 +2281,33 @@ int hl_cs_ioctl(struct hl_fpriv *hpriv, void *data) rc = cs_ioctl_default(hpriv, chunks, num_chunks, &cs_seq, args->in.cs_flags, args->in.encaps_sig_handle_id, - timeout); + timeout, &sob_initial_count); break; } out: if (rc != -EAGAIN) { memset(args, 0, sizeof(*args)); - if (cs_type == CS_RESERVE_SIGNALS) { + switch (cs_type) { + case CS_RESERVE_SIGNALS: args->out.handle_id = handle_id; args->out.sob_base_addr_offset = sob_addr; args->out.count = signals_count; - } else { + break; + case CS_TYPE_SIGNAL: + args->out.sob_base_addr_offset = sob_addr; + args->out.sob_count_before_submission = sob_initial_count; + args->out.seq = cs_seq; + break; + case CS_TYPE_DEFAULT: + args->out.sob_count_before_submission = sob_initial_count; + args->out.seq = cs_seq; + break; + default: args->out.seq = cs_seq; + break; } + 
args->out.status = rc; } diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h index 77ac4bb98137..93d0a85265be 100644 --- a/drivers/misc/habanalabs/common/habanalabs.h +++ b/drivers/misc/habanalabs/common/habanalabs.h @@ -1545,6 +1545,9 @@ struct hl_userptr { * @submission_time_jiffies: submission time of the cs * @type: CS_TYPE_*. * @encaps_sig_hdl_id: encaps signals handle id, set for the first staged cs. + * @sob_addr_offset: sob offset from the configuration base address. + * @initial_sob_count: count of completed signals in SOB before current submission of signal or + * cs with encaps signals. * @submitted: true if CS was submitted to H/W. * @completed: true if CS was completed by device. * @timedout : true if CS was timedout. @@ -1580,6 +1583,8 @@ struct hl_cs { u64 submission_time_jiffies; enum hl_cs_type type; u32 encaps_sig_hdl_id; + u32 sob_addr_offset; + u16 initial_sob_count; u8 submitted; u8 completed; u8 timedout; diff --git a/drivers/misc/habanalabs/common/hw_queue.c b/drivers/misc/habanalabs/common/hw_queue.c index 0743319b10c7..fc841d651210 100644 --- a/drivers/misc/habanalabs/common/hw_queue.c +++ b/drivers/misc/habanalabs/common/hw_queue.c @@ -429,6 +429,9 @@ static int init_signal_cs(struct hl_device *hdev, rc = hl_cs_signal_sob_wraparound_handler(hdev, q_idx, &hw_sob, 1, false); + job->cs->sob_addr_offset = hw_sob->sob_addr; + job->cs->initial_sob_count = prop->next_sob_val - 1; + return rc; } diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h index cd86937c572d..648850b954a3 100644 --- a/include/uapi/misc/habanalabs.h +++ b/include/uapi/misc/habanalabs.h @@ -929,9 +929,17 @@ struct hl_cs_out { /* * SOB base address offset - * Valid only when HL_CS_FLAGS_RESERVE_SIGNALS_ONLY is set + * Valid only when HL_CS_FLAGS_RESERVE_SIGNALS_ONLY or HL_CS_FLAGS_SIGNAL is set */ __u32 sob_base_addr_offset; + + /* + * Count of completed signals in SOB before current signal submission. 
+ * Valid only when (HL_CS_FLAGS_ENCAP_SIGNALS & HL_CS_FLAGS_STAGED_SUBMISSION) + * or HL_CS_FLAGS_SIGNAL is set + */ + __u16 sob_count_before_submission; + __u16 pad[3]; }; union hl_cs_args { -- cgit v1.2.3 From 2487f4a2812e520cb5b77b2b5dfcdc05c215cd83 Mon Sep 17 00:00:00 2001 From: Dani Liberman Date: Mon, 22 Nov 2021 21:47:30 +0200 Subject: habanalabs: enable access to info ioctl during hard reset Because info ioctl is used to retrieve data, some of its opcodes may be used during hard reset. Other ioctls should be blocked while device is not operational. Signed-off-by: Dani Liberman Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/common/command_submission.c | 5 +---- drivers/misc/habanalabs/common/habanalabs_ioctl.c | 7 ------- 2 files changed, 1 insertion(+), 11 deletions(-) (limited to 'drivers/misc') diff --git a/drivers/misc/habanalabs/common/command_submission.c b/drivers/misc/habanalabs/common/command_submission.c index 7a277f442207..8be547b0926c 100644 --- a/drivers/misc/habanalabs/common/command_submission.c +++ b/drivers/misc/habanalabs/common/command_submission.c @@ -1146,9 +1146,6 @@ static int hl_cs_sanity_checks(struct hl_fpriv *hpriv, union hl_cs_args *args) enum hl_cs_type cs_type; if (!hl_device_operational(hdev, &status)) { - dev_warn_ratelimited(hdev->dev, - "Device is %s. 
Can't submit new CS\n", - hdev->status[status]); return -EBUSY; } @@ -2997,7 +2994,7 @@ int hl_wait_ioctl(struct hl_fpriv *hpriv, void *data) * user interrupt */ if (!hl_device_operational(hpriv->hdev, NULL)) - return -EPERM; + return -EBUSY; if (flags & HL_WAIT_CS_FLAGS_INTERRUPT) rc = hl_interrupt_wait_ioctl(hpriv, data); diff --git a/drivers/misc/habanalabs/common/habanalabs_ioctl.c b/drivers/misc/habanalabs/common/habanalabs_ioctl.c index 15797d55b4e8..6c7339978bae 100644 --- a/drivers/misc/habanalabs/common/habanalabs_ioctl.c +++ b/drivers/misc/habanalabs/common/habanalabs_ioctl.c @@ -774,7 +774,6 @@ static long _hl_ioctl(struct file *filep, unsigned int cmd, unsigned long arg, const struct hl_ioctl_desc *ioctl, struct device *dev) { struct hl_fpriv *hpriv = filep->private_data; - struct hl_device *hdev = hpriv->hdev; unsigned int nr = _IOC_NR(cmd); char stack_kdata[128] = {0}; char *kdata = NULL; @@ -783,12 +782,6 @@ static long _hl_ioctl(struct file *filep, unsigned int cmd, unsigned long arg, u32 hl_size; int retcode; - if (hdev->hard_reset_pending) { - dev_crit_ratelimited(dev, - "Device HARD reset pending! Please close FD\n"); - return -ENODEV; - } - /* Do not trust userspace, use our own definition */ func = ioctl->func; -- cgit v1.2.3 From d214636be8a6102d726c8aeb59000f2fb80d94a9 Mon Sep 17 00:00:00 2001 From: Tomer Tayar Date: Mon, 22 Nov 2021 12:29:22 +0200 Subject: habanalabs: pass reset flags to reset thread The reset flags used by the reset thread are currently a mix of hard-coded values and a specific flag which is passed from the context that initiates the reset. To make it easier to pass more flags in future from this context to the reset thread, modify it to pass all the original reset flags to the thread. 
Signed-off-by: Tomer Tayar Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/common/device.c | 10 +++------- drivers/misc/habanalabs/common/habanalabs.h | 4 ++-- 2 files changed, 5 insertions(+), 9 deletions(-) (limited to 'drivers/misc') diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c index 720eea0b7e9c..db4168f35c18 100644 --- a/drivers/misc/habanalabs/common/device.c +++ b/drivers/misc/habanalabs/common/device.c @@ -324,16 +324,12 @@ put_devices: static void device_hard_reset_pending(struct work_struct *work) { struct hl_device_reset_work *device_reset_work = - container_of(work, struct hl_device_reset_work, - reset_work.work); + container_of(work, struct hl_device_reset_work, reset_work.work); struct hl_device *hdev = device_reset_work->hdev; u32 flags; int rc; - flags = HL_DRV_RESET_HARD | HL_DRV_RESET_FROM_RESET_THR; - - if (device_reset_work->fw_reset) - flags |= HL_DRV_RESET_BYPASS_REQ_TO_FW; + flags = device_reset_work->flags | HL_DRV_RESET_FROM_RESET_THR; rc = hl_device_reset(hdev, flags); if ((rc == -EBUSY) && !hdev->device_fini_pending) { @@ -1040,7 +1036,7 @@ again: hdev->process_kill_trial_cnt = 0; - hdev->device_reset_work.fw_reset = fw_reset; + hdev->device_reset_work.flags = flags; /* * Because the reset function can't run from heartbeat work, diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h index 93d0a85265be..722fc8e69fd6 100644 --- a/drivers/misc/habanalabs/common/habanalabs.h +++ b/drivers/misc/habanalabs/common/habanalabs.h @@ -2214,13 +2214,13 @@ struct hwmon_chip_info; * @wq: work queue for device reset procedure. * @reset_work: reset work to be done. * @hdev: habanalabs device structure. - * @fw_reset: whether f/w will do the reset without us sending them a message to do it. + * @flags: reset flags. 
*/ struct hl_device_reset_work { struct workqueue_struct *wq; struct delayed_work reset_work; struct hl_device *hdev; - bool fw_reset; + u32 flags; }; /** -- cgit v1.2.3 From b166465452ac27415bc747c4c47c96d1314d06f1 Mon Sep 17 00:00:00 2001 From: Tomer Tayar Date: Mon, 29 Nov 2021 11:20:27 +0200 Subject: habanalabs: add missing kernel-doc comments for hl_device fields Add missing kernel-doc comments for the "last_error" and "stream_master_qid_arr" fields of the "hl_device" structure. Signed-off-by: Tomer Tayar Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/common/habanalabs.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'drivers/misc') diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h index 722fc8e69fd6..57bc55c2ddac 100644 --- a/drivers/misc/habanalabs/common/habanalabs.h +++ b/drivers/misc/habanalabs/common/habanalabs.h @@ -2512,6 +2512,8 @@ struct last_error_session_info { * @state_dump_specs: constants and dictionaries needed to dump system state. * @multi_cs_completion: array of multi-CS completion. * @clk_throttling: holds information about current/previous clock throttling events + * @last_error: holds information about last session in which CS timeout or razwi error occurred. + * @stream_master_qid_arr: pointer to array with QIDs of master streams. * @dram_used_mem: current DRAM memory consumption. * @timeout_jiffies: device CS timeout value. * @max_power: the max power of the device, as configured by the sysadmin. This -- cgit v1.2.3 From fee187fe460b6b72a62e7d7b7193f8d675752544 Mon Sep 17 00:00:00 2001 From: Oded Gabbay Date: Tue, 30 Nov 2021 14:54:53 +0200 Subject: habanalabs: free signal handle on failure Fix a bug where in case of failure to allocate idr, the handle's memory wasn't freed as part of the error handling code.
Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/common/command_submission.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'drivers/misc') diff --git a/drivers/misc/habanalabs/common/command_submission.c b/drivers/misc/habanalabs/common/command_submission.c index 8be547b0926c..d169418197c0 100644 --- a/drivers/misc/habanalabs/common/command_submission.c +++ b/drivers/misc/habanalabs/common/command_submission.c @@ -1838,7 +1838,7 @@ static int cs_ioctl_reserve_signals(struct hl_fpriv *hpriv, if (hdl_id < 0) { dev_err(hdev->dev, "Failed to allocate IDR for a new signal reservation\n"); rc = -EINVAL; - goto out; + goto free_handle; } handle->id = hdl_id; @@ -1891,7 +1891,9 @@ remove_idr: idr_remove(&mgr->handles, hdl_id); spin_unlock(&mgr->lock); +free_handle: kfree(handle); + out: return rc; } -- cgit v1.2.3 From a4dd2ecf36c4458db14df3aae81ec3e3f4b4688e Mon Sep 17 00:00:00 2001 From: Oded Gabbay Date: Tue, 30 Nov 2021 17:04:13 +0200 Subject: habanalabs: remove redundant check on ctx_fini The driver supports only a single context. Therefore, no need to check if the user context that is closed is the compute context. The user context, if exists, is always the compute context. Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/common/context.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'drivers/misc') diff --git a/drivers/misc/habanalabs/common/context.c b/drivers/misc/habanalabs/common/context.c index d0aaccd4df2c..4f7d39a29a42 100644 --- a/drivers/misc/habanalabs/common/context.c +++ b/drivers/misc/habanalabs/common/context.c @@ -97,10 +97,8 @@ static void hl_ctx_fini(struct hl_ctx *ctx) /* The engines are stopped as there is no executing CS, but the * Coresight might be still working by accessing addresses * related to the stopped engines. Hence stop it explicitly. 
- * Stop only if this is the compute context, as there can be - * only one compute context */ - if ((hdev->in_debug) && (hdev->compute_ctx == ctx)) + if (hdev->in_debug) hl_device_set_debug_mode(hdev, false); hdev->asic_funcs->ctx_fini(ctx); -- cgit v1.2.3 From 357ff3dc9ae5dc1a0d990801b32431f5eecc7ee9 Mon Sep 17 00:00:00 2001 From: Oded Gabbay Date: Tue, 30 Nov 2021 15:28:23 +0200 Subject: habanalabs: save ctx inside encaps signal Compute context pointer in hdev shouldn't be used for fetching the context's pointer. If an object needs the context's pointer, it should get it while incrementing its kref, and when the object is released, put it. Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/common/command_submission.c | 11 ++++++++--- drivers/misc/habanalabs/common/context.c | 10 +++++----- drivers/misc/habanalabs/common/habanalabs.h | 2 ++ drivers/misc/habanalabs/common/hw_queue.c | 2 +- 4 files changed, 16 insertions(+), 9 deletions(-) (limited to 'drivers/misc') diff --git a/drivers/misc/habanalabs/common/command_submission.c b/drivers/misc/habanalabs/common/command_submission.c index d169418197c0..a63ebbc04787 100644 --- a/drivers/misc/habanalabs/common/command_submission.c +++ b/drivers/misc/habanalabs/common/command_submission.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Copyright 2016-2019 HabanaLabs, Ltd. + * Copyright 2016-2021 HabanaLabs, Ltd. * All Rights Reserved. 
*/ @@ -1829,6 +1829,9 @@ static int cs_ioctl_reserve_signals(struct hl_fpriv *hpriv, } handle->count = count; + + hl_ctx_get(hdev, hpriv->ctx); + handle->ctx = hpriv->ctx; mgr = &hpriv->ctx->sig_mgr; spin_lock(&mgr->lock); @@ -1838,7 +1841,7 @@ static int cs_ioctl_reserve_signals(struct hl_fpriv *hpriv, if (hdl_id < 0) { dev_err(hdev->dev, "Failed to allocate IDR for a new signal reservation\n"); rc = -EINVAL; - goto free_handle; + goto put_ctx; } handle->id = hdl_id; @@ -1891,7 +1894,8 @@ remove_idr: idr_remove(&mgr->handles, hdl_id); spin_unlock(&mgr->lock); -free_handle: +put_ctx: + hl_ctx_put(handle->ctx); kfree(handle); out: @@ -1953,6 +1957,7 @@ static int cs_ioctl_unreserve_signals(struct hl_fpriv *hpriv, u32 handle_id) /* Release the id and free allocated memory of the handle */ idr_remove(&mgr->handles, handle_id); + hl_ctx_put(encaps_sig_hdl->ctx); kfree(encaps_sig_hdl); } else { rc = -EINVAL; diff --git a/drivers/misc/habanalabs/common/context.c b/drivers/misc/habanalabs/common/context.c index 4f7d39a29a42..8291151948ef 100644 --- a/drivers/misc/habanalabs/common/context.c +++ b/drivers/misc/habanalabs/common/context.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Copyright 2016-2019 HabanaLabs, Ltd. + * Copyright 2016-2021 HabanaLabs, Ltd. * All Rights Reserved. 
*/ @@ -13,13 +13,13 @@ void hl_encaps_handle_do_release(struct kref *ref) { struct hl_cs_encaps_sig_handle *handle = container_of(ref, struct hl_cs_encaps_sig_handle, refcount); - struct hl_ctx *ctx = handle->hdev->compute_ctx; - struct hl_encaps_signals_mgr *mgr = &ctx->sig_mgr; + struct hl_encaps_signals_mgr *mgr = &handle->ctx->sig_mgr; spin_lock(&mgr->lock); idr_remove(&mgr->handles, handle->id); spin_unlock(&mgr->lock); + hl_ctx_put(handle->ctx); kfree(handle); } @@ -27,8 +27,7 @@ static void hl_encaps_handle_do_release_sob(struct kref *ref) { struct hl_cs_encaps_sig_handle *handle = container_of(ref, struct hl_cs_encaps_sig_handle, refcount); - struct hl_ctx *ctx = handle->hdev->compute_ctx; - struct hl_encaps_signals_mgr *mgr = &ctx->sig_mgr; + struct hl_encaps_signals_mgr *mgr = &handle->ctx->sig_mgr; /* if we're here, then there was a signals reservation but cs with * encaps signals wasn't submitted, so need to put refcount @@ -40,6 +39,7 @@ static void hl_encaps_handle_do_release_sob(struct kref *ref) idr_remove(&mgr->handles, handle->id); spin_unlock(&mgr->lock); + hl_ctx_put(handle->ctx); kfree(handle); } diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h index 57bc55c2ddac..0ad08fdc89ea 100644 --- a/drivers/misc/habanalabs/common/habanalabs.h +++ b/drivers/misc/habanalabs/common/habanalabs.h @@ -2757,6 +2757,7 @@ struct hl_device { * wait cs are used to wait of the reserved encaps signals. * @hdev: pointer to habanalabs device structure. * @hw_sob: pointer to H/W SOB used in the reservation. 
+ * @ctx: pointer to the user's context data structure * @cs_seq: staged cs sequence which contains encapsulated signals * @id: idr handler id to be used to fetch the handler info * @q_idx: stream queue index @@ -2767,6 +2768,7 @@ struct hl_cs_encaps_sig_handle { struct kref refcount; struct hl_device *hdev; struct hl_hw_sob *hw_sob; + struct hl_ctx *ctx; u64 cs_seq; u32 id; u32 q_idx; diff --git a/drivers/misc/habanalabs/common/hw_queue.c b/drivers/misc/habanalabs/common/hw_queue.c index fc841d651210..6103e479e855 100644 --- a/drivers/misc/habanalabs/common/hw_queue.c +++ b/drivers/misc/habanalabs/common/hw_queue.c @@ -574,7 +574,7 @@ static int encaps_sig_first_staged_cs_handler struct hl_encaps_signals_mgr *mgr; int rc = 0; - mgr = &hdev->compute_ctx->sig_mgr; + mgr = &cs->ctx->sig_mgr; spin_lock(&mgr->lock); encaps_sig_hdl = idr_find(&mgr->handles, cs->encaps_sig_hdl_id); -- cgit v1.2.3 From 6798676f7ef5916133e0c915be73b7a3b7e2a312 Mon Sep 17 00:00:00 2001 From: Oded Gabbay Date: Tue, 30 Nov 2021 22:32:13 +0200 Subject: habanalabs: fix etr asid configuration Pass the user's context pointer into the etr configuration function to extract its ASID. Using the compute_ctx pointer is an error as it is just an indication of whether a user has opened the compute device. 
Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/common/context.c | 2 +- drivers/misc/habanalabs/common/device.c | 4 ++-- drivers/misc/habanalabs/common/habanalabs.h | 6 +++--- drivers/misc/habanalabs/common/habanalabs_ioctl.c | 13 +++++++------ drivers/misc/habanalabs/gaudi/gaudiP.h | 4 ++-- drivers/misc/habanalabs/gaudi/gaudi_coresight.c | 4 ++-- drivers/misc/habanalabs/goya/goyaP.h | 4 ++-- drivers/misc/habanalabs/goya/goya_coresight.c | 4 ++-- 8 files changed, 21 insertions(+), 20 deletions(-) (limited to 'drivers/misc') diff --git a/drivers/misc/habanalabs/common/context.c b/drivers/misc/habanalabs/common/context.c index 8291151948ef..8de1217b2ed2 100644 --- a/drivers/misc/habanalabs/common/context.c +++ b/drivers/misc/habanalabs/common/context.c @@ -99,7 +99,7 @@ static void hl_ctx_fini(struct hl_ctx *ctx) * related to the stopped engines. Hence stop it explicitly. */ if (hdev->in_debug) - hl_device_set_debug_mode(hdev, false); + hl_device_set_debug_mode(hdev, ctx, false); hdev->asic_funcs->ctx_fini(ctx); hl_cb_va_pool_fini(ctx); diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c index db4168f35c18..bc5736ae6b70 100644 --- a/drivers/misc/habanalabs/common/device.c +++ b/drivers/misc/habanalabs/common/device.c @@ -622,7 +622,7 @@ int hl_device_utilization(struct hl_device *hdev, u32 *utilization) return 0; } -int hl_device_set_debug_mode(struct hl_device *hdev, bool enable) +int hl_device_set_debug_mode(struct hl_device *hdev, struct hl_ctx *ctx, bool enable) { int rc = 0; @@ -637,7 +637,7 @@ int hl_device_set_debug_mode(struct hl_device *hdev, bool enable) } if (!hdev->hard_reset_pending) - hdev->asic_funcs->halt_coresight(hdev); + hdev->asic_funcs->halt_coresight(hdev, ctx); hdev->in_debug = 0; diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h index 0ad08fdc89ea..670fad9b4ca0 100644 --- a/drivers/misc/habanalabs/common/habanalabs.h +++ 
b/drivers/misc/habanalabs/common/habanalabs.h @@ -1288,7 +1288,7 @@ struct hl_asic_funcs { int (*send_heartbeat)(struct hl_device *hdev); void (*set_clock_gating)(struct hl_device *hdev); void (*disable_clock_gating)(struct hl_device *hdev); - int (*debug_coresight)(struct hl_device *hdev, void *data); + int (*debug_coresight)(struct hl_device *hdev, struct hl_ctx *ctx, void *data); bool (*is_device_idle)(struct hl_device *hdev, u64 *mask_arr, u8 mask_len, struct seq_file *s); int (*non_hard_reset_late_init)(struct hl_device *hdev); @@ -1303,7 +1303,7 @@ struct hl_asic_funcs { int (*init_iatu)(struct hl_device *hdev); u32 (*rreg)(struct hl_device *hdev, u32 reg); void (*wreg)(struct hl_device *hdev, u32 reg, u32 val); - void (*halt_coresight)(struct hl_device *hdev); + void (*halt_coresight)(struct hl_device *hdev, struct hl_ctx *ctx); int (*ctx_init)(struct hl_ctx *ctx); void (*ctx_fini)(struct hl_ctx *ctx); int (*get_clk_rate)(struct hl_device *hdev, u32 *cur_clk, u32 *max_clk); @@ -2867,7 +2867,7 @@ int hl_device_open_ctrl(struct inode *inode, struct file *filp); bool hl_device_operational(struct hl_device *hdev, enum hl_device_status *status); enum hl_device_status hl_device_status(struct hl_device *hdev); -int hl_device_set_debug_mode(struct hl_device *hdev, bool enable); +int hl_device_set_debug_mode(struct hl_device *hdev, struct hl_ctx *ctx, bool enable); int hl_hw_queues_create(struct hl_device *hdev); void hl_hw_queues_destroy(struct hl_device *hdev); int hl_hw_queue_send_cb_no_cmpl(struct hl_device *hdev, u32 hw_queue_id, diff --git a/drivers/misc/habanalabs/common/habanalabs_ioctl.c b/drivers/misc/habanalabs/common/habanalabs_ioctl.c index 6c7339978bae..9210114beefe 100644 --- a/drivers/misc/habanalabs/common/habanalabs_ioctl.c +++ b/drivers/misc/habanalabs/common/habanalabs_ioctl.c @@ -158,7 +158,7 @@ static int hw_idle(struct hl_device *hdev, struct hl_info_args *args) min((size_t) max_size, sizeof(hw_idle))) ? 
-EFAULT : 0; } -static int debug_coresight(struct hl_device *hdev, struct hl_debug_args *args) +static int debug_coresight(struct hl_device *hdev, struct hl_ctx *ctx, struct hl_debug_args *args) { struct hl_debug_params *params; void *input = NULL, *output = NULL; @@ -200,7 +200,7 @@ static int debug_coresight(struct hl_device *hdev, struct hl_debug_args *args) params->output_size = args->output_size; } - rc = hdev->asic_funcs->debug_coresight(hdev, params); + rc = hdev->asic_funcs->debug_coresight(hdev, ctx, params); if (rc) { dev_err(hdev->dev, "debug coresight operation failed %d\n", rc); @@ -738,13 +738,14 @@ static int hl_debug_ioctl(struct hl_fpriv *hpriv, void *data) "Rejecting debug configuration request because device not in debug mode\n"); return -EFAULT; } - args->input_size = - min(args->input_size, hl_debug_struct_size[args->op]); - rc = debug_coresight(hdev, args); + args->input_size = min(args->input_size, hl_debug_struct_size[args->op]); + rc = debug_coresight(hdev, hpriv->ctx, args); break; + case HL_DEBUG_OP_SET_MODE: - rc = hl_device_set_debug_mode(hdev, (bool) args->enable); + rc = hl_device_set_debug_mode(hdev, hpriv->ctx, (bool) args->enable); break; + default: dev_err(hdev->dev, "Invalid request %d\n", args->op); rc = -ENOTTY; diff --git a/drivers/misc/habanalabs/gaudi/gaudiP.h b/drivers/misc/habanalabs/gaudi/gaudiP.h index f325e36a71e6..8ac16a9b7d15 100644 --- a/drivers/misc/habanalabs/gaudi/gaudiP.h +++ b/drivers/misc/habanalabs/gaudi/gaudiP.h @@ -357,8 +357,8 @@ void gaudi_init_security(struct hl_device *hdev); void gaudi_ack_protection_bits_errors(struct hl_device *hdev); void gaudi_add_device_attr(struct hl_device *hdev, struct attribute_group *dev_attr_grp); -int gaudi_debug_coresight(struct hl_device *hdev, void *data); -void gaudi_halt_coresight(struct hl_device *hdev); +int gaudi_debug_coresight(struct hl_device *hdev, struct hl_ctx *ctx, void *data); +void gaudi_halt_coresight(struct hl_device *hdev, struct hl_ctx *ctx); void 
gaudi_mmu_prepare_reg(struct hl_device *hdev, u64 reg, u32 asid); #endif /* GAUDIP_H_ */ diff --git a/drivers/misc/habanalabs/gaudi/gaudi_coresight.c b/drivers/misc/habanalabs/gaudi/gaudi_coresight.c index 5349c1be13f9..08108f5fed67 100644 --- a/drivers/misc/habanalabs/gaudi/gaudi_coresight.c +++ b/drivers/misc/habanalabs/gaudi/gaudi_coresight.c @@ -848,7 +848,7 @@ static int gaudi_config_spmu(struct hl_device *hdev, return 0; } -int gaudi_debug_coresight(struct hl_device *hdev, void *data) +int gaudi_debug_coresight(struct hl_device *hdev, struct hl_ctx *ctx, void *data) { struct hl_debug_params *params = data; int rc = 0; @@ -887,7 +887,7 @@ int gaudi_debug_coresight(struct hl_device *hdev, void *data) return rc; } -void gaudi_halt_coresight(struct hl_device *hdev) +void gaudi_halt_coresight(struct hl_device *hdev, struct hl_ctx *ctx) { struct hl_debug_params params = {}; int i, rc; diff --git a/drivers/misc/habanalabs/goya/goyaP.h b/drivers/misc/habanalabs/goya/goyaP.h index f0c3c6df04d5..3740fd25bf84 100644 --- a/drivers/misc/habanalabs/goya/goyaP.h +++ b/drivers/misc/habanalabs/goya/goyaP.h @@ -220,8 +220,8 @@ void goya_set_pll_profile(struct hl_device *hdev, enum hl_pll_frequency freq); void goya_add_device_attr(struct hl_device *hdev, struct attribute_group *dev_attr_grp); int goya_cpucp_info_get(struct hl_device *hdev); -int goya_debug_coresight(struct hl_device *hdev, void *data); -void goya_halt_coresight(struct hl_device *hdev); +int goya_debug_coresight(struct hl_device *hdev, struct hl_ctx *ctx, void *data); +void goya_halt_coresight(struct hl_device *hdev, struct hl_ctx *ctx); int goya_suspend(struct hl_device *hdev); int goya_resume(struct hl_device *hdev); diff --git a/drivers/misc/habanalabs/goya/goya_coresight.c b/drivers/misc/habanalabs/goya/goya_coresight.c index c55c100fdd24..2c5133cfae65 100644 --- a/drivers/misc/habanalabs/goya/goya_coresight.c +++ b/drivers/misc/habanalabs/goya/goya_coresight.c @@ -652,7 +652,7 @@ static int 
goya_config_spmu(struct hl_device *hdev, return 0; } -int goya_debug_coresight(struct hl_device *hdev, void *data) +int goya_debug_coresight(struct hl_device *hdev, struct hl_ctx *ctx, void *data) { struct hl_debug_params *params = data; int rc = 0; @@ -691,7 +691,7 @@ int goya_debug_coresight(struct hl_device *hdev, void *data) return rc; } -void goya_halt_coresight(struct hl_device *hdev) +void goya_halt_coresight(struct hl_device *hdev, struct hl_ctx *ctx) { struct hl_debug_params params = {}; int i, rc; -- cgit v1.2.3 From 4337b50b5fe5ee64c821790f601ee6153bb9f027 Mon Sep 17 00:00:00 2001 From: Oded Gabbay Date: Tue, 30 Nov 2021 23:02:21 +0200 Subject: habanalabs: add helper to get compute context There are multiple places where the code needs to get the context's pointer and increment its ref cnt. This is the proper way instead of using the compute context pointer in the device structure. Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/common/context.c | 23 +++++++++++++++++++++++ drivers/misc/habanalabs/common/debugfs.c | 14 ++++++-------- drivers/misc/habanalabs/common/device.c | 13 ++++++------- drivers/misc/habanalabs/common/habanalabs.h | 1 + 4 files changed, 36 insertions(+), 15 deletions(-) (limited to 'drivers/misc') diff --git a/drivers/misc/habanalabs/common/context.c b/drivers/misc/habanalabs/common/context.c index 8de1217b2ed2..b2884107fa15 100644 --- a/drivers/misc/habanalabs/common/context.c +++ b/drivers/misc/habanalabs/common/context.c @@ -272,6 +272,29 @@ int hl_ctx_put(struct hl_ctx *ctx) return kref_put(&ctx->refcount, hl_ctx_do_release); } +struct hl_ctx *hl_get_compute_ctx(struct hl_device *hdev) +{ + struct hl_ctx *ctx = NULL; + struct hl_fpriv *hpriv; + + mutex_lock(&hdev->fpriv_list_lock); + + list_for_each_entry(hpriv, &hdev->fpriv_list, dev_node) { + /* There can only be a single user which has opened the compute device, so exit + * immediately once we find him + */ + if (!hpriv->is_control) { + ctx = hpriv->ctx; + 
hl_ctx_get(hdev, ctx); + break; + } + } + + mutex_unlock(&hdev->fpriv_list_lock); + + return ctx; +} + /* * hl_ctx_get_fence_locked - get CS fence under CS lock * diff --git a/drivers/misc/habanalabs/common/debugfs.c b/drivers/misc/habanalabs/common/debugfs.c index 9727d82b121f..2e9c31d79d5e 100644 --- a/drivers/misc/habanalabs/common/debugfs.c +++ b/drivers/misc/habanalabs/common/debugfs.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Copyright 2016-2019 HabanaLabs, Ltd. + * Copyright 2016-2021 HabanaLabs, Ltd. * All Rights Reserved. */ @@ -327,11 +327,7 @@ static int vm_show(struct seq_file *s, void *data) spin_unlock(&dev_entry->ctx_mem_hash_spinlock); - mutex_lock(&dev_entry->hdev->fpriv_list_lock); - ctx = dev_entry->hdev->compute_ctx; - if (ctx) - hl_ctx_get(dev_entry->hdev, ctx); - mutex_unlock(&dev_entry->hdev->fpriv_list_lock); + ctx = hl_get_compute_ctx(dev_entry->hdev); if (ctx) { seq_puts(s, "\nVA ranges:\n\n"); for (i = HL_VA_RANGE_TYPE_HOST ; i < HL_VA_RANGE_TYPE_MAX ; ++i) { @@ -443,7 +439,7 @@ static int mmu_show(struct seq_file *s, void *data) if (dev_entry->mmu_asid == HL_KERNEL_ASID_ID) ctx = hdev->kernel_ctx; else - ctx = hdev->compute_ctx; + ctx = hl_get_compute_ctx(hdev); if (!ctx) { dev_err(hdev->dev, "no ctx available\n"); @@ -596,7 +592,7 @@ static int device_va_to_pa(struct hl_device *hdev, u64 virt_addr, u32 size, u64 *phys_addr) { struct hl_vm_phys_pg_pack *phys_pg_pack; - struct hl_ctx *ctx = hdev->compute_ctx; + struct hl_ctx *ctx; struct hl_vm_hash_node *hnode; u64 end_address, range_size; struct hl_userptr *userptr; @@ -604,6 +600,8 @@ static int device_va_to_pa(struct hl_device *hdev, u64 virt_addr, u32 size, bool valid = false; int i, rc = 0; + ctx = hl_get_compute_ctx(hdev); + if (!ctx) { dev_err(hdev->dev, "no ctx available\n"); return -EINVAL; diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c index bc5736ae6b70..407f6c5020c7 100644 --- 
a/drivers/misc/habanalabs/common/device.c +++ b/drivers/misc/habanalabs/common/device.c @@ -961,6 +961,7 @@ int hl_device_reset(struct hl_device *hdev, u32 flags) bool hard_reset, from_hard_reset_thread, fw_reset, hard_instead_soft = false, reset_upon_device_release = false; u64 idle_mask[HL_BUSY_ENGINES_MASK_EXT_SIZE] = {0}; + struct hl_ctx *ctx; int i, rc; if (!hdev->init_done) { @@ -1101,16 +1102,14 @@ kill_processes: for (i = 0 ; i < hdev->asic_prop.completion_queues_count ; i++) hl_cq_reset(hdev, &hdev->completion_queue[i]); - mutex_lock(&hdev->fpriv_list_lock); - /* Make sure the context switch phase will run again */ - if (hdev->compute_ctx) { - atomic_set(&hdev->compute_ctx->thread_ctx_switch_token, 1); - hdev->compute_ctx->thread_ctx_switch_wait_token = 0; + ctx = hl_get_compute_ctx(hdev); + if (ctx) { + atomic_set(&ctx->thread_ctx_switch_token, 1); + ctx->thread_ctx_switch_wait_token = 0; + hl_ctx_put(ctx); } - mutex_unlock(&hdev->fpriv_list_lock); - /* Finished tear-down, starting to re-initialize */ if (hard_reset) { diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h index 670fad9b4ca0..eec96e506bb0 100644 --- a/drivers/misc/habanalabs/common/habanalabs.h +++ b/drivers/misc/habanalabs/common/habanalabs.h @@ -2906,6 +2906,7 @@ int hl_ctx_init(struct hl_device *hdev, struct hl_ctx *ctx, bool is_kernel_ctx); void hl_ctx_do_release(struct kref *ref); void hl_ctx_get(struct hl_device *hdev, struct hl_ctx *ctx); int hl_ctx_put(struct hl_ctx *ctx); +struct hl_ctx *hl_get_compute_ctx(struct hl_device *hdev); struct hl_fence *hl_ctx_get_fence(struct hl_ctx *ctx, u64 seq); int hl_ctx_get_fences(struct hl_ctx *ctx, u64 *seq_arr, struct hl_fence **fence, u32 arr_len); -- cgit v1.2.3 From 5b90e59d55d94aa939fae941db4a0e613e6ecc1e Mon Sep 17 00:00:00 2001 From: Oded Gabbay Date: Tue, 30 Nov 2021 23:08:21 +0200 Subject: habanalabs: remove compute context pointer It was an error to save the compute context's pointer in 
the device structure, as it allowed its use without proper ref-cnt. Change the variable to a flag that only indicates whether there is an active compute context. Code that needs the pointer will now be forced to use proper internal APIs to get the pointer. Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/common/context.c | 2 +- drivers/misc/habanalabs/common/device.c | 10 +++++----- drivers/misc/habanalabs/common/habanalabs.h | 5 ++--- drivers/misc/habanalabs/common/habanalabs_drv.c | 2 +- drivers/misc/habanalabs/goya/goya.c | 4 ++-- drivers/misc/habanalabs/goya/goya_hwmgr.c | 4 ++-- 6 files changed, 13 insertions(+), 14 deletions(-) (limited to 'drivers/misc') diff --git a/drivers/misc/habanalabs/common/context.c b/drivers/misc/habanalabs/common/context.c index b2884107fa15..49e6f1172d18 100644 --- a/drivers/misc/habanalabs/common/context.c +++ b/drivers/misc/habanalabs/common/context.c @@ -165,7 +165,7 @@ int hl_ctx_create(struct hl_device *hdev, struct hl_fpriv *hpriv) hpriv->ctx = ctx; /* TODO: remove the following line for multiple process support */ - hdev->compute_ctx = ctx; + hdev->is_compute_ctx_active = true; return 0; diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c index 407f6c5020c7..bea05a59425f 100644 --- a/drivers/misc/habanalabs/common/device.c +++ b/drivers/misc/habanalabs/common/device.c @@ -97,12 +97,12 @@ static void hpriv_release(struct kref *ref) || hdev->reset_upon_device_release) hl_device_reset(hdev, HL_DRV_RESET_DEV_RELEASE); - /* Now we can mark the compute_ctx as empty. Even if a reset is running in a different + /* Now we can mark the compute_ctx as not active. Even if a reset is running in a different * thread, we don't care because the in_reset is marked so if a user will try to open - * the device it will fail on that, even if compute_ctx is NULL. + * the device it will fail on that, even if compute_ctx is false. 
*/ mutex_lock(&hdev->fpriv_list_lock); - hdev->compute_ctx = NULL; + hdev->is_compute_ctx_active = false; mutex_unlock(&hdev->fpriv_list_lock); kfree(hpriv); @@ -1150,7 +1150,7 @@ kill_processes: goto out_err; } - hdev->compute_ctx = NULL; + hdev->is_compute_ctx_active = false; rc = hl_ctx_init(hdev, hdev->kernel_ctx, true); if (rc) { @@ -1403,7 +1403,7 @@ int hl_device_init(struct hl_device *hdev, struct class *hclass) goto mmu_fini; } - hdev->compute_ctx = NULL; + hdev->is_compute_ctx_active = false; hdev->asic_funcs->state_dump_init(hdev); diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h index eec96e506bb0..df1935952c28 100644 --- a/drivers/misc/habanalabs/common/habanalabs.h +++ b/drivers/misc/habanalabs/common/habanalabs.h @@ -2503,7 +2503,6 @@ struct last_error_session_info { * @fpriv_list: list of file private data structures. Each structure is created * when a user opens the device * @fpriv_list_lock: protects the fpriv_list - * @compute_ctx: current compute context executing. * @aggregated_cs_counters: aggregated cs counters among all contexts * @mmu_priv: device-specific MMU data. * @mmu_func: device-related MMU functions. @@ -2601,6 +2600,7 @@ struct last_error_session_info { * cases where Linux was not loaded to device CPU * @supports_wait_for_multi_cs: true if wait for multi CS is supported * @is_in_soft_reset: Device is currently in soft reset process. + * @is_compute_ctx_active: Whether there is an active compute context executing. 
*/ struct hl_device { struct pci_dev *pdev; @@ -2656,8 +2656,6 @@ struct hl_device { struct list_head fpriv_list; struct mutex fpriv_list_lock; - struct hl_ctx *compute_ctx; - struct hl_cs_counters_atomic aggregated_cs_counters; struct hl_mmu_priv mmu_priv; @@ -2730,6 +2728,7 @@ struct hl_device { u8 supports_wait_for_multi_cs; u8 stream_master_qid_arr_size; u8 is_in_soft_reset; + u8 is_compute_ctx_active; /* Parameters for bring-up */ u64 nic_ports_mask; diff --git a/drivers/misc/habanalabs/common/habanalabs_drv.c b/drivers/misc/habanalabs/common/habanalabs_drv.c index d4ef99952d15..62a02ef43bb7 100644 --- a/drivers/misc/habanalabs/common/habanalabs_drv.c +++ b/drivers/misc/habanalabs/common/habanalabs_drv.c @@ -161,7 +161,7 @@ int hl_device_open(struct inode *inode, struct file *filp) goto out_err; } - if (hdev->compute_ctx) { + if (hdev->is_compute_ctx_active) { dev_dbg_ratelimited(hdev->dev, "Can't open %s because another user is working on it\n", dev_name(hdev->dev)); diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c index e54d60e75854..8d0f2cd608fc 100644 --- a/drivers/misc/habanalabs/goya/goya.c +++ b/drivers/misc/habanalabs/goya/goya.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Copyright 2016-2019 HabanaLabs, Ltd. + * Copyright 2016-2021 HabanaLabs, Ltd. * All Rights Reserved. */ @@ -827,7 +827,7 @@ static void goya_set_freq_to_low_job(struct work_struct *work) mutex_lock(&hdev->fpriv_list_lock); - if (!hdev->compute_ctx) + if (!hdev->is_compute_ctx_active) goya_set_frequency(hdev, PLL_LOW); mutex_unlock(&hdev->fpriv_list_lock); diff --git a/drivers/misc/habanalabs/goya/goya_hwmgr.c b/drivers/misc/habanalabs/goya/goya_hwmgr.c index 42985a85b625..76b47749affe 100644 --- a/drivers/misc/habanalabs/goya/goya_hwmgr.c +++ b/drivers/misc/habanalabs/goya/goya_hwmgr.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Copyright 2016-2019 HabanaLabs, Ltd. + * Copyright 2016-2021 HabanaLabs, Ltd. 
* All Rights Reserved. */ @@ -258,7 +258,7 @@ static ssize_t pm_mng_profile_store(struct device *dev, mutex_lock(&hdev->fpriv_list_lock); - if (hdev->compute_ctx) { + if (hdev->is_compute_ctx_active) { dev_err(hdev->dev, "Can't change PM profile while compute context is opened on the device\n"); count = -EPERM; -- cgit v1.2.3 From b02220536cb66ce1e357d78c944d6be07f1e1051 Mon Sep 17 00:00:00 2001 From: Ohad Sharabi Date: Wed, 1 Dec 2021 10:52:27 +0200 Subject: habanalabs: wait again for multi-CS if no CS completed The original multi-CS design assumption that stream masters are used exclusively (i.e. multi-CS with set of stream master QIDs will not get completed by CS not from the multi-CS set) is inaccurate. Thus multi-CS behavior is now modified not to treat such case as an error. Instead, if we have multi-CS completion but we detect that no CS from the list is actually completed we will do another multi-CS wait (with modified timeout). Signed-off-by: Ohad Sharabi Reviewed-by: Dani Liberman Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- .../misc/habanalabs/common/command_submission.c | 97 +++++++++++----------- drivers/misc/habanalabs/common/habanalabs.h | 4 +- 2 files changed, 50 insertions(+), 51 deletions(-) (limited to 'drivers/misc') diff --git a/drivers/misc/habanalabs/common/command_submission.c b/drivers/misc/habanalabs/common/command_submission.c index a63ebbc04787..f58fff3671d6 100644 --- a/drivers/misc/habanalabs/common/command_submission.c +++ b/drivers/misc/habanalabs/common/command_submission.c @@ -545,13 +545,6 @@ static void complete_multi_cs(struct hl_device *hdev, struct hl_cs *cs) * mcs fences. */ fence->mcs_handling_done = true; - /* - * Since CS (and its related fence) can be associated with only one - * multi CS context, once it triggered multi CS completion no need to - * continue checking other multi CS contexts. 
- */ - spin_unlock(&mcs_compl->lock); - break; } spin_unlock(&mcs_compl->lock); @@ -2498,6 +2491,21 @@ static int _hl_cs_wait_ioctl(struct hl_device *hdev, struct hl_ctx *ctx, return rc; } +static inline unsigned long hl_usecs64_to_jiffies(const u64 usecs) +{ + if (usecs <= U32_MAX) + return usecs_to_jiffies(usecs); + + /* + * If the value in nanoseconds is larger than 64 bit, use the largest + * 64 bit value. + */ + if (usecs >= ((u64)(U64_MAX / NSEC_PER_USEC))) + return nsecs_to_jiffies(U64_MAX); + + return nsecs_to_jiffies(usecs * NSEC_PER_USEC); +} + /* * hl_wait_multi_cs_completion_init - init completion structure * @@ -2534,8 +2542,7 @@ static struct multi_cs_completion *hl_wait_multi_cs_completion_init( } if (i == MULTI_CS_MAX_USER_CTX) { - dev_err(hdev->dev, - "no available multi-CS completion structure\n"); + dev_err(hdev->dev, "no available multi-CS completion structure\n"); return ERR_PTR(-ENOMEM); } return mcs_compl; @@ -2566,27 +2573,18 @@ static void hl_wait_multi_cs_completion_fini( * * @return 0 on success, otherwise non 0 error code */ -static int hl_wait_multi_cs_completion(struct multi_cs_data *mcs_data) +static int hl_wait_multi_cs_completion(struct multi_cs_data *mcs_data, + struct multi_cs_completion *mcs_compl) { - struct hl_device *hdev = mcs_data->ctx->hdev; - struct multi_cs_completion *mcs_compl; long completion_rc; - mcs_compl = hl_wait_multi_cs_completion_init(hdev, - mcs_data->stream_master_qid_map); - if (IS_ERR(mcs_compl)) - return PTR_ERR(mcs_compl); - - completion_rc = wait_for_completion_interruptible_timeout( - &mcs_compl->completion, - usecs_to_jiffies(mcs_data->timeout_us)); + completion_rc = wait_for_completion_interruptible_timeout(&mcs_compl->completion, + mcs_data->timeout_jiffies); /* update timestamp */ if (completion_rc > 0) mcs_data->timestamp = mcs_compl->timestamp; - hl_wait_multi_cs_completion_fini(mcs_compl); - mcs_data->wait_status = completion_rc; return 0; @@ -2619,6 +2617,7 @@ void 
hl_multi_cs_completion_init(struct hl_device *hdev) */ static int hl_multi_cs_wait_ioctl(struct hl_fpriv *hpriv, void *data) { + struct multi_cs_completion *mcs_compl; struct hl_device *hdev = hpriv->hdev; struct multi_cs_data mcs_data = {0}; union hl_wait_cs_args *args = data; @@ -2686,12 +2685,19 @@ static int hl_multi_cs_wait_ioctl(struct hl_fpriv *hpriv, void *data) goto put_ctx; /* wait (with timeout) for the first CS to be completed */ - mcs_data.timeout_us = args->in.timeout_us; - rc = hl_wait_multi_cs_completion(&mcs_data); - if (rc) + mcs_data.timeout_jiffies = hl_usecs64_to_jiffies(args->in.timeout_us); + + mcs_compl = hl_wait_multi_cs_completion_init(hdev, mcs_data.stream_master_qid_map); + if (IS_ERR(mcs_compl)) { + rc = PTR_ERR(mcs_compl); goto put_ctx; + } + + while (true) { + rc = hl_wait_multi_cs_completion(&mcs_data, mcs_compl); + if (rc || (mcs_data.wait_status == 0)) + break; - if (mcs_data.wait_status > 0) { /* * poll fences once again to update the CS map. * no timestamp should be updated this time. @@ -2699,18 +2705,26 @@ static int hl_multi_cs_wait_ioctl(struct hl_fpriv *hpriv, void *data) mcs_data.update_ts = false; rc = hl_cs_poll_fences(&mcs_data); + if (mcs_data.completion_bitmap) + break; + /* * if hl_wait_multi_cs_completion returned before timeout (i.e. - * it got a completion) we expect to see at least one CS - * completed after the poll function. 
+ * it got a completion) it either got completed by CS in the multi CS list + * (in which case the indication will be non empty completion_bitmap) or it + * got completed by CS submitted to one of the shared stream master but + * not in the multi CS list (in which case we should wait again but reinit + * the completion, modify the timeout and set timestamp as zero to let a CS + * related to the current multi-CS set a new, relevant, timestamp) */ - if (!mcs_data.completion_bitmap) { - dev_warn_ratelimited(hdev->dev, - "Multi-CS got completion on wait but no CS completed\n"); - rc = -EFAULT; - } + /* wait again with modified timeout */ + mcs_data.timeout_jiffies = mcs_data.wait_status; + reinit_completion(&mcs_compl->completion); + mcs_compl->timestamp = 0; } + hl_wait_multi_cs_completion_fini(mcs_compl); + put_ctx: hl_ctx_put(ctx); kfree(fence_arr); @@ -2741,7 +2755,7 @@ free_seq_arr: } /* update if some CS was gone */ - if (mcs_data.timestamp) + if (!mcs_data.timestamp) args->out.flags |= HL_WAIT_CS_STATUS_FLAG_GONE; } else { args->out.status = HL_WAIT_CS_STATUS_BUSY; @@ -2807,21 +2821,6 @@ static int hl_cs_wait_ioctl(struct hl_fpriv *hpriv, void *data) return 0; } -static inline unsigned long hl_usecs64_to_jiffies(const u64 usecs) -{ - if (usecs <= U32_MAX) - return usecs_to_jiffies(usecs); - - /* - * If the value in nanoseconds is larger than 64 bit, use the largest - * 64 bit value. 
- */ - if (usecs >= ((u64)(U64_MAX / NSEC_PER_USEC))) - return nsecs_to_jiffies(U64_MAX); - - return nsecs_to_jiffies(usecs * NSEC_PER_USEC); -} - static int _hl_interrupt_wait_ioctl(struct hl_device *hdev, struct hl_ctx *ctx, u64 timeout_us, u64 user_address, u64 target_value, struct hl_user_interrupt *interrupt, diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h index df1935952c28..eda1c70f6966 100644 --- a/drivers/misc/habanalabs/common/habanalabs.h +++ b/drivers/misc/habanalabs/common/habanalabs.h @@ -2362,7 +2362,7 @@ struct multi_cs_completion { * @ctx: pointer to the context structure * @fence_arr: array of fences of all CSs * @seq_arr: array of CS sequence numbers - * @timeout_us: timeout in usec for waiting for CS to complete + * @timeout_jiffies: timeout in jiffies for waiting for CS to complete * @timestamp: timestamp of first completed CS * @wait_status: wait for CS status * @completion_bitmap: bitmap of completed CSs (1- completed, otherwise 0) @@ -2376,7 +2376,7 @@ struct multi_cs_data { struct hl_ctx *ctx; struct hl_fence **fence_arr; u64 *seq_arr; - s64 timeout_us; + s64 timeout_jiffies; s64 timestamp; long wait_status; u32 completion_bitmap; -- cgit v1.2.3 From 7c623ef732bdba440b1f0e74a99265cb7587df7e Mon Sep 17 00:00:00 2001 From: Ofir Bitton Date: Tue, 7 Dec 2021 11:20:46 +0200 Subject: habanalabs: return correct clock throttling period Current clock throttling period returned from driver was wrong due to wrong time comparison. 
Signed-off-by: Ofir Bitton Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/common/habanalabs_ioctl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers/misc') diff --git a/drivers/misc/habanalabs/common/habanalabs_ioctl.c b/drivers/misc/habanalabs/common/habanalabs_ioctl.c index 9210114beefe..f571641c19ae 100644 --- a/drivers/misc/habanalabs/common/habanalabs_ioctl.c +++ b/drivers/misc/habanalabs/common/habanalabs_ioctl.c @@ -335,9 +335,9 @@ static int clk_throttle_info(struct hl_fpriv *hpriv, struct hl_info_args *args) ktime_to_us(hdev->clk_throttling.timestamp[i].start); if (ktime_compare(hdev->clk_throttling.timestamp[i].end, zero_time)) - end_time = ktime_get(); - else end_time = hdev->clk_throttling.timestamp[i].end; + else + end_time = ktime_get(); clk_throttle.clk_throttling_duration_ns[i] = ktime_to_ns(ktime_sub(end_time, -- cgit v1.2.3 From 7363805b8a52c9f5650f957a34a30788cc7ce4c2 Mon Sep 17 00:00:00 2001 From: Oded Gabbay Date: Wed, 8 Dec 2021 16:25:07 +0200 Subject: habanalabs: remove in_debug check in device open The driver supports only a single user anyway, so there is no point in checking whether we are in_debug state when a user tries to open the device, because if we are in_debug, it means a user is already using the device. Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/common/habanalabs.h | 5 +++-- drivers/misc/habanalabs/common/habanalabs_drv.c | 8 -------- 2 files changed, 3 insertions(+), 10 deletions(-) (limited to 'drivers/misc') diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h index eda1c70f6966..362eee3f028c 100644 --- a/drivers/misc/habanalabs/common/habanalabs.h +++ b/drivers/misc/habanalabs/common/habanalabs.h @@ -2561,8 +2561,9 @@ struct last_error_session_info { * @init_done: is the initialization of the device done. 
* @device_cpu_disabled: is the device CPU disabled (due to timeouts) * @dma_mask: the dma mask that was set for this device - * @in_debug: is device under debug. This, together with fpriv_list, enforces - * that only a single user is configuring the debug infrastructure. + * @in_debug: whether the device is in a state where the profiling/tracing infrastructure + * can be used. This indication is needed because in some ASICs we need to do + * specific operations to enable that infrastructure. * @power9_64bit_dma_enable: true to enable 64-bit DMA mask support. Relevant * only to POWER9 machines. * @cdev_sysfs_created: were char devices and sysfs nodes created. diff --git a/drivers/misc/habanalabs/common/habanalabs_drv.c b/drivers/misc/habanalabs/common/habanalabs_drv.c index 62a02ef43bb7..d59201f93de9 100644 --- a/drivers/misc/habanalabs/common/habanalabs_drv.c +++ b/drivers/misc/habanalabs/common/habanalabs_drv.c @@ -153,14 +153,6 @@ int hl_device_open(struct inode *inode, struct file *filp) goto out_err; } - if (hdev->in_debug) { - dev_err_ratelimited(hdev->dev, - "Can't open %s because it is being debugged by another user\n", - dev_name(hdev->dev)); - rc = -EPERM; - goto out_err; - } - if (hdev->is_compute_ctx_active) { dev_dbg_ratelimited(hdev->dev, "Can't open %s because another user is working on it\n", -- cgit v1.2.3 From 9acdc21b0b04f370c306b7d95c296c7f22660fc0 Mon Sep 17 00:00:00 2001 From: Ofir Bitton Date: Wed, 8 Dec 2021 21:46:29 +0200 Subject: habanalabs: add current PI value to cpu packets In order to increase cpucp messaging reliability we will add the current PI value to the descriptor sent to F/W. F/W will wait for the PI value as an indication of a valid packet. 
Signed-off-by: Ofir Bitton Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/common/firmware_if.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'drivers/misc') diff --git a/drivers/misc/habanalabs/common/firmware_if.c b/drivers/misc/habanalabs/common/firmware_if.c index 76741898d922..34e70cca37c1 100644 --- a/drivers/misc/habanalabs/common/firmware_if.c +++ b/drivers/misc/habanalabs/common/firmware_if.c @@ -212,7 +212,8 @@ int hl_fw_send_cpu_message(struct hl_device *hdev, u32 hw_queue_id, u32 *msg, struct asic_fixed_properties *prop = &hdev->asic_prop; struct cpucp_packet *pkt; dma_addr_t pkt_dma_addr; - u32 tmp, expected_ack_val; + struct hl_bd *sent_bd; + u32 tmp, expected_ack_val, pi; int rc = 0; pkt = hdev->asic_funcs->cpu_accessible_dma_pool_alloc(hdev, len, @@ -237,6 +238,7 @@ int hl_fw_send_cpu_message(struct hl_device *hdev, u32 hw_queue_id, u32 *msg, /* set fence to a non valid value */ pkt->fence = cpu_to_le32(UINT_MAX); + pi = queue->pi; /* * The CPU queue is a synchronous queue with an effective depth of @@ -246,7 +248,7 @@ int hl_fw_send_cpu_message(struct hl_device *hdev, u32 hw_queue_id, u32 *msg, * Which means that we don't need to lock the access to the entire H/W * queues module when submitting a JOB to the CPU queue. */ - hl_hw_queue_submit_bd(hdev, queue, 0, len, pkt_dma_addr); + hl_hw_queue_submit_bd(hdev, queue, hl_queue_inc_ptr(queue->pi), len, pkt_dma_addr); if (prop->fw_app_cpu_boot_dev_sts0 & CPU_BOOT_DEV_STS0_PKT_PI_ACK_EN) expected_ack_val = queue->pi; @@ -278,6 +280,14 @@ int hl_fw_send_cpu_message(struct hl_device *hdev, u32 hw_queue_id, u32 *msg, *result = le64_to_cpu(pkt->result); } + /* Scrub previous buffer descriptor 'ctl' field which contains the + * previous PI value written during packet submission. + * We must do this or else F/W can read an old value upon queue wraparound. 
+ */ + sent_bd = queue->kernel_address; + sent_bd += hl_pi_2_offset(pi); + sent_bd->ctl = cpu_to_le32(UINT_MAX); + out: mutex_unlock(&hdev->send_cpu_message_lock); -- cgit v1.2.3 From bb099a805104568c8babbf94824507b0d72ba232 Mon Sep 17 00:00:00 2001 From: Oded Gabbay Date: Sun, 12 Dec 2021 16:40:24 +0200 Subject: habanalabs: fix hwmon handling for legacy f/w In legacy f/w that use old hwmon.h file, the values of the hwmon enums are different than the values that are in newer kernels (5.6 and above). Therefore, to support working with those f/w, we need to do some fixup before registering with the hwmon subsystem and also when calling the functions that communicate with the f/w to retrieve sensors information. Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/common/hwmon.c | 201 +++++++++++++++++++++++++++------ 1 file changed, 169 insertions(+), 32 deletions(-) (limited to 'drivers/misc') diff --git a/drivers/misc/habanalabs/common/hwmon.c b/drivers/misc/habanalabs/common/hwmon.c index 70182b42940d..57f5d2c48330 100644 --- a/drivers/misc/habanalabs/common/hwmon.c +++ b/drivers/misc/habanalabs/common/hwmon.c @@ -10,17 +10,148 @@ #include #include -#define HWMON_NR_SENSOR_TYPES (hwmon_pwm + 1) +#define HWMON_NR_SENSOR_TYPES (hwmon_max) -int hl_build_hwmon_channel_info(struct hl_device *hdev, - struct cpucp_sensor *sensors_arr) +#ifdef _HAS_HWMON_HWMON_T_ENABLE + +static u32 fixup_flags_legacy_fw(struct hl_device *hdev, enum hwmon_sensor_types type, + u32 cpucp_flags) { - u32 counts[HWMON_NR_SENSOR_TYPES] = {0}; - u32 *sensors_by_type[HWMON_NR_SENSOR_TYPES] = {NULL}; + u32 flags; + + switch (type) { + case hwmon_temp: + flags = (cpucp_flags << 1) | HWMON_T_ENABLE; + break; + + case hwmon_in: + flags = (cpucp_flags << 1) | HWMON_I_ENABLE; + break; + + case hwmon_curr: + flags = (cpucp_flags << 1) | HWMON_C_ENABLE; + break; + + case hwmon_fan: + flags = (cpucp_flags << 1) | HWMON_F_ENABLE; + break; + + case hwmon_power: + flags = (cpucp_flags << 1) | 
HWMON_P_ENABLE; + break; + + case hwmon_pwm: + /* enable bit was here from day 1, so no need to adjust */ + flags = cpucp_flags; + break; + + default: + dev_err(hdev->dev, "unsupported h/w sensor type %d\n", type); + flags = cpucp_flags; + break; + } + + return flags; +} + +static u32 fixup_attr_legacy_fw(u32 attr) +{ + return (attr - 1); +} + +#else + +static u32 fixup_flags_legacy_fw(struct hl_device *hdev, enum hwmon_sensor_types type, + u32 cpucp_flags) +{ + return cpucp_flags; +} + +static u32 fixup_attr_legacy_fw(u32 attr) +{ + return attr; +} + +#endif /* !_HAS_HWMON_HWMON_T_ENABLE */ + +static u32 adjust_hwmon_flags(struct hl_device *hdev, enum hwmon_sensor_types type, u32 cpucp_flags) +{ + u32 flags, cpucp_input_val; + bool use_cpucp_enum; + + use_cpucp_enum = (hdev->asic_prop.fw_app_cpu_boot_dev_sts0 & + CPU_BOOT_DEV_STS0_MAP_HWMON_EN) ? true : false; + + /* If f/w is using its own enum, we need to check if the properties values are aligned. + * If not, it means we need to adjust the values to the new format that is used in the + * kernel since 5.6 (enum values were incremented by 1 by adding a new enable value).
+ */ + if (use_cpucp_enum) { + switch (type) { + case hwmon_temp: + cpucp_input_val = cpucp_temp_input; + if (cpucp_input_val == hwmon_temp_input) + flags = cpucp_flags; + else + flags = (cpucp_flags << 1) | HWMON_T_ENABLE; + break; + + case hwmon_in: + cpucp_input_val = cpucp_in_input; + if (cpucp_input_val == hwmon_in_input) + flags = cpucp_flags; + else + flags = (cpucp_flags << 1) | HWMON_I_ENABLE; + break; + + case hwmon_curr: + cpucp_input_val = cpucp_curr_input; + if (cpucp_input_val == hwmon_curr_input) + flags = cpucp_flags; + else + flags = (cpucp_flags << 1) | HWMON_C_ENABLE; + break; + + case hwmon_fan: + cpucp_input_val = cpucp_fan_input; + if (cpucp_input_val == hwmon_fan_input) + flags = cpucp_flags; + else + flags = (cpucp_flags << 1) | HWMON_F_ENABLE; + break; + + case hwmon_pwm: + /* enable bit was here from day 1, so no need to adjust */ + flags = cpucp_flags; + break; + + case hwmon_power: + cpucp_input_val = CPUCP_POWER_INPUT; + if (cpucp_input_val == hwmon_power_input) + flags = cpucp_flags; + else + flags = (cpucp_flags << 1) | HWMON_P_ENABLE; + break; + + default: + dev_err(hdev->dev, "unsupported h/w sensor type %d\n", type); + flags = cpucp_flags; + break; + } + } else { + flags = fixup_flags_legacy_fw(hdev, type, cpucp_flags); + } + + return flags; +} + +int hl_build_hwmon_channel_info(struct hl_device *hdev, struct cpucp_sensor *sensors_arr) +{ + u32 num_sensors_for_type, flags, num_active_sensor_types = 0, arr_size = 0, *curr_arr; u32 sensors_by_type_next_index[HWMON_NR_SENSOR_TYPES] = {0}; + u32 *sensors_by_type[HWMON_NR_SENSOR_TYPES] = {NULL}; struct hwmon_channel_info **channels_info; - u32 num_sensors_for_type, num_active_sensor_types = 0, - arr_size = 0, *curr_arr; + u32 counts[HWMON_NR_SENSOR_TYPES] = {0}; enum hwmon_sensor_types type; int rc, i, j; @@ -31,8 +162,7 @@ int hl_build_hwmon_channel_info(struct hl_device *hdev, break; if (type >= HWMON_NR_SENSOR_TYPES) { - dev_err(hdev->dev, - "Got wrong sensor type %d from device\n", 
type); + dev_err(hdev->dev, "Got wrong sensor type %d from device\n", type); return -EINVAL; } @@ -45,8 +175,9 @@ int hl_build_hwmon_channel_info(struct hl_device *hdev, continue; num_sensors_for_type = counts[i] + 1; - curr_arr = kcalloc(num_sensors_for_type, sizeof(*curr_arr), - GFP_KERNEL); + dev_dbg(hdev->dev, "num_sensors_for_type %d = %d\n", i, num_sensors_for_type); + + curr_arr = kcalloc(num_sensors_for_type, sizeof(*curr_arr), GFP_KERNEL); if (!curr_arr) { rc = -ENOMEM; goto sensors_type_err; @@ -59,20 +190,18 @@ int hl_build_hwmon_channel_info(struct hl_device *hdev, for (i = 0 ; i < arr_size ; i++) { type = le32_to_cpu(sensors_arr[i].type); curr_arr = sensors_by_type[type]; - curr_arr[sensors_by_type_next_index[type]++] = - le32_to_cpu(sensors_arr[i].flags); + flags = adjust_hwmon_flags(hdev, type, le32_to_cpu(sensors_arr[i].flags)); + curr_arr[sensors_by_type_next_index[type]++] = flags; } - channels_info = kcalloc(num_active_sensor_types + 1, - sizeof(*channels_info), GFP_KERNEL); + channels_info = kcalloc(num_active_sensor_types + 1, sizeof(*channels_info), GFP_KERNEL); if (!channels_info) { rc = -ENOMEM; goto channels_info_array_err; } for (i = 0 ; i < num_active_sensor_types ; i++) { - channels_info[i] = kzalloc(sizeof(*channels_info[i]), - GFP_KERNEL); + channels_info[i] = kzalloc(sizeof(*channels_info[i]), GFP_KERNEL); if (!channels_info[i]) { rc = -ENOMEM; goto channel_info_err; @@ -88,18 +217,19 @@ int hl_build_hwmon_channel_info(struct hl_device *hdev, j++; } - hdev->hl_chip_info->info = - (const struct hwmon_channel_info **)channels_info; + hdev->hl_chip_info->info = (const struct hwmon_channel_info **)channels_info; return 0; channel_info_err: - for (i = 0 ; i < num_active_sensor_types ; i++) + for (i = 0 ; i < num_active_sensor_types ; i++) { if (channels_info[i]) { kfree(channels_info[i]->config); kfree(channels_info[i]); } + } kfree(channels_info); + channels_info_array_err: sensors_type_err: for (i = 0 ; i < HWMON_NR_SENSOR_TYPES ; i++) 
@@ -112,14 +242,16 @@ static int hl_read(struct device *dev, enum hwmon_sensor_types type, u32 attr, int channel, long *val) { struct hl_device *hdev = dev_get_drvdata(dev); - int rc; + bool use_cpucp_enum; u32 cpucp_attr; - bool use_cpucp_enum = (hdev->asic_prop.fw_app_cpu_boot_dev_sts0 & - CPU_BOOT_DEV_STS0_MAP_HWMON_EN) ? true : false; + int rc; if (!hl_device_operational(hdev, NULL)) return -ENODEV; + use_cpucp_enum = (hdev->asic_prop.fw_app_cpu_boot_dev_sts0 & + CPU_BOOT_DEV_STS0_MAP_HWMON_EN) ? true : false; + switch (type) { case hwmon_temp: switch (attr) { @@ -151,7 +283,7 @@ static int hl_read(struct device *dev, enum hwmon_sensor_types type, if (use_cpucp_enum) rc = hl_get_temperature(hdev, channel, cpucp_attr, val); else - rc = hl_get_temperature(hdev, channel, attr, val); + rc = hl_get_temperature(hdev, channel, fixup_attr_legacy_fw(attr), val); break; case hwmon_in: switch (attr) { @@ -174,7 +306,7 @@ static int hl_read(struct device *dev, enum hwmon_sensor_types type, if (use_cpucp_enum) rc = hl_get_voltage(hdev, channel, cpucp_attr, val); else - rc = hl_get_voltage(hdev, channel, attr, val); + rc = hl_get_voltage(hdev, channel, fixup_attr_legacy_fw(attr), val); break; case hwmon_curr: switch (attr) { @@ -197,7 +329,7 @@ static int hl_read(struct device *dev, enum hwmon_sensor_types type, if (use_cpucp_enum) rc = hl_get_current(hdev, channel, cpucp_attr, val); else - rc = hl_get_current(hdev, channel, attr, val); + rc = hl_get_current(hdev, channel, fixup_attr_legacy_fw(attr), val); break; case hwmon_fan: switch (attr) { @@ -217,7 +349,7 @@ static int hl_read(struct device *dev, enum hwmon_sensor_types type, if (use_cpucp_enum) rc = hl_get_fan_speed(hdev, channel, cpucp_attr, val); else - rc = hl_get_fan_speed(hdev, channel, attr, val); + rc = hl_get_fan_speed(hdev, channel, fixup_attr_legacy_fw(attr), val); break; case hwmon_pwm: switch (attr) { @@ -234,6 +366,7 @@ static int hl_read(struct device *dev, enum hwmon_sensor_types type, if 
(use_cpucp_enum) rc = hl_get_pwm_info(hdev, channel, cpucp_attr, val); else + /* no need for fixup as pwm was aligned from day 1 */ rc = hl_get_pwm_info(hdev, channel, attr, val); break; case hwmon_power: @@ -251,7 +384,7 @@ static int hl_read(struct device *dev, enum hwmon_sensor_types type, if (use_cpucp_enum) rc = hl_get_power(hdev, channel, cpucp_attr, val); else - rc = hl_get_power(hdev, channel, attr, val); + rc = hl_get_power(hdev, channel, fixup_attr_legacy_fw(attr), val); break; default: return -EINVAL; @@ -286,7 +419,7 @@ static int hl_write(struct device *dev, enum hwmon_sensor_types type, if (use_cpucp_enum) hl_set_temperature(hdev, channel, cpucp_attr, val); else - hl_set_temperature(hdev, channel, attr, val); + hl_set_temperature(hdev, channel, fixup_attr_legacy_fw(attr), val); break; case hwmon_pwm: switch (attr) { @@ -303,6 +436,7 @@ static int hl_write(struct device *dev, enum hwmon_sensor_types type, if (use_cpucp_enum) hl_set_pwm_info(hdev, channel, cpucp_attr, val); else + /* no need for fixup as pwm was aligned from day 1 */ hl_set_pwm_info(hdev, channel, attr, val); break; case hwmon_in: @@ -317,7 +451,7 @@ static int hl_write(struct device *dev, enum hwmon_sensor_types type, if (use_cpucp_enum) hl_set_voltage(hdev, channel, cpucp_attr, val); else - hl_set_voltage(hdev, channel, attr, val); + hl_set_voltage(hdev, channel, fixup_attr_legacy_fw(attr), val); break; case hwmon_curr: switch (attr) { @@ -331,7 +465,7 @@ static int hl_write(struct device *dev, enum hwmon_sensor_types type, if (use_cpucp_enum) hl_set_current(hdev, channel, cpucp_attr, val); else - hl_set_current(hdev, channel, attr, val); + hl_set_current(hdev, channel, fixup_attr_legacy_fw(attr), val); break; case hwmon_power: switch (attr) { @@ -345,7 +479,7 @@ static int hl_write(struct device *dev, enum hwmon_sensor_types type, if (use_cpucp_enum) hl_set_power(hdev, channel, cpucp_attr, val); else - hl_set_power(hdev, channel, attr, val); + hl_set_power(hdev, channel, 
fixup_attr_legacy_fw(attr), val); break; default: return -EINVAL; @@ -444,6 +578,9 @@ int hl_get_temperature(struct hl_device *hdev, pkt.sensor_index = __cpu_to_le16(sensor_index); pkt.type = __cpu_to_le16(attr); + dev_dbg(hdev->dev, "get temp, ctl 0x%x, sensor %d, type %d\n", + pkt.ctl, pkt.sensor_index, pkt.type); + rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt), 0, &result); -- cgit v1.2.3 From 707c1252868d885c47b80613b60bdcb19e133397 Mon Sep 17 00:00:00 2001 From: Dani Liberman Date: Wed, 8 Dec 2021 09:52:03 +0200 Subject: habanalabs: keep control device alive during hard reset Need to allow the user to retrieve data during reset and afterwards without the need to reopen the device. Did it by separating the user processes list into two lists: 1. fpriv_list which contains list of user processes that opened the device (currently only one). 2. fpriv_ctrl_list which contains list of user processes that opened the control device. The processes in this list shall not be killed during reset, only when the device is suddenly removed from PCI chain.
Signed-off-by: Dani Liberman Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/common/context.c | 8 ++-- drivers/misc/habanalabs/common/device.c | 56 +++++++++++++++++-------- drivers/misc/habanalabs/common/habanalabs.h | 7 +++- drivers/misc/habanalabs/common/habanalabs_drv.c | 9 ++-- 4 files changed, 50 insertions(+), 30 deletions(-) (limited to 'drivers/misc') diff --git a/drivers/misc/habanalabs/common/context.c b/drivers/misc/habanalabs/common/context.c index 49e6f1172d18..c6360e33bce8 100644 --- a/drivers/misc/habanalabs/common/context.c +++ b/drivers/misc/habanalabs/common/context.c @@ -283,11 +283,9 @@ struct hl_ctx *hl_get_compute_ctx(struct hl_device *hdev) /* There can only be a single user which has opened the compute device, so exit * immediately once we find him */ - if (!hpriv->is_control) { - ctx = hpriv->ctx; - hl_ctx_get(hdev, ctx); - break; - } + ctx = hpriv->ctx; + hl_ctx_get(hdev, ctx); + break; } mutex_unlock(&hdev->fpriv_list_lock); diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c index bea05a59425f..f1f482c5cdcb 100644 --- a/drivers/misc/habanalabs/common/device.c +++ b/drivers/misc/habanalabs/common/device.c @@ -169,9 +169,9 @@ static int hl_device_release_ctrl(struct inode *inode, struct file *filp) goto out; } - mutex_lock(&hdev->fpriv_list_lock); + mutex_lock(&hdev->fpriv_ctrl_list_lock); list_del(&hpriv->dev_node); - mutex_unlock(&hdev->fpriv_list_lock); + mutex_unlock(&hdev->fpriv_ctrl_list_lock); out: put_pid(hpriv->taskpid); @@ -449,7 +449,9 @@ static int device_early_init(struct hl_device *hdev) INIT_LIST_HEAD(&hdev->cs_mirror_list); spin_lock_init(&hdev->cs_mirror_lock); INIT_LIST_HEAD(&hdev->fpriv_list); + INIT_LIST_HEAD(&hdev->fpriv_ctrl_list); mutex_init(&hdev->fpriv_list_lock); + mutex_init(&hdev->fpriv_ctrl_list_lock); atomic_set(&hdev->in_reset, 0); mutex_init(&hdev->clk_throttling.lock); @@ -491,6 +493,7 @@ static void device_early_fini(struct 
hl_device *hdev) mutex_destroy(&hdev->send_cpu_message_lock); mutex_destroy(&hdev->fpriv_list_lock); + mutex_destroy(&hdev->fpriv_ctrl_list_lock); mutex_destroy(&hdev->clk_throttling.lock); @@ -678,6 +681,8 @@ static void take_release_locks(struct hl_device *hdev) /* Flush anyone that is inside device open */ mutex_lock(&hdev->fpriv_list_lock); mutex_unlock(&hdev->fpriv_list_lock); + mutex_lock(&hdev->fpriv_ctrl_list_lock); + mutex_unlock(&hdev->fpriv_ctrl_list_lock); } static void cleanup_resources(struct hl_device *hdev, bool hard_reset, bool fw_reset) @@ -789,17 +794,21 @@ disable_device: return rc; } -static int device_kill_open_processes(struct hl_device *hdev, u32 timeout) +static int device_kill_open_processes(struct hl_device *hdev, u32 timeout, bool control_dev) { - struct hl_fpriv *hpriv; struct task_struct *task = NULL; + struct list_head *fd_list; + struct hl_fpriv *hpriv; + struct mutex *fd_lock; u32 pending_cnt; + fd_lock = control_dev ? &hdev->fpriv_ctrl_list_lock : &hdev->fpriv_list_lock; + fd_list = control_dev ? 
&hdev->fpriv_ctrl_list : &hdev->fpriv_list; /* Giving time for user to close FD, and for processes that are inside * hl_device_open to finish */ - if (!list_empty(&hdev->fpriv_list)) + if (!list_empty(fd_list)) ssleep(1); if (timeout) { @@ -815,12 +824,12 @@ static int device_kill_open_processes(struct hl_device *hdev, u32 timeout) } } - mutex_lock(&hdev->fpriv_list_lock); + mutex_lock(fd_lock); /* This section must be protected because we are dereferencing * pointers that are freed if the process exits */ - list_for_each_entry(hpriv, &hdev->fpriv_list, dev_node) { + list_for_each_entry(hpriv, fd_list, dev_node) { task = get_pid_task(hpriv->taskpid, PIDTYPE_PID); if (task) { dev_info(hdev->dev, "Killing user process pid=%d\n", @@ -832,12 +841,12 @@ static int device_kill_open_processes(struct hl_device *hdev, u32 timeout) } else { dev_warn(hdev->dev, "Can't get task struct for PID so giving up on killing process\n"); - mutex_unlock(&hdev->fpriv_list_lock); + mutex_unlock(fd_lock); return -ETIME; } } - mutex_unlock(&hdev->fpriv_list_lock); + mutex_unlock(fd_lock); /* * We killed the open users, but that doesn't mean they are closed. 
@@ -849,7 +858,7 @@ static int device_kill_open_processes(struct hl_device *hdev, u32 timeout) */ wait_for_processes: - while ((!list_empty(&hdev->fpriv_list)) && (pending_cnt)) { + while ((!list_empty(fd_list)) && (pending_cnt)) { dev_dbg(hdev->dev, "Waiting for all unmap operations to finish before hard reset\n"); @@ -859,7 +868,7 @@ wait_for_processes: } /* All processes exited successfully */ - if (list_empty(&hdev->fpriv_list)) + if (list_empty(fd_list)) return 0; /* Give up waiting for processes to exit */ @@ -871,14 +880,19 @@ wait_for_processes: return -EBUSY; } -static void device_disable_open_processes(struct hl_device *hdev) +static void device_disable_open_processes(struct hl_device *hdev, bool control_dev) { + struct list_head *fd_list; struct hl_fpriv *hpriv; + struct mutex *fd_lock; - mutex_lock(&hdev->fpriv_list_lock); - list_for_each_entry(hpriv, &hdev->fpriv_list, dev_node) + fd_lock = control_dev ? &hdev->fpriv_ctrl_list_lock : &hdev->fpriv_list_lock; + fd_list = control_dev ? 
&hdev->fpriv_ctrl_list : &hdev->fpriv_list; + + mutex_lock(fd_lock); + list_for_each_entry(hpriv, fd_list, dev_node) hpriv->hdev = NULL; - mutex_unlock(&hdev->fpriv_list_lock); + mutex_unlock(fd_lock); } static void handle_reset_trigger(struct hl_device *hdev, u32 flags) @@ -1057,7 +1071,7 @@ kill_processes: * process can't really exit until all its CSs are done, which * is what we do in cs rollback */ - rc = device_kill_open_processes(hdev, 0); + rc = device_kill_open_processes(hdev, 0, false); if (rc == -EBUSY) { if (hdev->device_fini_pending) { @@ -1629,10 +1643,16 @@ void hl_device_fini(struct hl_device *hdev) "Waiting for all processes to exit (timeout of %u seconds)", HL_PENDING_RESET_LONG_SEC); - rc = device_kill_open_processes(hdev, HL_PENDING_RESET_LONG_SEC); + rc = device_kill_open_processes(hdev, HL_PENDING_RESET_LONG_SEC, false); if (rc) { dev_crit(hdev->dev, "Failed to kill all open processes\n"); - device_disable_open_processes(hdev); + device_disable_open_processes(hdev, false); + } + + rc = device_kill_open_processes(hdev, 0, true); + if (rc) { + dev_crit(hdev->dev, "Failed to kill all control device open processes\n"); + device_disable_open_processes(hdev, true); } hl_cb_pool_fini(hdev); diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h index 362eee3f028c..015aa1ee8ce0 100644 --- a/drivers/misc/habanalabs/common/habanalabs.h +++ b/drivers/misc/habanalabs/common/habanalabs.h @@ -1824,7 +1824,6 @@ struct hl_debug_params { * @dev_node: node in the device list of file private data * @refcount: number of related contexts. * @restore_phase_mutex: lock for context switch and restore phase. 
- * @is_control: true for control device, false otherwise */ struct hl_fpriv { struct hl_device *hdev; @@ -1837,7 +1836,6 @@ struct hl_fpriv { struct list_head dev_node; struct kref refcount; struct mutex restore_phase_mutex; - u8 is_control; }; @@ -2502,7 +2500,10 @@ struct last_error_session_info { * @internal_cb_va_base: internal cb pool mmu virtual address base * @fpriv_list: list of file private data structures. Each structure is created * when a user opens the device + * @fpriv_ctrl_list: list of file private data structures. Each structure is created + * when a user opens the control device * @fpriv_list_lock: protects the fpriv_list + * @fpriv_ctrl_list_lock: protects the fpriv_ctrl_list * @aggregated_cs_counters: aggregated cs counters among all contexts * @mmu_priv: device-specific MMU data. * @mmu_func: device-related MMU functions. @@ -2655,7 +2656,9 @@ struct hl_device { u64 internal_cb_va_base; struct list_head fpriv_list; + struct list_head fpriv_ctrl_list; struct mutex fpriv_list_lock; + struct mutex fpriv_ctrl_list_lock; struct hl_cs_counters_atomic aggregated_cs_counters; diff --git a/drivers/misc/habanalabs/common/habanalabs_drv.c b/drivers/misc/habanalabs/common/habanalabs_drv.c index d59201f93de9..aa4e07b1f839 100644 --- a/drivers/misc/habanalabs/common/habanalabs_drv.c +++ b/drivers/misc/habanalabs/common/habanalabs_drv.c @@ -220,12 +220,11 @@ int hl_device_open_ctrl(struct inode *inode, struct file *filp) hpriv->hdev = hdev; filp->private_data = hpriv; hpriv->filp = filp; - hpriv->is_control = true; nonseekable_open(inode, filp); hpriv->taskpid = find_get_pid(current->pid); - mutex_lock(&hdev->fpriv_list_lock); + mutex_lock(&hdev->fpriv_ctrl_list_lock); if (!hl_device_operational(hdev, NULL)) { dev_err_ratelimited(hdev->dev_ctrl, @@ -235,13 +234,13 @@ int hl_device_open_ctrl(struct inode *inode, struct file *filp) goto out_err; } - list_add(&hpriv->dev_node, &hdev->fpriv_list); - mutex_unlock(&hdev->fpriv_list_lock); + 
list_add(&hpriv->dev_node, &hdev->fpriv_ctrl_list); + mutex_unlock(&hdev->fpriv_ctrl_list_lock); return 0; out_err: - mutex_unlock(&hdev->fpriv_list_lock); + mutex_unlock(&hdev->fpriv_ctrl_list_lock); filp->private_data = NULL; put_pid(hpriv->taskpid); -- cgit v1.2.3 From b5c92b88823028bea4c74f3516c640406205933c Mon Sep 17 00:00:00 2001 From: Ofir Bitton Date: Wed, 8 Dec 2021 15:00:10 +0200 Subject: habanalabs: sysfs support for two infineon versions Currently sysfs support dumping a single infineon version, in future asics we will have two infineon versions. Signed-off-by: Ofir Bitton Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/common/sysfs.c | 9 +++++++-- drivers/misc/habanalabs/include/common/cpucp_if.h | 13 ++++++++++--- 2 files changed, 17 insertions(+), 5 deletions(-) (limited to 'drivers/misc') diff --git a/drivers/misc/habanalabs/common/sysfs.c b/drivers/misc/habanalabs/common/sysfs.c index 15e4ae65e515..6f575032f675 100644 --- a/drivers/misc/habanalabs/common/sysfs.c +++ b/drivers/misc/habanalabs/common/sysfs.c @@ -163,8 +163,13 @@ static ssize_t infineon_ver_show(struct device *dev, { struct hl_device *hdev = dev_get_drvdata(dev); - return sprintf(buf, "0x%04x\n", - hdev->asic_prop.cpucp_info.infineon_version); + if (hdev->asic_prop.cpucp_info.infineon_second_stage_version) + return sprintf(buf, "%#04x %#04x\n", + le32_to_cpu(hdev->asic_prop.cpucp_info.infineon_version), + le32_to_cpu(hdev->asic_prop.cpucp_info.infineon_second_stage_version)); + else + return sprintf(buf, "%#04x\n", + le32_to_cpu(hdev->asic_prop.cpucp_info.infineon_version)); } static ssize_t fuse_ver_show(struct device *dev, struct device_attribute *attr, diff --git a/drivers/misc/habanalabs/include/common/cpucp_if.h b/drivers/misc/habanalabs/include/common/cpucp_if.h index 078fb4bd0316..0114cb52faad 100644 --- a/drivers/misc/habanalabs/include/common/cpucp_if.h +++ b/drivers/misc/habanalabs/include/common/cpucp_if.h @@ -1,6 +1,6 @@ /* 
SPDX-License-Identifier: GPL-2.0 * - * Copyright 2021 HabanaLabs, Ltd. + * Copyright 2020-2021 HabanaLabs, Ltd. * All Rights Reserved. * */ @@ -761,6 +761,7 @@ struct cpucp_security_info { * @fuse_version: silicon production FUSE information. * @thermal_version: thermald S/W version. * @cpucp_version: CpuCP S/W version. + * @infineon_second_stage_version: Infineon 2nd stage DC-DC version. * @dram_size: available DRAM size. * @card_name: card name that will be displayed in HWMON subsystem on the host * @sec_info: security information @@ -770,6 +771,10 @@ struct cpucp_security_info { * @dram_binning_mask: DRAM binning mask, 1 bit per dram instance * (0 = functional 1 = binned) * @memory_repair_flag: eFuse flag indicating memory repair + * @edma_binning_mask: EDMA binning mask, 1 bit per EDMA instance + * (0 = functional 1 = binned) + * @xbar_binning_mask: Xbar binning mask, 1 bit per Xbar instance + * (0 = functional 1 = binned) */ struct cpucp_info { struct cpucp_sensor sensors[CPUCP_MAX_SENSORS]; @@ -782,7 +787,7 @@ struct cpucp_info { __u8 fuse_version[VERSION_MAX_LEN]; __u8 thermal_version[VERSION_MAX_LEN]; __u8 cpucp_version[VERSION_MAX_LEN]; - __le32 reserved2; + __le32 infineon_second_stage_version; __le64 dram_size; char card_name[CARD_NAME_MAX_LEN]; __le64 reserved3; @@ -790,7 +795,9 @@ struct cpucp_info { __u8 reserved5; __u8 dram_binning_mask; __u8 memory_repair_flag; - __u8 pad[5]; + __u8 edma_binning_mask; + __u8 xbar_binning_mask; + __u8 pad[3]; struct cpucp_security_info sec_info; __le32 reserved6; __u8 pll_map[PLL_MAP_LEN]; -- cgit v1.2.3 From 9993f27de104d8d0f83c332ec5bc7642de20fae4 Mon Sep 17 00:00:00 2001 From: Ofir Bitton Date: Sun, 12 Dec 2021 17:46:21 +0200 Subject: habanalabs: expose soft reset sysfs nodes for inference ASIC As we allow soft-reset to be performed only on inference devices, having the sysfs nodes may cause a confusion. Hence, we remove those nodes on training ASICs. 
Signed-off-by: Ofir Bitton Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/common/sysfs.c | 32 ++++++++++++++++++++++++++++++-- 1 file changed, 30 insertions(+), 2 deletions(-) (limited to 'drivers/misc') diff --git a/drivers/misc/habanalabs/common/sysfs.c b/drivers/misc/habanalabs/common/sysfs.c index 6f575032f675..2f6de734ce37 100644 --- a/drivers/misc/habanalabs/common/sysfs.c +++ b/drivers/misc/habanalabs/common/sysfs.c @@ -424,8 +424,6 @@ static struct attribute *hl_dev_attrs[] = { &dev_attr_max_power.attr, &dev_attr_pci_addr.attr, &dev_attr_preboot_btl_ver.attr, - &dev_attr_soft_reset.attr, - &dev_attr_soft_reset_cnt.attr, &dev_attr_status.attr, &dev_attr_thermal_ver.attr, &dev_attr_uboot_ver.attr, @@ -450,6 +448,21 @@ static const struct attribute_group *hl_dev_attr_groups[] = { NULL, }; +static struct attribute *hl_dev_inference_attrs[] = { + &dev_attr_soft_reset.attr, + &dev_attr_soft_reset_cnt.attr, + NULL, +}; + +static struct attribute_group hl_dev_inference_attr_group = { + .attrs = hl_dev_inference_attrs, +}; + +static const struct attribute_group *hl_dev_inference_attr_groups[] = { + &hl_dev_inference_attr_group, + NULL, +}; + int hl_sysfs_init(struct hl_device *hdev) { int rc; @@ -465,10 +478,25 @@ int hl_sysfs_init(struct hl_device *hdev) return rc; } + if (!hdev->allow_inference_soft_reset) + return 0; + + rc = device_add_groups(hdev->dev, hl_dev_inference_attr_groups); + if (rc) { + dev_err(hdev->dev, + "Failed to add groups to device, error %d\n", rc); + return rc; + } + return 0; } void hl_sysfs_fini(struct hl_device *hdev) { device_remove_groups(hdev->dev, hl_dev_attr_groups); + + if (!hdev->allow_inference_soft_reset) + return; + + device_remove_groups(hdev->dev, hl_dev_inference_attr_groups); } -- cgit v1.2.3 From d636a932b3ab96523fe09c6148a0fa01f938b4f6 Mon Sep 17 00:00:00 2001 From: Ohad Sharabi Date: Wed, 8 Dec 2021 09:06:03 +0200 Subject: habanalabs: clean MMU headers definitions During the MMU development 
the MMU header files were left with unclean definitions: - MMU "version specific" definitions that were left in the mmu_general file - unused definitions This patch attempts, where possible, to keep definitions that can serve multiple MMU versions (but that are not tightly bound with specific MMU arch) in the mmu_general header file (e.g. different definitions for number of HOPs). Otherwise, move MMU version specific definitions (e.g. HOPs masks and shifts) to the specific MMU version file. Signed-off-by: Ohad Sharabi Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/common/mmu/mmu_v1.c | 8 ++++---- drivers/misc/habanalabs/gaudi/gaudi.c | 24 +++++++++++----------- drivers/misc/habanalabs/goya/goya.c | 24 +++++++++++----------- .../habanalabs/include/hw_ip/mmu/mmu_general.h | 19 +++++------------ .../misc/habanalabs/include/hw_ip/mmu/mmu_v1_0.h | 18 +++++++++++++--- .../misc/habanalabs/include/hw_ip/mmu/mmu_v1_1.h | 20 ++++++++++++++---- 6 files changed, 64 insertions(+), 49 deletions(-) (limited to 'drivers/misc') diff --git a/drivers/misc/habanalabs/common/mmu/mmu_v1.c b/drivers/misc/habanalabs/common/mmu/mmu_v1.c index 159da2fafd79..6134b6ae7615 100644 --- a/drivers/misc/habanalabs/common/mmu/mmu_v1.c +++ b/drivers/misc/habanalabs/common/mmu/mmu_v1.c @@ -269,7 +269,7 @@ static int dram_default_mapping_init(struct hl_ctx *ctx) num_of_hop3 = prop->dram_size_for_default_page_mapping; do_div(num_of_hop3, prop->dram_page_size); - do_div(num_of_hop3, PTE_ENTRIES_IN_HOP); + do_div(num_of_hop3, HOP_PTE_ENTRIES_512); /* add hop1 and hop2 */ total_hops = num_of_hop3 + 2; @@ -330,7 +330,7 @@ static int dram_default_mapping_init(struct hl_ctx *ctx) for (i = 0 ; i < num_of_hop3 ; i++) { hop3_pte_addr = ctx->dram_default_hops[i]; - for (j = 0 ; j < PTE_ENTRIES_IN_HOP ; j++) { + for (j = 0 ; j < HOP_PTE_ENTRIES_512 ; j++) { write_final_pte(ctx, hop3_pte_addr, pte_val); get_pte(ctx, ctx->dram_default_hops[i]); hop3_pte_addr += HL_PTE_SIZE; @@ -369,7 
+369,7 @@ static void dram_default_mapping_fini(struct hl_ctx *ctx) num_of_hop3 = prop->dram_size_for_default_page_mapping; do_div(num_of_hop3, prop->dram_page_size); - do_div(num_of_hop3, PTE_ENTRIES_IN_HOP); + do_div(num_of_hop3, HOP_PTE_ENTRIES_512); hop0_addr = get_hop0_addr(ctx); /* add hop1 and hop2 */ @@ -379,7 +379,7 @@ static void dram_default_mapping_fini(struct hl_ctx *ctx) for (i = 0 ; i < num_of_hop3 ; i++) { hop3_pte_addr = ctx->dram_default_hops[i]; - for (j = 0 ; j < PTE_ENTRIES_IN_HOP ; j++) { + for (j = 0 ; j < HOP_PTE_ENTRIES_512 ; j++) { clear_pte(ctx, hop3_pte_addr); put_pte(ctx, ctx->dram_default_hops[i]); hop3_pte_addr += HL_PTE_SIZE; diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c index 07e03d44930e..b3431eac4f04 100644 --- a/drivers/misc/habanalabs/gaudi/gaudi.c +++ b/drivers/misc/habanalabs/gaudi/gaudi.c @@ -593,21 +593,21 @@ static int gaudi_set_fixed_properties(struct hl_device *hdev) else prop->mmu_pgt_size = MMU_PAGE_TABLES_SIZE; prop->mmu_pte_size = HL_PTE_SIZE; - prop->mmu_hop_table_size = HOP_TABLE_SIZE; - prop->mmu_hop0_tables_total_size = HOP0_TABLES_TOTAL_SIZE; + prop->mmu_hop_table_size = HOP_TABLE_SIZE_512_PTE; + prop->mmu_hop0_tables_total_size = HOP0_512_PTE_TABLES_TOTAL_SIZE; prop->dram_page_size = PAGE_SIZE_2MB; prop->dram_supports_virtual_memory = false; - prop->pmmu.hop0_shift = HOP0_SHIFT; - prop->pmmu.hop1_shift = HOP1_SHIFT; - prop->pmmu.hop2_shift = HOP2_SHIFT; - prop->pmmu.hop3_shift = HOP3_SHIFT; - prop->pmmu.hop4_shift = HOP4_SHIFT; - prop->pmmu.hop0_mask = HOP0_MASK; - prop->pmmu.hop1_mask = HOP1_MASK; - prop->pmmu.hop2_mask = HOP2_MASK; - prop->pmmu.hop3_mask = HOP3_MASK; - prop->pmmu.hop4_mask = HOP4_MASK; + prop->pmmu.hop0_shift = MMU_V1_1_HOP0_SHIFT; + prop->pmmu.hop1_shift = MMU_V1_1_HOP1_SHIFT; + prop->pmmu.hop2_shift = MMU_V1_1_HOP2_SHIFT; + prop->pmmu.hop3_shift = MMU_V1_1_HOP3_SHIFT; + prop->pmmu.hop4_shift = MMU_V1_1_HOP4_SHIFT; + prop->pmmu.hop0_mask = 
MMU_V1_1_HOP0_MASK; + prop->pmmu.hop1_mask = MMU_V1_1_HOP1_MASK; + prop->pmmu.hop2_mask = MMU_V1_1_HOP2_MASK; + prop->pmmu.hop3_mask = MMU_V1_1_HOP3_MASK; + prop->pmmu.hop4_mask = MMU_V1_1_HOP4_MASK; prop->pmmu.start_addr = VA_HOST_SPACE_START; prop->pmmu.end_addr = (VA_HOST_SPACE_START + VA_HOST_SPACE_SIZE / 2) - 1; diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c index 8d0f2cd608fc..f4473013f1ee 100644 --- a/drivers/misc/habanalabs/goya/goya.c +++ b/drivers/misc/habanalabs/goya/goya.c @@ -410,21 +410,21 @@ int goya_set_fixed_properties(struct hl_device *hdev) else prop->mmu_pgt_size = MMU_PAGE_TABLES_SIZE; prop->mmu_pte_size = HL_PTE_SIZE; - prop->mmu_hop_table_size = HOP_TABLE_SIZE; - prop->mmu_hop0_tables_total_size = HOP0_TABLES_TOTAL_SIZE; + prop->mmu_hop_table_size = HOP_TABLE_SIZE_512_PTE; + prop->mmu_hop0_tables_total_size = HOP0_512_PTE_TABLES_TOTAL_SIZE; prop->dram_page_size = PAGE_SIZE_2MB; prop->dram_supports_virtual_memory = true; - prop->dmmu.hop0_shift = HOP0_SHIFT; - prop->dmmu.hop1_shift = HOP1_SHIFT; - prop->dmmu.hop2_shift = HOP2_SHIFT; - prop->dmmu.hop3_shift = HOP3_SHIFT; - prop->dmmu.hop4_shift = HOP4_SHIFT; - prop->dmmu.hop0_mask = HOP0_MASK; - prop->dmmu.hop1_mask = HOP1_MASK; - prop->dmmu.hop2_mask = HOP2_MASK; - prop->dmmu.hop3_mask = HOP3_MASK; - prop->dmmu.hop4_mask = HOP4_MASK; + prop->dmmu.hop0_shift = MMU_V1_0_HOP0_SHIFT; + prop->dmmu.hop1_shift = MMU_V1_0_HOP1_SHIFT; + prop->dmmu.hop2_shift = MMU_V1_0_HOP2_SHIFT; + prop->dmmu.hop3_shift = MMU_V1_0_HOP3_SHIFT; + prop->dmmu.hop4_shift = MMU_V1_0_HOP4_SHIFT; + prop->dmmu.hop0_mask = MMU_V1_0_HOP0_MASK; + prop->dmmu.hop1_mask = MMU_V1_0_HOP1_MASK; + prop->dmmu.hop2_mask = MMU_V1_0_HOP2_MASK; + prop->dmmu.hop3_mask = MMU_V1_0_HOP3_MASK; + prop->dmmu.hop4_mask = MMU_V1_0_HOP4_MASK; prop->dmmu.start_addr = VA_DDR_SPACE_START; prop->dmmu.end_addr = VA_DDR_SPACE_END; prop->dmmu.page_size = PAGE_SIZE_2MB; diff --git 
a/drivers/misc/habanalabs/include/hw_ip/mmu/mmu_general.h b/drivers/misc/habanalabs/include/hw_ip/mmu/mmu_general.h index dedf20e8f956..758f246627f8 100644 --- a/drivers/misc/habanalabs/include/hw_ip/mmu/mmu_general.h +++ b/drivers/misc/habanalabs/include/hw_ip/mmu/mmu_general.h @@ -16,27 +16,18 @@ #define PAGE_PRESENT_MASK 0x0000000000001ull #define SWAP_OUT_MASK 0x0000000000004ull #define LAST_MASK 0x0000000000800ull -#define HOP0_MASK 0x3000000000000ull -#define HOP1_MASK 0x0FF8000000000ull -#define HOP2_MASK 0x0007FC0000000ull -#define HOP3_MASK 0x000003FE00000ull -#define HOP4_MASK 0x00000001FF000ull #define FLAGS_MASK 0x0000000000FFFull -#define HOP0_SHIFT 48 -#define HOP1_SHIFT 39 -#define HOP2_SHIFT 30 -#define HOP3_SHIFT 21 -#define HOP4_SHIFT 12 - #define MMU_ARCH_5_HOPS 5 #define HOP_PHYS_ADDR_MASK (~FLAGS_MASK) #define HL_PTE_SIZE sizeof(u64) -#define HOP_TABLE_SIZE PAGE_SIZE_4KB -#define PTE_ENTRIES_IN_HOP (HOP_TABLE_SIZE / HL_PTE_SIZE) -#define HOP0_TABLES_TOTAL_SIZE (HOP_TABLE_SIZE * MAX_ASID) + +/* definitions for HOP with 512 PTE entries */ +#define HOP_PTE_ENTRIES_512 512 +#define HOP_TABLE_SIZE_512_PTE (HOP_PTE_ENTRIES_512 * HL_PTE_SIZE) +#define HOP0_512_PTE_TABLES_TOTAL_SIZE (HOP_TABLE_SIZE_512_PTE * MAX_ASID) #define MMU_HOP0_PA43_12_SHIFT 12 #define MMU_HOP0_PA49_44_SHIFT (12 + 32) diff --git a/drivers/misc/habanalabs/include/hw_ip/mmu/mmu_v1_0.h b/drivers/misc/habanalabs/include/hw_ip/mmu/mmu_v1_0.h index 8539dd041f2c..86511002e367 100644 --- a/drivers/misc/habanalabs/include/hw_ip/mmu/mmu_v1_0.h +++ b/drivers/misc/habanalabs/include/hw_ip/mmu/mmu_v1_0.h @@ -8,8 +8,20 @@ #ifndef INCLUDE_MMU_V1_0_H_ #define INCLUDE_MMU_V1_0_H_ -#define MMU_HOP0_PA43_12 0x490004 -#define MMU_HOP0_PA49_44 0x490008 -#define MMU_ASID_BUSY 0x490000 +#define MMU_V1_0_HOP0_MASK 0x3000000000000ull +#define MMU_V1_0_HOP1_MASK 0x0FF8000000000ull +#define MMU_V1_0_HOP2_MASK 0x0007FC0000000ull +#define MMU_V1_0_HOP3_MASK 0x000003FE00000ull +#define MMU_V1_0_HOP4_MASK 
0x00000001FF000ull + +#define MMU_V1_0_HOP0_SHIFT 48 +#define MMU_V1_0_HOP1_SHIFT 39 +#define MMU_V1_0_HOP2_SHIFT 30 +#define MMU_V1_0_HOP3_SHIFT 21 +#define MMU_V1_0_HOP4_SHIFT 12 + +#define MMU_HOP0_PA43_12 0x490004 +#define MMU_HOP0_PA49_44 0x490008 +#define MMU_ASID_BUSY 0x490000 #endif /* INCLUDE_MMU_V1_0_H_ */ diff --git a/drivers/misc/habanalabs/include/hw_ip/mmu/mmu_v1_1.h b/drivers/misc/habanalabs/include/hw_ip/mmu/mmu_v1_1.h index b2a9570583ac..9c727a5d47b4 100644 --- a/drivers/misc/habanalabs/include/hw_ip/mmu/mmu_v1_1.h +++ b/drivers/misc/habanalabs/include/hw_ip/mmu/mmu_v1_1.h @@ -8,9 +8,21 @@ #ifndef INCLUDE_MMU_V1_1_H_ #define INCLUDE_MMU_V1_1_H_ -#define MMU_ASID 0xC12004 -#define MMU_HOP0_PA43_12 0xC12008 -#define MMU_HOP0_PA49_44 0xC1200C -#define MMU_BUSY 0xC12000 +#define MMU_V1_1_HOP0_MASK 0x3000000000000ull +#define MMU_V1_1_HOP1_MASK 0x0FF8000000000ull +#define MMU_V1_1_HOP2_MASK 0x0007FC0000000ull +#define MMU_V1_1_HOP3_MASK 0x000003FE00000ull +#define MMU_V1_1_HOP4_MASK 0x00000001FF000ull + +#define MMU_V1_1_HOP0_SHIFT 48 +#define MMU_V1_1_HOP1_SHIFT 39 +#define MMU_V1_1_HOP2_SHIFT 30 +#define MMU_V1_1_HOP3_SHIFT 21 +#define MMU_V1_1_HOP4_SHIFT 12 + +#define MMU_ASID 0xC12004 +#define MMU_HOP0_PA43_12 0xC12008 +#define MMU_HOP0_PA49_44 0xC1200C +#define MMU_BUSY 0xC12000 #endif /* INCLUDE_MMU_V1_1_H_ */ -- cgit v1.2.3 From 86c00b2c3639e33a7b51a06b1ebff0bae87686b7 Mon Sep 17 00:00:00 2001 From: Ofir Bitton Date: Mon, 13 Dec 2021 15:43:06 +0200 Subject: habanalabs: modify cpu boot status error print As BTL can be replaced by ROM we should modify relevant error print. 
Signed-off-by: Ofir Bitton Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/common/firmware_if.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/misc') diff --git a/drivers/misc/habanalabs/common/firmware_if.c b/drivers/misc/habanalabs/common/firmware_if.c index 34e70cca37c1..1d0d228d4872 100644 --- a/drivers/misc/habanalabs/common/firmware_if.c +++ b/drivers/misc/habanalabs/common/firmware_if.c @@ -1113,7 +1113,7 @@ static void detect_cpu_boot_status(struct hl_device *hdev, u32 status) switch (status) { case CPU_BOOT_STATUS_NA: dev_err(hdev->dev, - "Device boot progress - BTL did NOT run\n"); + "Device boot progress - BTL/ROM did NOT run\n"); break; case CPU_BOOT_STATUS_IN_WFE: dev_err(hdev->dev, -- cgit v1.2.3 From e2558f0f84d85bfe2407b91d57798f133d8ad32a Mon Sep 17 00:00:00 2001 From: Ohad Sharabi Date: Tue, 7 Dec 2021 14:30:20 +0200 Subject: habanalabs: prevent wait if CS in multi-CS list completed By the original design we assumed that if we "miss" multi CS completion it is of no severe consequence as we'll just call wait_for_multi_cs again. Sequence of events for such scenario: 1. user submits CS with sequence N 2. user calls wait for multi-CS with only CS #N in the list 3. the multi CS call starts with poll of the CSs but finds that none completed (while CS #N did not complete yet) 4. now, multi CS #N completes but multi CS CTX was not yet created for the above multi-CS. so, attempt to complete multi-CS fails (as no multi CS CTX exists) 5. wait_for_multi_cs call now does init_wait_multi_cs_completion (and for this creates the multi-CS CTX) 6. wait_for_multi_cs waits on completion but will not get one as CS #N already completed To fix the issue we initialize the multi-CS CTX prior to polling the fences.
Signed-off-by: Ohad Sharabi Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- .../misc/habanalabs/common/command_submission.c | 85 ++++++++++++++-------- drivers/misc/habanalabs/common/habanalabs.h | 3 - 2 files changed, 54 insertions(+), 34 deletions(-) (limited to 'drivers/misc') diff --git a/drivers/misc/habanalabs/common/command_submission.c b/drivers/misc/habanalabs/common/command_submission.c index f58fff3671d6..b9fed6b6d1ab 100644 --- a/drivers/misc/habanalabs/common/command_submission.c +++ b/drivers/misc/habanalabs/common/command_submission.c @@ -533,8 +533,8 @@ static void complete_multi_cs(struct hl_device *hdev, struct hl_cs *cs) mcs_compl->stream_master_qid_map)) { /* extract the timestamp only of first completed CS */ if (!mcs_compl->timestamp) - mcs_compl->timestamp = - ktime_to_ns(fence->timestamp); + mcs_compl->timestamp = ktime_to_ns(fence->timestamp); + complete_all(&mcs_compl->completion); /* @@ -2369,16 +2369,18 @@ static int hl_wait_for_fence(struct hl_ctx *ctx, u64 seq, struct hl_fence *fence * hl_cs_poll_fences - iterate CS fences to check for CS completion * * @mcs_data: multi-CS internal data + * @mcs_compl: multi-CS completion structure * * @return 0 on success, otherwise non 0 error code * * The function iterates on all CS sequence in the list and set bit in * completion_bitmap for each completed CS. - * while iterating, the function can extracts the stream map to be later - * used by the waiting function. - * this function shall be called after taking context ref + * While iterating, the function sets the stream map of each fence in the fence + * array in the completion QID stream map to be used by CSs to perform + * completion to the multi-CS context. 
+ * This function shall be called after taking context ref */ -static int hl_cs_poll_fences(struct multi_cs_data *mcs_data) +static int hl_cs_poll_fences(struct multi_cs_data *mcs_data, struct multi_cs_completion *mcs_compl) { struct hl_fence **fence_ptr = mcs_data->fence_arr; struct hl_device *hdev = mcs_data->ctx->hdev; @@ -2394,6 +2396,15 @@ static int hl_cs_poll_fences(struct multi_cs_data *mcs_data) if (rc) return rc; + /* + * re-initialize the completion here to handle 2 possible cases: + * 1. CS will complete the multi-CS prior clearing the completion. in which + * case the fence iteration is guaranteed to catch the CS completion. + * 2. the completion will occur after re-init of the completion. + * in which case we will wake up immediately in wait_for_completion. + */ + reinit_completion(&mcs_compl->completion); + /* * set to maximum time to verify timestamp is valid: if at the end * this value is maintained- no timestamp was updated @@ -2404,6 +2415,21 @@ static int hl_cs_poll_fences(struct multi_cs_data *mcs_data) for (i = 0; i < arr_len; i++, fence_ptr++) { struct hl_fence *fence = *fence_ptr; + /* + * In order to prevent case where we wait until timeout even though a CS associated + * with the multi-CS actually completed we do things in the below order: + * 1. for each fence set it's QID map in the multi-CS completion QID map. This way + * any CS can, potentially, complete the multi CS for the specific QID (note + * that once completion is initialized, calling complete* and then wait on the + * completion will cause it to return at once) + * 2. only after allowing multi-CS completion for the specific QID we check whether + * the specific CS already completed (and thus the wait for completion part will + * be skipped). if the CS not completed it is guaranteed that completing CS will + * wake up the completion. 
+ */ + if (fence) + mcs_compl->stream_master_qid_map |= fence->stream_master_qid_map; + /* * function won't sleep as it is called with timeout 0 (i.e. * poll the fence) @@ -2419,9 +2445,7 @@ static int hl_cs_poll_fences(struct multi_cs_data *mcs_data) switch (status) { case CS_WAIT_STATUS_BUSY: - /* CS did not finished, keep waiting on its QID*/ - mcs_data->stream_master_qid_map |= - fence->stream_master_qid_map; + /* CS did not finished, QID to wait on already stored */ break; case CS_WAIT_STATUS_COMPLETED: /* @@ -2519,9 +2543,7 @@ static inline unsigned long hl_usecs64_to_jiffies(const u64 usecs) * the function gets the first available completion (by marking it "used") * and initialize its values. */ -static struct multi_cs_completion *hl_wait_multi_cs_completion_init( - struct hl_device *hdev, - u8 stream_master_bitmap) +static struct multi_cs_completion *hl_wait_multi_cs_completion_init(struct hl_device *hdev) { struct multi_cs_completion *mcs_compl; int i; @@ -2533,8 +2555,11 @@ static struct multi_cs_completion *hl_wait_multi_cs_completion_init( if (!mcs_compl->used) { mcs_compl->used = 1; mcs_compl->timestamp = 0; - mcs_compl->stream_master_qid_map = stream_master_bitmap; - reinit_completion(&mcs_compl->completion); + /* + * init QID map to 0 to avoid completion by CSs. 
the actual QID map + * to multi-CS CSs will be set incrementally at a later stage + */ + mcs_compl->stream_master_qid_map = 0; spin_unlock(&mcs_compl->lock); break; } @@ -2672,9 +2697,17 @@ static int hl_multi_cs_wait_ioctl(struct hl_fpriv *hpriv, void *data) hl_ctx_get(hdev, ctx); + /* wait (with timeout) for the first CS to be completed */ + mcs_data.timeout_jiffies = hl_usecs64_to_jiffies(args->in.timeout_us); + mcs_compl = hl_wait_multi_cs_completion_init(hdev); + if (IS_ERR(mcs_compl)) { + rc = PTR_ERR(mcs_compl); + goto put_ctx; + } + /* poll all CS fences, extract timestamp */ mcs_data.update_ts = true; - rc = hl_cs_poll_fences(&mcs_data); + rc = hl_cs_poll_fences(&mcs_data, mcs_compl); /* * skip wait for CS completion when one of the below is true: * - an error on the poll function @@ -2682,16 +2715,7 @@ static int hl_multi_cs_wait_ioctl(struct hl_fpriv *hpriv, void *data) * - the user called ioctl with timeout 0 */ if (rc || mcs_data.completion_bitmap || !args->in.timeout_us) - goto put_ctx; - - /* wait (with timeout) for the first CS to be completed */ - mcs_data.timeout_jiffies = hl_usecs64_to_jiffies(args->in.timeout_us); - - mcs_compl = hl_wait_multi_cs_completion_init(hdev, mcs_data.stream_master_qid_map); - if (IS_ERR(mcs_compl)) { - rc = PTR_ERR(mcs_compl); - goto put_ctx; - } + goto completion_fini; while (true) { rc = hl_wait_multi_cs_completion(&mcs_data, mcs_compl); @@ -2703,7 +2727,7 @@ static int hl_multi_cs_wait_ioctl(struct hl_fpriv *hpriv, void *data) * no timestamp should be updated this time. 
*/ mcs_data.update_ts = false; - rc = hl_cs_poll_fences(&mcs_data); + rc = hl_cs_poll_fences(&mcs_data, mcs_compl); if (mcs_data.completion_bitmap) break; @@ -2713,16 +2737,15 @@ static int hl_multi_cs_wait_ioctl(struct hl_fpriv *hpriv, void *data) * it got a completion) it either got completed by CS in the multi CS list * (in which case the indication will be non empty completion_bitmap) or it * got completed by CS submitted to one of the shared stream master but - * not in the multi CS list (in which case we should wait again but reinit - * the completion, modify the timeout and set timestamp as zero to let a CS - * related to the current multi-CS set a new, relevant, timestamp) + * not in the multi CS list (in which case we should wait again but modify + * the timeout and set timestamp as zero to let a CS related to the current + * multi-CS set a new, relevant, timestamp) */ - /* wait again with modified timeout */ mcs_data.timeout_jiffies = mcs_data.wait_status; - reinit_completion(&mcs_compl->completion); mcs_compl->timestamp = 0; } +completion_fini: hl_wait_multi_cs_completion_fini(mcs_compl); put_ctx: diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h index 015aa1ee8ce0..4d4986177776 100644 --- a/drivers/misc/habanalabs/common/habanalabs.h +++ b/drivers/misc/habanalabs/common/habanalabs.h @@ -2364,8 +2364,6 @@ struct multi_cs_completion { * @timestamp: timestamp of first completed CS * @wait_status: wait for CS status * @completion_bitmap: bitmap of completed CSs (1- completed, otherwise 0) - * @stream_master_qid_map: bitmap of all stream master QIDs on which the - * multi-CS is waiting * @arr_len: fence_arr and seq_arr array length * @gone_cs: indication of gone CS (1- there was gone CS, otherwise 0) * @update_ts: update timestamp. 1- update the timestamp, otherwise 0. 
@@ -2378,7 +2376,6 @@ struct multi_cs_data { s64 timestamp; long wait_status; u32 completion_bitmap; - u32 stream_master_qid_map; u8 arr_len; u8 gone_cs; u8 update_ts; -- cgit v1.2.3 From b9d31cada7d9f137028c11534fff77fec8511690 Mon Sep 17 00:00:00 2001 From: farah kassabri Date: Tue, 2 Nov 2021 11:34:18 +0200 Subject: habanalabs: change wait_for_interrupt implementation Currently the cq counters are allocated in userspace memory, and mapped by the driver to the device address space. A new requirement that is part of new future API related to this one, requires that cq counters will be allocated in kernel memory. We leverage the existing cb_create API with KERNEL_MAPPED flag set to allocate this memory. That way we gain two things: 1. The memory cannot be freed while in use since it's protected by refcount in driver. 2. No need to wake up the user thread upon each interrupt from CQ, because the kernel has direct access to the counter. Therefore, it can make comparison with the target value in the interrupt handler and wake up the user thread only if the counter reaches the target value. This is instead of waking the thread up to copy counter value from user then go sleep again if target value wasn't reached. 
Signed-off-by: farah kassabri Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/common/command_buffer.c | 31 ++++-- .../misc/habanalabs/common/command_submission.c | 111 ++++++++++++++++++++- drivers/misc/habanalabs/common/habanalabs.h | 5 + drivers/misc/habanalabs/common/irq.c | 8 +- include/uapi/misc/habanalabs.h | 61 +++++++---- 5 files changed, 189 insertions(+), 27 deletions(-) (limited to 'drivers/misc') diff --git a/drivers/misc/habanalabs/common/command_buffer.c b/drivers/misc/habanalabs/common/command_buffer.c index c591f0487272..d4eb9fb9ea12 100644 --- a/drivers/misc/habanalabs/common/command_buffer.c +++ b/drivers/misc/habanalabs/common/command_buffer.c @@ -380,8 +380,9 @@ int hl_cb_destroy(struct hl_device *hdev, struct hl_cb_mgr *mgr, u64 cb_handle) } static int hl_cb_info(struct hl_device *hdev, struct hl_cb_mgr *mgr, - u64 cb_handle, u32 *usage_cnt) + u64 cb_handle, u32 flags, u32 *usage_cnt, u64 *device_va) { + struct hl_vm_va_block *va_block; struct hl_cb *cb; u32 handle; int rc = 0; @@ -402,7 +403,18 @@ static int hl_cb_info(struct hl_device *hdev, struct hl_cb_mgr *mgr, goto out; } - *usage_cnt = atomic_read(&cb->cs_cnt); + if (flags & HL_CB_FLAGS_GET_DEVICE_VA) { + va_block = list_first_entry(&cb->va_block_list, struct hl_vm_va_block, node); + if (va_block) { + *device_va = va_block->start; + } else { + dev_err(hdev->dev, "CB is not mapped to the device's MMU\n"); + rc = -EINVAL; + goto out; + } + } else { + *usage_cnt = atomic_read(&cb->cs_cnt); + } out: spin_unlock(&mgr->cb_lock); @@ -414,7 +426,7 @@ int hl_cb_ioctl(struct hl_fpriv *hpriv, void *data) union hl_cb_args *args = data; struct hl_device *hdev = hpriv->hdev; enum hl_device_status status; - u64 handle = 0; + u64 handle = 0, device_va; u32 usage_cnt = 0; int rc; @@ -450,9 +462,16 @@ int hl_cb_ioctl(struct hl_fpriv *hpriv, void *data) case HL_CB_OP_INFO: rc = hl_cb_info(hdev, &hpriv->cb_mgr, args->in.cb_handle, - &usage_cnt); - memset(args, 0, 
sizeof(*args)); - args->out.usage_cnt = usage_cnt; + args->in.flags, + &usage_cnt, + &device_va); + + memset(&args->out, 0, sizeof(args->out)); + + if (args->in.flags & HL_CB_FLAGS_GET_DEVICE_VA) + args->out.device_va = device_va; + else + args->out.usage_cnt = usage_cnt; break; default: diff --git a/drivers/misc/habanalabs/common/command_submission.c b/drivers/misc/habanalabs/common/command_submission.c index b9fed6b6d1ab..7073fa6b9f0f 100644 --- a/drivers/misc/habanalabs/common/command_submission.c +++ b/drivers/misc/habanalabs/common/command_submission.c @@ -2845,6 +2845,106 @@ static int hl_cs_wait_ioctl(struct hl_fpriv *hpriv, void *data) } static int _hl_interrupt_wait_ioctl(struct hl_device *hdev, struct hl_ctx *ctx, + struct hl_cb_mgr *cb_mgr, u64 timeout_us, + u64 cq_counters_handle, u64 cq_counters_offset, + u64 target_value, struct hl_user_interrupt *interrupt, + u32 *status, + u64 *timestamp) +{ + struct hl_user_pending_interrupt *pend; + unsigned long timeout, flags; + long completion_rc; + struct hl_cb *cb; + int rc = 0; + u32 handle; + + timeout = hl_usecs64_to_jiffies(timeout_us); + + hl_ctx_get(hdev, ctx); + + cq_counters_handle >>= PAGE_SHIFT; + handle = (u32) cq_counters_handle; + + cb = hl_cb_get(hdev, cb_mgr, handle); + if (!cb) { + hl_ctx_put(ctx); + return -EINVAL; + } + + pend = kzalloc(sizeof(*pend), GFP_KERNEL); + if (!pend) { + hl_cb_put(cb); + hl_ctx_put(ctx); + return -ENOMEM; + } + + hl_fence_init(&pend->fence, ULONG_MAX); + + pend->cq_kernel_addr = (u64 *) cb->kernel_address + cq_counters_offset; + pend->cq_target_value = target_value; + + /* We check for completion value as interrupt could have been received + * before we added the node to the wait list + */ + if (*pend->cq_kernel_addr >= target_value) { + *status = HL_WAIT_CS_STATUS_COMPLETED; + /* There was no interrupt, we assume the completion is now. 
*/ + pend->fence.timestamp = ktime_get(); + } + + if (!timeout_us || (*status == HL_WAIT_CS_STATUS_COMPLETED)) + goto set_timestamp; + + /* Add pending user interrupt to relevant list for the interrupt + * handler to monitor + */ + spin_lock_irqsave(&interrupt->wait_list_lock, flags); + list_add_tail(&pend->wait_list_node, &interrupt->wait_list_head); + spin_unlock_irqrestore(&interrupt->wait_list_lock, flags); + + /* Wait for interrupt handler to signal completion */ + completion_rc = wait_for_completion_interruptible_timeout(&pend->fence.completion, + timeout); + if (completion_rc > 0) { + *status = HL_WAIT_CS_STATUS_COMPLETED; + } else { + if (completion_rc == -ERESTARTSYS) { + dev_err_ratelimited(hdev->dev, + "user process got signal while waiting for interrupt ID %d\n", + interrupt->interrupt_id); + rc = -EINTR; + *status = HL_WAIT_CS_STATUS_ABORTED; + } else { + if (pend->fence.error == -EIO) { + dev_err_ratelimited(hdev->dev, + "interrupt based wait ioctl aborted(error:%d) due to a reset cycle initiated\n", + pend->fence.error); + rc = -EIO; + *status = HL_WAIT_CS_STATUS_ABORTED; + } else { + dev_err_ratelimited(hdev->dev, "Waiting for interrupt ID %d timedout\n", + interrupt->interrupt_id); + rc = -ETIMEDOUT; + } + *status = HL_WAIT_CS_STATUS_BUSY; + } + } + + spin_lock_irqsave(&interrupt->wait_list_lock, flags); + list_del(&pend->wait_list_node); + spin_unlock_irqrestore(&interrupt->wait_list_lock, flags); + +set_timestamp: + *timestamp = ktime_to_ns(pend->fence.timestamp); + + kfree(pend); + hl_cb_put(cb); + hl_ctx_put(ctx); + + return rc; +} + +static int _hl_interrupt_wait_ioctl_user_addr(struct hl_device *hdev, struct hl_ctx *ctx, u64 timeout_us, u64 user_address, u64 target_value, struct hl_user_interrupt *interrupt, @@ -2861,7 +2961,7 @@ static int _hl_interrupt_wait_ioctl(struct hl_device *hdev, struct hl_ctx *ctx, hl_ctx_get(hdev, ctx); - pend = kmalloc(sizeof(*pend), GFP_KERNEL); + pend = kzalloc(sizeof(*pend), GFP_KERNEL); if (!pend) { 
hl_ctx_put(ctx); return -ENOMEM; @@ -2990,7 +3090,14 @@ static int hl_interrupt_wait_ioctl(struct hl_fpriv *hpriv, void *data) else interrupt = &hdev->user_interrupt[interrupt_id - first_interrupt]; - rc = _hl_interrupt_wait_ioctl(hdev, hpriv->ctx, + if (args->in.flags & HL_WAIT_CS_FLAGS_INTERRUPT_KERNEL_CQ) + rc = _hl_interrupt_wait_ioctl(hdev, hpriv->ctx, &hpriv->cb_mgr, + args->in.interrupt_timeout_us, args->in.cq_counters_handle, + args->in.cq_counters_offset, + args->in.target, interrupt, &status, + &timestamp); + else + rc = _hl_interrupt_wait_ioctl_user_addr(hdev, hpriv->ctx, args->in.interrupt_timeout_us, args->in.addr, args->in.target, interrupt, &status, &timestamp); diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h index 4d4986177776..78772fe548b9 100644 --- a/drivers/misc/habanalabs/common/habanalabs.h +++ b/drivers/misc/habanalabs/common/habanalabs.h @@ -876,10 +876,15 @@ struct hl_user_interrupt { * pending on an interrupt * @wait_list_node: node in the list of user threads pending on an interrupt * @fence: hl fence object for interrupt completion + * @cq_target_value: CQ target value + * @cq_kernel_addr: CQ kernel address, to be used in the cq interrupt + * handler for taget value comparison */ struct hl_user_pending_interrupt { struct list_head wait_list_node; struct hl_fence fence; + u64 cq_target_value; + u64 *cq_kernel_addr; }; /** diff --git a/drivers/misc/habanalabs/common/irq.c b/drivers/misc/habanalabs/common/irq.c index 64e0d9de21bd..6454ea12bf3a 100644 --- a/drivers/misc/habanalabs/common/irq.c +++ b/drivers/misc/habanalabs/common/irq.c @@ -145,8 +145,12 @@ static void handle_user_cq(struct hl_device *hdev, spin_lock(&user_cq->wait_list_lock); list_for_each_entry(pend, &user_cq->wait_list_head, wait_list_node) { - pend->fence.timestamp = now; - complete_all(&pend->fence.completion); + if ((pend->cq_kernel_addr && + *(pend->cq_kernel_addr) >= pend->cq_target_value) || + !pend->cq_kernel_addr) { + 
pend->fence.timestamp = now; + complete_all(&pend->fence.completion); + } } spin_unlock(&user_cq->wait_list_lock); } diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h index 648850b954a3..371dfc4243b3 100644 --- a/include/uapi/misc/habanalabs.h +++ b/include/uapi/misc/habanalabs.h @@ -680,7 +680,10 @@ struct hl_info_args { #define HL_MAX_CB_SIZE (0x200000 - 32) /* Indicates whether the command buffer should be mapped to the device's MMU */ -#define HL_CB_FLAGS_MAP 0x1 +#define HL_CB_FLAGS_MAP 0x1 + +/* Used with HL_CB_OP_INFO opcode to get the device va address for kernel mapped CB */ +#define HL_CB_FLAGS_GET_DEVICE_VA 0x2 struct hl_cb_in { /* Handle of CB or 0 if we want to create one */ @@ -702,11 +705,16 @@ struct hl_cb_out { /* Handle of CB */ __u64 cb_handle; - /* Information about CB */ - struct { - /* Usage count of CB */ - __u32 usage_cnt; - __u32 pad; + union { + /* Information about CB */ + struct { + /* Usage count of CB */ + __u32 usage_cnt; + __u32 pad; + }; + + /* CB mapped address to device MMU */ + __u64 device_va; }; }; }; @@ -947,9 +955,10 @@ union hl_cs_args { struct hl_cs_out out; }; -#define HL_WAIT_CS_FLAGS_INTERRUPT 0x2 -#define HL_WAIT_CS_FLAGS_INTERRUPT_MASK 0xFFF00000 -#define HL_WAIT_CS_FLAGS_MULTI_CS 0x4 +#define HL_WAIT_CS_FLAGS_INTERRUPT 0x2 +#define HL_WAIT_CS_FLAGS_INTERRUPT_MASK 0xFFF00000 +#define HL_WAIT_CS_FLAGS_MULTI_CS 0x4 +#define HL_WAIT_CS_FLAGS_INTERRUPT_KERNEL_CQ 0x10 #define HL_WAIT_MULTI_CS_LIST_MAX_LEN 32 @@ -969,14 +978,23 @@ struct hl_wait_cs_in { }; struct { - /* User address for completion comparison. - * upon interrupt, driver will compare the value pointed - * by this address with the supplied target value. - * in order not to perform any comparison, set address - * to all 1s. - * Relevant only when HL_WAIT_CS_FLAGS_INTERRUPT is set - */ - __u64 addr; + union { + /* User address for completion comparison. 
+ * upon interrupt, driver will compare the value pointed + * by this address with the supplied target value. + * in order not to perform any comparison, set address + * to all 1s. + * Relevant only when HL_WAIT_CS_FLAGS_INTERRUPT is set + */ + __u64 addr; + + /* cq_counters_handle to a kernel mapped cb which contains + * cq counters. + * Relevant only when HL_WAIT_CS_FLAGS_INTERRUPT_KERNEL_CQ is set + */ + __u64 cq_counters_handle; + }; + /* Target value for completion comparison */ __u64 target; }; @@ -1004,6 +1022,15 @@ struct hl_wait_cs_in { */ __u64 interrupt_timeout_us; }; + + /* + * cq counter offset inside the counters cb pointed by cq_counters_handle above. + * upon interrupt, driver will compare the value pointed + * by this address (cq_counters_handle + cq_counters_offset) + * with the supplied target value. + * relevant only when HL_WAIT_CS_FLAGS_INTERRUPT_KERNEL_CQ is set + */ + __u64 cq_counters_offset; }; #define HL_WAIT_CS_STATUS_COMPLETED 0 -- cgit v1.2.3 From a7224c21161b3576cb6875ac86f5ba5e757e4fce Mon Sep 17 00:00:00 2001 From: Ofir Bitton Date: Wed, 15 Dec 2021 14:48:27 +0200 Subject: habanalabs: fix endianness when reading cpld version Current sysfs implementation does not take endianness into consideration when dumping the cpld version. 
Signed-off-by: Ofir Bitton Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/common/sysfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/misc') diff --git a/drivers/misc/habanalabs/common/sysfs.c b/drivers/misc/habanalabs/common/sysfs.c index 2f6de734ce37..1af568e46f46 100644 --- a/drivers/misc/habanalabs/common/sysfs.c +++ b/drivers/misc/habanalabs/common/sysfs.c @@ -139,7 +139,7 @@ static ssize_t cpld_ver_show(struct device *dev, struct device_attribute *attr, struct hl_device *hdev = dev_get_drvdata(dev); return sprintf(buf, "0x%08x\n", - hdev->asic_prop.cpucp_info.cpld_version); + le32_to_cpu(hdev->asic_prop.cpucp_info.cpld_version)); } static ssize_t cpucp_kernel_ver_show(struct device *dev, -- cgit v1.2.3 From 0a63ac769b4cb79dfe68efd06528e9174fb88162 Mon Sep 17 00:00:00 2001 From: Ofir Bitton Date: Sun, 19 Dec 2021 11:38:01 +0200 Subject: habanalabs: fix comments according to kernel-doc Fix missing fields, descriptions not according to kernel-doc style. Signed-off-by: Ofir Bitton Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/common/memory.c | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) (limited to 'drivers/misc') diff --git a/drivers/misc/habanalabs/common/memory.c b/drivers/misc/habanalabs/common/memory.c index 315594e96dcd..e5f7b23cbf94 100644 --- a/drivers/misc/habanalabs/common/memory.c +++ b/drivers/misc/habanalabs/common/memory.c @@ -518,7 +518,7 @@ static int add_va_block_locked(struct hl_device *hdev, /** * add_va_block() - wrapper for add_va_block_locked. * @hdev: pointer to the habanalabs device structure. - * @va_list: pointer to the virtual addresses block list. + * @va_range: pointer to the virtual addresses range object. * @start: start virtual address. * @end: end virtual address. 
* @@ -538,8 +538,11 @@ static inline int add_va_block(struct hl_device *hdev, } /** - * is_hint_crossing_range() - check if hint address crossing specified reserved - * range. + * is_hint_crossing_range() - check if hint address crossing specified reserved. + * @range_type: virtual space range type. + * @start_addr: start virtual address. + * @size: block size. + * @prop: asic properties structure to retrieve reserved ranges from. */ static inline bool is_hint_crossing_range(enum hl_va_range_type range_type, u64 start_addr, u32 size, struct asic_fixed_properties *prop) { @@ -749,6 +752,7 @@ u64 hl_reserve_va_block(struct hl_device *hdev, struct hl_ctx *ctx, /** * hl_get_va_range_type() - get va_range type for the given address and size. + * @ctx: context to fetch va_range from. * @address: the start address of the area we want to validate. * @size: the size in bytes of the area we want to validate. * @type: returned va_range type. @@ -776,8 +780,8 @@ static int hl_get_va_range_type(struct hl_ctx *ctx, u64 address, u64 size, * hl_unreserve_va_block() - wrapper for add_va_block to unreserve a va block. * @hdev: pointer to the habanalabs device structure * @ctx: pointer to the context structure. - * @start: start virtual address. - * @end: end virtual address. + * @start_addr: start virtual address. + * @size: number of bytes to unreserve. * * This function does the following: * - Takes the list lock and calls add_va_block_locked. @@ -2329,6 +2333,8 @@ void hl_userptr_delete_list(struct hl_device *hdev, /** * hl_userptr_is_pinned() - returns whether the given userptr is pinned. * @hdev: pointer to the habanalabs device structure. + * @addr: user address to check. + * @size: user block size to check. * @userptr_list: pointer to the list to clear. * @userptr: pointer to userptr to check. * @@ -2351,9 +2357,10 @@ bool hl_userptr_is_pinned(struct hl_device *hdev, u64 addr, /** * va_range_init() - initialize virtual addresses range. 
* @hdev: pointer to the habanalabs device structure. - * @va_range: pointer to the range to initialize. + * @va_ranges: pointer to va_ranges array. * @start: range start address. * @end: range end address. + * @page_size: page size for this va_range. * * This function does the following: * - Initializes the virtual addresses list of the given range with the given @@ -2410,7 +2417,7 @@ static int va_range_init(struct hl_device *hdev, struct hl_va_range *va_range, /** * va_range_fini() - clear a virtual addresses range. * @hdev: pointer to the habanalabs structure. - * va_range: pointer to virtual addresses rang.e + * @va_range: pointer to virtual addresses range. * * This function does the following: * - Frees the virtual addresses block list and its lock. @@ -2430,12 +2437,15 @@ static void va_range_fini(struct hl_device *hdev, struct hl_va_range *va_range) * @ctx: pointer to the habanalabs context structure. * @host_range_start: host virtual addresses range start. * @host_range_end: host virtual addresses range end. + * @host_page_size: host page size. * @host_huge_range_start: host virtual addresses range start for memory * allocated with huge pages. * @host_huge_range_end: host virtual addresses range end for memory allocated * with huge pages. + * @host_huge_page_size: host huge page size. * @dram_range_start: dram virtual addresses range start. * @dram_range_end: dram virtual addresses range end. + * @dram_page_size: dram page size. * * This function initializes the following: * - MMU for context. -- cgit v1.2.3 From 519f4ed0a09cdf3834c5cbde1416acd9a979a709 Mon Sep 17 00:00:00 2001 From: Oded Gabbay Date: Sun, 19 Dec 2021 16:06:59 +0200 Subject: habanalabs: replace some -ENOTTY with -EINVAL -ENOTTY is returned in case of error in the ioctl arguments themselves, such as function that doesn't exists. 
In all other cases, where the error is in the arguments of the custom data structures that we define that are passed in the various ioctls, we need to return -EINVAL. Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/common/command_buffer.c | 2 +- drivers/misc/habanalabs/common/habanalabs_ioctl.c | 4 ++-- drivers/misc/habanalabs/common/memory.c | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) (limited to 'drivers/misc') diff --git a/drivers/misc/habanalabs/common/command_buffer.c b/drivers/misc/habanalabs/common/command_buffer.c index d4eb9fb9ea12..e7534b5129fa 100644 --- a/drivers/misc/habanalabs/common/command_buffer.c +++ b/drivers/misc/habanalabs/common/command_buffer.c @@ -475,7 +475,7 @@ int hl_cb_ioctl(struct hl_fpriv *hpriv, void *data) break; default: - rc = -ENOTTY; + rc = -EINVAL; break; } diff --git a/drivers/misc/habanalabs/common/habanalabs_ioctl.c b/drivers/misc/habanalabs/common/habanalabs_ioctl.c index f571641c19ae..7ddf70a0ca8a 100644 --- a/drivers/misc/habanalabs/common/habanalabs_ioctl.c +++ b/drivers/misc/habanalabs/common/habanalabs_ioctl.c @@ -693,7 +693,7 @@ static int _hl_info_ioctl(struct hl_fpriv *hpriv, void *data, default: dev_err(dev, "Invalid request %d\n", args->op); - rc = -ENOTTY; + rc = -EINVAL; break; } @@ -748,7 +748,7 @@ static int hl_debug_ioctl(struct hl_fpriv *hpriv, void *data) default: dev_err(hdev->dev, "Invalid request %d\n", args->op); - rc = -ENOTTY; + rc = -EINVAL; break; } diff --git a/drivers/misc/habanalabs/common/memory.c b/drivers/misc/habanalabs/common/memory.c index e5f7b23cbf94..b8596846f3dc 100644 --- a/drivers/misc/habanalabs/common/memory.c +++ b/drivers/misc/habanalabs/common/memory.c @@ -2031,7 +2031,7 @@ static int mem_ioctl_no_mmu(struct hl_fpriv *hpriv, union hl_mem_args *args) default: dev_err(hdev->dev, "Unknown opcode for memory IOCTL\n"); - rc = -ENOTTY; + rc = -EINVAL; break; } @@ -2156,7 +2156,7 @@ int hl_mem_ioctl(struct hl_fpriv *hpriv, void *data) default: dev_err(hdev->dev, 
"Unknown opcode for memory IOCTL\n"); - rc = -ENOTTY; + rc = -EINVAL; break; } -- cgit v1.2.3 From f297a0e9fe7d4b4d8a24d2ce97446f2faaf9d51b Mon Sep 17 00:00:00 2001 From: Tomer Tayar Date: Thu, 16 Dec 2021 16:31:18 +0200 Subject: habanalabs: add CPU-CP packet for engine core ASID cfg In some cases the driver cannot configure ASID of some engines due to the security level of the relevant registers. For this a new CPU-CP packet is introduced, which will allow the driver to ask the F/W to do this configuration instead. Signed-off-by: Tomer Tayar Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/common/firmware_if.c | 20 ++++++++++++++++++++ drivers/misc/habanalabs/common/habanalabs.h | 1 + drivers/misc/habanalabs/include/common/cpucp_if.h | 5 +++++ 3 files changed, 26 insertions(+) (limited to 'drivers/misc') diff --git a/drivers/misc/habanalabs/common/firmware_if.c b/drivers/misc/habanalabs/common/firmware_if.c index 1d0d228d4872..2cc2015c2416 100644 --- a/drivers/misc/habanalabs/common/firmware_if.c +++ b/drivers/misc/habanalabs/common/firmware_if.c @@ -1059,6 +1059,26 @@ out: return rc; } +int hl_fw_cpucp_engine_core_asid_set(struct hl_device *hdev, u32 asid) +{ + struct cpucp_packet pkt; + int rc; + + memset(&pkt, 0, sizeof(pkt)); + + pkt.ctl = cpu_to_le32(CPUCP_PACKET_ENGINE_CORE_ASID_SET << CPUCP_PKT_CTL_OPCODE_SHIFT); + pkt.value = cpu_to_le64(asid); + + rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt), + HL_CPUCP_INFO_TIMEOUT_USEC, NULL); + if (rc) + dev_err(hdev->dev, + "Failed on ASID configuration request for engine core, error %d\n", + rc); + + return rc; +} + void hl_fw_ask_hard_reset_without_linux(struct hl_device *hdev) { struct static_fw_load_mgr *static_loader = diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h index 78772fe548b9..fc1bdc07a169 100644 --- a/drivers/misc/habanalabs/common/habanalabs.h +++ 
b/drivers/misc/habanalabs/common/habanalabs.h @@ -3065,6 +3065,7 @@ int hl_fw_dynamic_send_protocol_cmd(struct hl_device *hdev, int hl_fw_dram_replaced_row_get(struct hl_device *hdev, struct cpucp_hbm_row_info *info); int hl_fw_dram_pending_row_get(struct hl_device *hdev, u32 *pend_rows_num); +int hl_fw_cpucp_engine_core_asid_set(struct hl_device *hdev, u32 asid); int hl_pci_bars_map(struct hl_device *hdev, const char * const name[3], bool is_wc[3]); int hl_pci_elbi_read(struct hl_device *hdev, u64 addr, u32 *data); diff --git a/drivers/misc/habanalabs/include/common/cpucp_if.h b/drivers/misc/habanalabs/include/common/cpucp_if.h index 0114cb52faad..737c39f33f05 100644 --- a/drivers/misc/habanalabs/include/common/cpucp_if.h +++ b/drivers/misc/habanalabs/include/common/cpucp_if.h @@ -386,6 +386,9 @@ enum pq_init_status { * * CPUCP_PACKET_POWER_SET - * Resets power history of device to 0 + * + * CPUCP_PACKET_ENGINE_CORE_ASID_SET - + * Packet to perform engine core ASID configuration */ enum cpucp_packet_id { @@ -434,6 +437,8 @@ enum cpucp_packet_id { CPUCP_PACKET_HBM_REPLACED_ROWS_INFO_GET,/* internal */ CPUCP_PACKET_HBM_PENDING_ROWS_STATUS, /* internal */ CPUCP_PACKET_POWER_SET, /* internal */ + CPUCP_PACKET_RESERVED, /* not used */ + CPUCP_PACKET_ENGINE_CORE_ASID_SET, /* internal */ }; #define CPUCP_PACKET_FENCE_VAL 0xFE8CE7A5 -- cgit v1.2.3 From 60bf3bfb5a37965fc33fa00f19a2074dd48077c5 Mon Sep 17 00:00:00 2001 From: Ohad Sharabi Date: Mon, 20 Dec 2021 13:30:35 +0200 Subject: habanalabs: handle skip multi-CS if handling not done This patch fixes issue in which we have timeout for multi-CS although the CS in the list actually completed. Example scenario (the two threads marked as WAIT for the thread that handles the wait_for_multi_cs and CMPL as the thread that signal completion for both CS and multi-CS): 1. Submit CS with sequence X 2. [WAIT]: call wait_for_multi_cs with single CS X 3. 
[CMPL]: CS X invokes complete_all for both CS and multi-CS (mcs_handling_done is still false) 4. [WAIT]: enters poll_fences, reinits the completion and finds the CS as completed when asking on the fence, but since mcs_handling_done is still false it returns that no CS actually completed 5. [CMPL]: sets mcs_handling_done to true 6. [WAIT]: waits for completion, but there is no CS left to wake the wait context and hence it waits until timeout Solution: if a CS is detected as completed in poll_fences but mcs_handling_done is still false, invoke complete_all on the multi-CS completion so that it will not go to sleep in wait_for_completion but rather will have a "second chance" to wait for mcs_handling_done. Signed-off-by: Ohad Sharabi Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/common/command_submission.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'drivers/misc') diff --git a/drivers/misc/habanalabs/common/command_submission.c b/drivers/misc/habanalabs/common/command_submission.c index 7073fa6b9f0f..d39343f90bc2 100644 --- a/drivers/misc/habanalabs/common/command_submission.c +++ b/drivers/misc/habanalabs/common/command_submission.c @@ -2453,9 +2453,19 @@ static int hl_cs_poll_fences(struct multi_cs_data *mcs_data, struct multi_cs_com * returns to user indicating CS completed before it finished * all of its mcs handling, to avoid race the next time the * user waits for mcs. 
+ * note: when reaching this case fence is definitely not NULL + * but NULL check was added to overcome static analysis */ - if (!fence->mcs_handling_done) + if (fence && !fence->mcs_handling_done) { + /* + * in case multi CS is completed but MCS handling not done + * we "complete" the multi CS to prevent it from waiting + * until time-out and the "multi-CS handling done" will have + * another chance at the next iteration + */ + complete_all(&mcs_compl->completion); break; + } mcs_data->completion_bitmap |= BIT(i); /* -- cgit v1.2.3 From eb135291912f7554e2a2472befc44818098baa8d Mon Sep 17 00:00:00 2001 From: Ofir Bitton Date: Tue, 23 Nov 2021 15:15:22 +0200 Subject: habanalabs: refactor reset information variables Unify variables related to device reset, which will help us to add some new reset functionality in future patches. Signed-off-by: Ofir Bitton Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/common/command_buffer.c | 2 +- .../misc/habanalabs/common/command_submission.c | 4 +- drivers/misc/habanalabs/common/debugfs.c | 18 ++--- drivers/misc/habanalabs/common/device.c | 76 +++++++++++---------- drivers/misc/habanalabs/common/firmware_if.c | 6 +- drivers/misc/habanalabs/common/habanalabs.h | 79 ++++++++++++---------- drivers/misc/habanalabs/common/habanalabs_drv.c | 4 +- drivers/misc/habanalabs/common/habanalabs_ioctl.c | 4 +- drivers/misc/habanalabs/common/irq.c | 2 +- drivers/misc/habanalabs/common/memory.c | 2 +- drivers/misc/habanalabs/common/sysfs.c | 10 +-- drivers/misc/habanalabs/gaudi/gaudi.c | 8 +-- drivers/misc/habanalabs/goya/goya.c | 10 +-- 13 files changed, 119 insertions(+), 106 deletions(-) (limited to 'drivers/misc') diff --git a/drivers/misc/habanalabs/common/command_buffer.c b/drivers/misc/habanalabs/common/command_buffer.c index e7534b5129fa..649380bb189f 100644 --- a/drivers/misc/habanalabs/common/command_buffer.c +++ b/drivers/misc/habanalabs/common/command_buffer.c @@ -250,7 +250,7 @@ int 
hl_cb_create(struct hl_device *hdev, struct hl_cb_mgr *mgr, * Can't use generic function to check this because of special case * where we create a CB as part of the reset process */ - if ((hdev->disabled) || ((atomic_read(&hdev->in_reset)) && + if ((hdev->disabled) || ((atomic_read(&hdev->reset_info.in_reset)) && (ctx_id != HL_KERNEL_ASID_ID))) { dev_warn_ratelimited(hdev->dev, "Device is disabled or in reset. Can't create new CBs\n"); diff --git a/drivers/misc/habanalabs/common/command_submission.c b/drivers/misc/habanalabs/common/command_submission.c index d39343f90bc2..0a4ef13d9ac4 100644 --- a/drivers/misc/habanalabs/common/command_submission.c +++ b/drivers/misc/habanalabs/common/command_submission.c @@ -777,7 +777,7 @@ static void cs_timedout(struct work_struct *work) if (hdev->reset_on_lockup) hl_device_reset(hdev, HL_DRV_RESET_TDR); else - hdev->needs_reset = true; + hdev->reset_info.needs_reset = true; } } @@ -814,7 +814,7 @@ static int allocate_cs(struct hl_device *hdev, struct hl_ctx *ctx, cs->encaps_signals = !!(flags & HL_CS_FLAGS_ENCAP_SIGNALS); cs->timeout_jiffies = timeout; cs->skip_reset_on_timeout = - hdev->skip_reset_on_timeout || + hdev->reset_info.skip_reset_on_timeout || !!(flags & HL_CS_FLAGS_SKIP_RESET_ON_TIMEOUT); cs->submission_time_jiffies = jiffies; INIT_LIST_HEAD(&cs->job_list); diff --git a/drivers/misc/habanalabs/common/debugfs.c b/drivers/misc/habanalabs/common/debugfs.c index 2e9c31d79d5e..746d1a18de63 100644 --- a/drivers/misc/habanalabs/common/debugfs.c +++ b/drivers/misc/habanalabs/common/debugfs.c @@ -527,7 +527,7 @@ static int engines_show(struct seq_file *s, void *data) struct hl_dbg_device_entry *dev_entry = entry->dev_entry; struct hl_device *hdev = dev_entry->hdev; - if (atomic_read(&hdev->in_reset)) { + if (atomic_read(&hdev->reset_info.in_reset)) { dev_warn_ratelimited(hdev->dev, "Can't check device idle during reset\n"); return 0; @@ -658,7 +658,7 @@ static ssize_t hl_data_read32(struct file *f, char __user *buf, ssize_t 
rc; u32 val; - if (atomic_read(&hdev->in_reset)) { + if (atomic_read(&hdev->reset_info.in_reset)) { dev_warn_ratelimited(hdev->dev, "Can't read during reset\n"); return 0; } @@ -694,7 +694,7 @@ static ssize_t hl_data_write32(struct file *f, const char __user *buf, u32 value; ssize_t rc; - if (atomic_read(&hdev->in_reset)) { + if (atomic_read(&hdev->reset_info.in_reset)) { dev_warn_ratelimited(hdev->dev, "Can't write during reset\n"); return 0; } @@ -731,7 +731,7 @@ static ssize_t hl_data_read64(struct file *f, char __user *buf, ssize_t rc; u64 val; - if (atomic_read(&hdev->in_reset)) { + if (atomic_read(&hdev->reset_info.in_reset)) { dev_warn_ratelimited(hdev->dev, "Can't read during reset\n"); return 0; } @@ -767,7 +767,7 @@ static ssize_t hl_data_write64(struct file *f, const char __user *buf, u64 value; ssize_t rc; - if (atomic_read(&hdev->in_reset)) { + if (atomic_read(&hdev->reset_info.in_reset)) { dev_warn_ratelimited(hdev->dev, "Can't write during reset\n"); return 0; } @@ -802,7 +802,7 @@ static ssize_t hl_dma_size_write(struct file *f, const char __user *buf, ssize_t rc; u32 size; - if (atomic_read(&hdev->in_reset)) { + if (atomic_read(&hdev->reset_info.in_reset)) { dev_warn_ratelimited(hdev->dev, "Can't DMA during reset\n"); return 0; } @@ -1077,7 +1077,7 @@ static ssize_t hl_clk_gate_write(struct file *f, const char __user *buf, u64 value; ssize_t rc; - if (atomic_read(&hdev->in_reset)) { + if (atomic_read(&hdev->reset_info.in_reset)) { dev_warn_ratelimited(hdev->dev, "Can't change clock gating during reset\n"); return 0; @@ -1119,7 +1119,7 @@ static ssize_t hl_stop_on_err_write(struct file *f, const char __user *buf, u32 value; ssize_t rc; - if (atomic_read(&hdev->in_reset)) { + if (atomic_read(&hdev->reset_info.in_reset)) { dev_warn_ratelimited(hdev->dev, "Can't change stop on error during reset\n"); return 0; @@ -1497,7 +1497,7 @@ void hl_debugfs_add_device(struct hl_device *hdev) debugfs_create_x8("skip_reset_on_timeout", 0644, dev_entry->root, - 
&hdev->skip_reset_on_timeout); + &hdev->reset_info.skip_reset_on_timeout); debugfs_create_file("state_dump", 0600, diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c index f1f482c5cdcb..f8f9eb7a934f 100644 --- a/drivers/misc/habanalabs/common/device.c +++ b/drivers/misc/habanalabs/common/device.c @@ -17,9 +17,9 @@ enum hl_device_status hl_device_status(struct hl_device *hdev) { enum hl_device_status status; - if (atomic_read(&hdev->in_reset)) + if (atomic_read(&hdev->reset_info.in_reset)) status = HL_DEVICE_STATUS_IN_RESET; - else if (hdev->needs_reset) + else if (hdev->reset_info.needs_reset) status = HL_DEVICE_STATUS_NEEDS_RESET; else if (hdev->disabled) status = HL_DEVICE_STATUS_MALFUNCTION; @@ -452,7 +452,7 @@ static int device_early_init(struct hl_device *hdev) INIT_LIST_HEAD(&hdev->fpriv_ctrl_list); mutex_init(&hdev->fpriv_list_lock); mutex_init(&hdev->fpriv_ctrl_list_lock); - atomic_set(&hdev->in_reset, 0); + atomic_set(&hdev->reset_info.in_reset, 0); mutex_init(&hdev->clk_throttling.lock); return 0; @@ -544,8 +544,8 @@ reschedule: * status for at least one heartbeat. From this point driver restarts * tracking future consecutive fatal errors. 
*/ - if (!(atomic_read(&hdev->in_reset))) - hdev->prev_reset_trigger = HL_RESET_TRIGGER_DEFAULT; + if (!(atomic_read(&hdev->reset_info.in_reset))) + hdev->reset_info.prev_reset_trigger = HL_RESET_TRIGGER_DEFAULT; schedule_delayed_work(&hdev->work_heartbeat, usecs_to_jiffies(HL_HEARTBEAT_PER_USEC)); @@ -639,12 +639,12 @@ int hl_device_set_debug_mode(struct hl_device *hdev, struct hl_ctx *ctx, bool en goto out; } - if (!hdev->hard_reset_pending) + if (!hdev->reset_info.hard_reset_pending) hdev->asic_funcs->halt_coresight(hdev, ctx); hdev->in_debug = 0; - if (!hdev->hard_reset_pending) + if (!hdev->reset_info.hard_reset_pending) hdev->asic_funcs->set_clock_gating(hdev); goto out; @@ -722,7 +722,7 @@ int hl_device_suspend(struct hl_device *hdev) pci_save_state(hdev->pdev); /* Block future CS/VM/JOB completion operations */ - rc = atomic_cmpxchg(&hdev->in_reset, 0, 1); + rc = atomic_cmpxchg(&hdev->reset_info.in_reset, 0, 1); if (rc) { dev_err(hdev->dev, "Can't suspend while in reset\n"); return -EIO; @@ -777,7 +777,7 @@ int hl_device_resume(struct hl_device *hdev) hdev->disabled = false; - atomic_set(&hdev->in_reset, 0); + atomic_set(&hdev->reset_info.in_reset, 0); rc = hl_device_reset(hdev, HL_DRV_RESET_HARD); if (rc) { @@ -906,16 +906,16 @@ static void handle_reset_trigger(struct hl_device *hdev, u32 flags) * 'reset_cause' will continue holding its 1st recorded reason! 
*/ if (flags & HL_DRV_RESET_HEARTBEAT) { - hdev->curr_reset_cause = HL_RESET_CAUSE_HEARTBEAT; + hdev->reset_info.curr_reset_cause = HL_RESET_CAUSE_HEARTBEAT; cur_reset_trigger = HL_DRV_RESET_HEARTBEAT; } else if (flags & HL_DRV_RESET_TDR) { - hdev->curr_reset_cause = HL_RESET_CAUSE_TDR; + hdev->reset_info.curr_reset_cause = HL_RESET_CAUSE_TDR; cur_reset_trigger = HL_DRV_RESET_TDR; } else if (flags & HL_DRV_RESET_FW_FATAL_ERR) { - hdev->curr_reset_cause = HL_RESET_CAUSE_UNKNOWN; + hdev->reset_info.curr_reset_cause = HL_RESET_CAUSE_UNKNOWN; cur_reset_trigger = HL_DRV_RESET_FW_FATAL_ERR; } else { - hdev->curr_reset_cause = HL_RESET_CAUSE_UNKNOWN; + hdev->reset_info.curr_reset_cause = HL_RESET_CAUSE_UNKNOWN; } /* @@ -923,11 +923,11 @@ static void handle_reset_trigger(struct hl_device *hdev, u32 flags) * is set and if this reset is due to a fatal FW error * device is set to an unstable state. */ - if (hdev->prev_reset_trigger != cur_reset_trigger) { - hdev->prev_reset_trigger = cur_reset_trigger; - hdev->reset_trigger_repeated = 0; + if (hdev->reset_info.prev_reset_trigger != cur_reset_trigger) { + hdev->reset_info.prev_reset_trigger = cur_reset_trigger; + hdev->reset_info.reset_trigger_repeated = 0; } else { - hdev->reset_trigger_repeated = 1; + hdev->reset_info.reset_trigger_repeated = 1; } /* If reset is due to heartbeat, device CPU is no responsive in @@ -987,7 +987,7 @@ int hl_device_reset(struct hl_device *hdev, u32 flags) from_hard_reset_thread = !!(flags & HL_DRV_RESET_FROM_RESET_THR); fw_reset = !!(flags & HL_DRV_RESET_BYPASS_REQ_TO_FW); - if (!hard_reset && !hdev->supports_soft_reset) { + if (!hard_reset && !hdev->asic_prop.supports_soft_reset) { hard_instead_soft = true; hard_reset = true; } @@ -1004,7 +1004,7 @@ int hl_device_reset(struct hl_device *hdev, u32 flags) goto do_reset; } - if (!hard_reset && !hdev->allow_inference_soft_reset) { + if (!hard_reset && !hdev->asic_prop.allow_inference_soft_reset) { hard_instead_soft = true; hard_reset = true; } @@ 
-1024,13 +1024,14 @@ do_reset: */ if (!from_hard_reset_thread) { /* Block future CS/VM/JOB completion operations */ - rc = atomic_cmpxchg(&hdev->in_reset, 0, 1); + rc = atomic_cmpxchg(&hdev->reset_info.in_reset, 0, 1); if (rc) return 0; handle_reset_trigger(hdev, flags); - hdev->is_in_soft_reset = !hard_reset; + /* This still allows the completion of some KDMA ops */ + hdev->reset_info.is_in_soft_reset = !hard_reset; /* This also blocks future CS/VM/JOB completion operations */ hdev->disabled = true; @@ -1047,7 +1048,7 @@ do_reset: again: if ((hard_reset) && (!from_hard_reset_thread)) { - hdev->hard_reset_pending = true; + hdev->reset_info.hard_reset_pending = true; hdev->process_kill_trial_cnt = 0; @@ -1128,10 +1129,11 @@ kill_processes: if (hard_reset) { hdev->device_cpu_disabled = false; - hdev->hard_reset_pending = false; + hdev->reset_info.hard_reset_pending = false; - if (hdev->reset_trigger_repeated && - (hdev->prev_reset_trigger == HL_DRV_RESET_FW_FATAL_ERR)) { + if (hdev->reset_info.reset_trigger_repeated && + (hdev->reset_info.prev_reset_trigger == + HL_DRV_RESET_FW_FATAL_ERR)) { /* if there 2 back to back resets from FW, * ensure driver puts the driver in a unusable state */ @@ -1182,7 +1184,7 @@ kill_processes: * is required for the initialization itself */ hdev->disabled = false; - hdev->is_in_soft_reset = false; + hdev->reset_info.is_in_soft_reset = false; rc = hdev->asic_funcs->hw_init(hdev); if (rc) { @@ -1232,13 +1234,13 @@ kill_processes: } } - atomic_set(&hdev->in_reset, 0); - hdev->needs_reset = false; + atomic_set(&hdev->reset_info.in_reset, 0); + hdev->reset_info.needs_reset = false; dev_notice(hdev->dev, "Successfully finished resetting the device\n"); if (hard_reset) { - hdev->hard_reset_cnt++; + hdev->reset_info.hard_reset_cnt++; /* After reset is done, we are ready to receive events from * the F/W. 
We can't do it before because we will ignore events @@ -1247,30 +1249,30 @@ kill_processes: */ hdev->asic_funcs->enable_events_from_fw(hdev); } else if (!reset_upon_device_release) { - hdev->soft_reset_cnt++; + hdev->reset_info.soft_reset_cnt++; } return 0; out_err: hdev->disabled = true; - hdev->is_in_soft_reset = false; + hdev->reset_info.is_in_soft_reset = false; if (hard_reset) { dev_err(hdev->dev, "Failed to reset! Device is NOT usable\n"); - hdev->hard_reset_cnt++; + hdev->reset_info.hard_reset_cnt++; } else if (reset_upon_device_release) { dev_err(hdev->dev, "Failed to reset device after user release\n"); hard_reset = true; goto again; } else { dev_err(hdev->dev, "Failed to do soft-reset\n"); - hdev->soft_reset_cnt++; + hdev->reset_info.soft_reset_cnt++; hard_reset = true; goto again; } - atomic_set(&hdev->in_reset, 0); + atomic_set(&hdev->reset_info.in_reset, 0); return rc; } @@ -1604,10 +1606,10 @@ void hl_device_fini(struct hl_device *hdev) */ timeout = ktime_add_us(ktime_get(), reset_sec * 1000 * 1000); - rc = atomic_cmpxchg(&hdev->in_reset, 0, 1); + rc = atomic_cmpxchg(&hdev->reset_info.in_reset, 0, 1); while (rc) { usleep_range(50, 200); - rc = atomic_cmpxchg(&hdev->in_reset, 0, 1); + rc = atomic_cmpxchg(&hdev->reset_info.in_reset, 0, 1); if (ktime_compare(ktime_get(), timeout) > 0) { dev_crit(hdev->dev, "Failed to remove device because reset function did not finish\n"); @@ -1629,7 +1631,7 @@ void hl_device_fini(struct hl_device *hdev) take_release_locks(hdev); - hdev->hard_reset_pending = true; + hdev->reset_info.hard_reset_pending = true; hl_hwmon_fini(hdev); diff --git a/drivers/misc/habanalabs/common/firmware_if.c b/drivers/misc/habanalabs/common/firmware_if.c index 2cc2015c2416..6775c5c3166b 100644 --- a/drivers/misc/habanalabs/common/firmware_if.c +++ b/drivers/misc/habanalabs/common/firmware_if.c @@ -2371,14 +2371,14 @@ static int hl_fw_dynamic_init_cpu(struct hl_device *hdev, if (rc) goto protocol_err; - if (hdev->curr_reset_cause) { + if 
(hdev->reset_info.curr_reset_cause) { rc = hl_fw_dynamic_send_msg(hdev, fw_loader, - HL_COMMS_RESET_CAUSE_TYPE, &hdev->curr_reset_cause); + HL_COMMS_RESET_CAUSE_TYPE, &hdev->reset_info.curr_reset_cause); if (rc) goto protocol_err; /* Clear current reset cause */ - hdev->curr_reset_cause = HL_RESET_CAUSE_UNKNOWN; + hdev->reset_info.curr_reset_cause = HL_RESET_CAUSE_UNKNOWN; } if (!(hdev->fw_components & FW_TYPE_BOOT_CPU)) { diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h index fc1bdc07a169..47eaeff9e924 100644 --- a/drivers/misc/habanalabs/common/habanalabs.h +++ b/drivers/misc/habanalabs/common/habanalabs.h @@ -547,6 +547,13 @@ struct hl_hints_range { * false otherwise. * @use_get_power_for_reset_history: To support backward compatibility for Goya * and Gaudi + * @supports_soft_reset: is soft reset supported. + * @allow_inference_soft_reset: true if the ASIC supports soft reset that is + * initiated by user or TDR. This is only true + * in inference ASICs, as there is no real-world + * use-case of doing soft-reset in training (due + * to the fact that training runs on multiple + * devices) */ struct asic_fixed_properties { struct hw_queue_properties *hw_queues_props; @@ -628,6 +635,8 @@ struct asic_fixed_properties { u8 dynamic_fw_load; u8 gic_interrupts_enable; u8 use_get_power_for_reset_history; + u8 supports_soft_reset; + u8 allow_inference_soft_reset; }; /** @@ -2446,6 +2455,39 @@ struct last_error_session_info { u8 razwi_type; }; +/** + * struct hl_reset_info - holds current device reset information. + * @in_reset: is device in reset flow. + * @soft_reset_cnt: number of soft reset since the driver was loaded. + * @hard_reset_cnt: number of hard reset since the driver was loaded. + * @is_in_soft_reset: Device is currently in soft reset process. + * @needs_reset: true if reset_on_lockup is false and device should be reset + * due to lockup. + * @hard_reset_pending: is there a hard reset work pending. 
+ * @curr_reset_cause: saves an enumerated reset cause when a hard reset is + * triggered, and cleared after it is shared with preboot. + * @prev_reset_trigger: saves the previous trigger which caused a reset, overidden + * with a new value on next reset + * @reset_trigger_repeated: set if device reset is triggered more than once with + * same cause. + * @skip_reset_on_timeout: Skip device reset if CS has timed out, wait for it to + * complete instead. + */ +struct hl_reset_info { + atomic_t in_reset; + u32 soft_reset_cnt; + u32 hard_reset_cnt; + u8 is_in_soft_reset; + u8 needs_reset; + u8 hard_reset_pending; + + u8 curr_reset_cause; + u8 prev_reset_trigger; + u8 reset_trigger_repeated; + + u8 skip_reset_on_timeout; +}; + /** * struct hl_device - habanalabs device structure. * @pdev: pointer to PCI device, can be NULL in case of simulator device. @@ -2514,6 +2556,7 @@ struct last_error_session_info { * @state_dump_specs: constants and dictionaries needed to dump system state. * @multi_cs_completion: array of multi-CS completion. * @clk_throttling: holds information about current/previous clock throttling events + * @reset_info: holds current device reset information. * @last_error: holds information about last session in which CS timeout or razwi error occurred. * @stream_master_qid_arr: pointer to array with QIDs of master streams. * @dram_used_mem: current DRAM memory consumption. @@ -2538,13 +2581,10 @@ struct last_error_session_info { * session. * @open_counter: number of successful device open operations. * @fw_poll_interval_usec: FW status poll interval in usec. - * @in_reset: is device in reset flow. * @card_type: Various ASICs have several card types. This indicates the card * type of the current device. * @major: habanalabs kernel driver major. * @high_pll: high PLL profile frequency. - * @soft_reset_cnt: number of soft reset since the driver was loaded. - * @hard_reset_cnt: number of hard reset since the driver was loaded. * @id: device minor. 
* @id_control: minor of the control device * @cpu_pci_msb_addr: 50-bit extension bits for the device CPU's 40-bit @@ -2552,7 +2592,6 @@ struct last_error_session_info { * @disabled: is device disabled. * @late_init_done: is late init stage was done during initialization. * @hwmon_initialized: is H/W monitor sensors was initialized. - * @hard_reset_pending: is there a hard reset work pending. * @heartbeat: is heartbeat sanity check towards CPU-CP enabled. * @reset_on_lockup: true if a reset should be done in case of stuck CS, false * otherwise. @@ -2575,35 +2614,17 @@ struct last_error_session_info { * @sync_stream_queue_idx: helper index for sync stream queues initialization. * @collective_mon_idx: helper index for collective initialization * @supports_coresight: is CoreSight supported. - * @supports_soft_reset: is soft reset supported. - * @allow_inference_soft_reset: true if the ASIC supports soft reset that is - * initiated by user or TDR. This is only true - * in inference ASICs, as there is no real-world - * use-case of doing soft-reset in training (due - * to the fact that training runs on multiple - * devices) * @supports_cb_mapping: is mapping a CB to the device's MMU supported. - * @needs_reset: true if reset_on_lockup is false and device should be reset - * due to lockup. * @process_kill_trial_cnt: number of trials reset thread tried killing * user processes * @device_fini_pending: true if device_fini was called and might be * waiting for the reset thread to finish * @supports_staged_submission: true if staged submissions are supported - * @curr_reset_cause: saves an enumerated reset cause when a hard reset is - * triggered, and cleared after it is shared with preboot. - * @prev_reset_trigger: saves the previous trigger which caused a reset, overidden - * with a new value on next reset - * @reset_trigger_repeated: set if device reset is triggered more than once with - * same cause. 
- * @skip_reset_on_timeout: Skip device reset if CS has timed out, wait for it to - * complete instead. * @device_cpu_is_halted: Flag to indicate whether the device CPU was already * halted. We can't halt it again because the COMMS * protocol will throw an error. Relevant only for * cases where Linux was not loaded to device CPU * @supports_wait_for_multi_cs: true if wait for multi CS is supported - * @is_in_soft_reset: Device is currently in soft reset process. * @is_compute_ctx_active: Whether there is an active compute context executing. */ struct hl_device { @@ -2678,6 +2699,8 @@ struct hl_device { struct hl_clk_throttle clk_throttling; struct last_error_session_info last_error; + struct hl_reset_info reset_info; + u32 *stream_master_qid_arr; atomic64_t dram_used_mem; u64 timeout_jiffies; @@ -2689,20 +2712,16 @@ struct hl_device { u64 last_open_session_duration_jif; u64 open_counter; u64 fw_poll_interval_usec; - atomic_t in_reset; ktime_t last_successful_open_ktime; enum cpucp_card_types card_type; u32 major; u32 high_pll; - u32 soft_reset_cnt; - u32 hard_reset_cnt; u16 id; u16 id_control; u16 cpu_pci_msb_addr; u8 disabled; u8 late_init_done; u8 hwmon_initialized; - u8 hard_reset_pending; u8 heartbeat; u8 reset_on_lockup; u8 dram_default_page_mapping; @@ -2719,21 +2738,13 @@ struct hl_device { u8 sync_stream_queue_idx; u8 collective_mon_idx; u8 supports_coresight; - u8 supports_soft_reset; - u8 allow_inference_soft_reset; u8 supports_cb_mapping; - u8 needs_reset; u8 process_kill_trial_cnt; u8 device_fini_pending; u8 supports_staged_submission; - u8 curr_reset_cause; - u8 prev_reset_trigger; - u8 reset_trigger_repeated; - u8 skip_reset_on_timeout; u8 device_cpu_is_halted; u8 supports_wait_for_multi_cs; u8 stream_master_qid_arr_size; - u8 is_in_soft_reset; u8 is_compute_ctx_active; /* Parameters for bring-up */ diff --git a/drivers/misc/habanalabs/common/habanalabs_drv.c b/drivers/misc/habanalabs/common/habanalabs_drv.c index aa4e07b1f839..690b763c7a95 100644 --- 
a/drivers/misc/habanalabs/common/habanalabs_drv.c +++ b/drivers/misc/habanalabs/common/habanalabs_drv.c @@ -289,8 +289,8 @@ static int fixup_device_params(struct hl_device *hdev) hdev->fw_poll_interval_usec = HL_FW_STATUS_POLL_INTERVAL_USEC; hdev->stop_on_err = true; - hdev->curr_reset_cause = HL_RESET_CAUSE_UNKNOWN; - hdev->prev_reset_trigger = HL_RESET_TRIGGER_DEFAULT; + hdev->reset_info.curr_reset_cause = HL_RESET_CAUSE_UNKNOWN; + hdev->reset_info.prev_reset_trigger = HL_RESET_TRIGGER_DEFAULT; /* Enable only after the initialization of the device */ hdev->disabled = true; diff --git a/drivers/misc/habanalabs/common/habanalabs_ioctl.c b/drivers/misc/habanalabs/common/habanalabs_ioctl.c index 7ddf70a0ca8a..3ba3a8ffda3e 100644 --- a/drivers/misc/habanalabs/common/habanalabs_ioctl.c +++ b/drivers/misc/habanalabs/common/habanalabs_ioctl.c @@ -269,8 +269,8 @@ static int get_reset_count(struct hl_device *hdev, struct hl_info_args *args) if ((!max_size) || (!out)) return -EINVAL; - reset_count.hard_reset_cnt = hdev->hard_reset_cnt; - reset_count.soft_reset_cnt = hdev->soft_reset_cnt; + reset_count.hard_reset_cnt = hdev->reset_info.hard_reset_cnt; + reset_count.soft_reset_cnt = hdev->reset_info.soft_reset_cnt; return copy_to_user(out, &reset_count, min((size_t) max_size, sizeof(reset_count))) ? 
-EFAULT : 0; diff --git a/drivers/misc/habanalabs/common/irq.c b/drivers/misc/habanalabs/common/irq.c index 6454ea12bf3a..1b6bdc900c26 100644 --- a/drivers/misc/habanalabs/common/irq.c +++ b/drivers/misc/habanalabs/common/irq.c @@ -249,7 +249,7 @@ irqreturn_t hl_irq_handler_eq(int irq, void *arg) */ dma_rmb(); - if (hdev->disabled && !hdev->is_in_soft_reset) { + if (hdev->disabled && !hdev->reset_info.is_in_soft_reset) { dev_warn(hdev->dev, "Device disabled but received an EQ event\n"); goto skip_irq; } diff --git a/drivers/misc/habanalabs/common/memory.c b/drivers/misc/habanalabs/common/memory.c index b8596846f3dc..c1eefaebacb6 100644 --- a/drivers/misc/habanalabs/common/memory.c +++ b/drivers/misc/habanalabs/common/memory.c @@ -2624,7 +2624,7 @@ void hl_vm_ctx_fini(struct hl_ctx *ctx) * Clearly something went wrong on hard reset so no point in printing * another side effect error */ - if (!hdev->hard_reset_pending && !hash_empty(ctx->mem_hash)) + if (!hdev->reset_info.hard_reset_pending && !hash_empty(ctx->mem_hash)) dev_dbg(hdev->dev, "user released device without removing its memory mappings\n"); diff --git a/drivers/misc/habanalabs/common/sysfs.c b/drivers/misc/habanalabs/common/sysfs.c index 1af568e46f46..45c715325e2a 100644 --- a/drivers/misc/habanalabs/common/sysfs.c +++ b/drivers/misc/habanalabs/common/sysfs.c @@ -211,7 +211,7 @@ static ssize_t soft_reset_store(struct device *dev, goto out; } - if (!hdev->allow_inference_soft_reset) { + if (!hdev->asic_prop.allow_inference_soft_reset) { dev_err(hdev->dev, "Device does not support inference soft-reset\n"); goto out; } @@ -303,7 +303,7 @@ static ssize_t soft_reset_cnt_show(struct device *dev, { struct hl_device *hdev = dev_get_drvdata(dev); - return sprintf(buf, "%d\n", hdev->soft_reset_cnt); + return sprintf(buf, "%d\n", hdev->reset_info.soft_reset_cnt); } static ssize_t hard_reset_cnt_show(struct device *dev, @@ -311,7 +311,7 @@ static ssize_t hard_reset_cnt_show(struct device *dev, { struct hl_device 
*hdev = dev_get_drvdata(dev); - return sprintf(buf, "%d\n", hdev->hard_reset_cnt); + return sprintf(buf, "%d\n", hdev->reset_info.hard_reset_cnt); } static ssize_t max_power_show(struct device *dev, struct device_attribute *attr, @@ -478,7 +478,7 @@ int hl_sysfs_init(struct hl_device *hdev) return rc; } - if (!hdev->allow_inference_soft_reset) + if (!hdev->asic_prop.allow_inference_soft_reset) return 0; rc = device_add_groups(hdev->dev, hl_dev_inference_attr_groups); @@ -495,7 +495,7 @@ void hl_sysfs_fini(struct hl_device *hdev) { device_remove_groups(hdev->dev, hl_dev_attr_groups); - if (!hdev->allow_inference_soft_reset) + if (!hdev->asic_prop.allow_inference_soft_reset) return; device_remove_groups(hdev->dev, hl_dev_inference_attr_groups); diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c index b3431eac4f04..013c6da2e3ca 100644 --- a/drivers/misc/habanalabs/gaudi/gaudi.c +++ b/drivers/misc/habanalabs/gaudi/gaudi.c @@ -4325,7 +4325,7 @@ static void gaudi_hw_fini(struct hl_device *hdev, bool hard_reset, bool fw_reset * In case watchdog hasn't expired but we still got HB, then this won't do any * damage. 
*/ - if (hdev->curr_reset_cause == HL_RESET_CAUSE_HEARTBEAT) { + if (hdev->reset_info.curr_reset_cause == HL_RESET_CAUSE_HEARTBEAT) { if (hdev->asic_prop.hard_reset_done_by_fw) hl_fw_ask_hard_reset_without_linux(hdev); else @@ -6564,7 +6564,7 @@ static u64 gaudi_read_pte(struct hl_device *hdev, u64 addr) { struct gaudi_device *gaudi = hdev->asic_specific; - if (hdev->hard_reset_pending) + if (hdev->reset_info.hard_reset_pending) return U64_MAX; return readq(hdev->pcie_bar[HBM_BAR_ID] + @@ -6575,7 +6575,7 @@ static void gaudi_write_pte(struct hl_device *hdev, u64 addr, u64 val) { struct gaudi_device *gaudi = hdev->asic_specific; - if (hdev->hard_reset_pending) + if (hdev->reset_info.hard_reset_pending) return; writeq(val, hdev->pcie_bar[HBM_BAR_ID] + @@ -8341,7 +8341,7 @@ static int gaudi_mmu_invalidate_cache(struct hl_device *hdev, bool is_hard, int rc; if (!(gaudi->hw_cap_initialized & HW_CAP_MMU) || - hdev->hard_reset_pending) + hdev->reset_info.hard_reset_pending) return 0; if (hdev->pldm) diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c index f4473013f1ee..fbcc7bbf44b3 100644 --- a/drivers/misc/habanalabs/goya/goya.c +++ b/drivers/misc/habanalabs/goya/goya.c @@ -1033,8 +1033,8 @@ static int goya_sw_init(struct hl_device *hdev) spin_lock_init(&goya->hw_queues_lock); hdev->supports_coresight = true; - hdev->supports_soft_reset = true; - hdev->allow_inference_soft_reset = true; + hdev->asic_prop.supports_soft_reset = true; + hdev->asic_prop.allow_inference_soft_reset = true; hdev->supports_wait_for_multi_cs = false; hdev->asic_funcs->set_pci_memory_regions(hdev); @@ -4477,7 +4477,7 @@ static u64 goya_read_pte(struct hl_device *hdev, u64 addr) { struct goya_device *goya = hdev->asic_specific; - if (hdev->hard_reset_pending) + if (hdev->reset_info.hard_reset_pending) return U64_MAX; return readq(hdev->pcie_bar[DDR_BAR_ID] + @@ -4488,7 +4488,7 @@ static void goya_write_pte(struct hl_device *hdev, u64 addr, u64 val) { struct 
goya_device *goya = hdev->asic_specific; - if (hdev->hard_reset_pending) + if (hdev->reset_info.hard_reset_pending) return; writeq(val, hdev->pcie_bar[DDR_BAR_ID] + @@ -5308,7 +5308,7 @@ static int goya_mmu_invalidate_cache(struct hl_device *hdev, bool is_hard, int rc; if (!(goya->hw_cap_initialized & HW_CAP_MMU) || - hdev->hard_reset_pending) + hdev->reset_info.hard_reset_pending) return 0; /* no need in L1 only invalidation in Goya */ -- cgit v1.2.3 From 42eb2872e0867679c996bb19ee9063e6141fa974 Mon Sep 17 00:00:00 2001 From: Ofir Bitton Date: Tue, 23 Nov 2021 15:15:22 +0200 Subject: habanalabs: add a lock to protect multiple reset variables Atomic operations during reset are replaced by a spinlock in order to have the ability to protect more than a single variable. Signed-off-by: Ofir Bitton Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/common/command_buffer.c | 3 +- drivers/misc/habanalabs/common/debugfs.c | 16 ++++---- drivers/misc/habanalabs/common/device.c | 50 ++++++++++++++++++------- drivers/misc/habanalabs/common/habanalabs.h | 6 ++- 4 files changed, 49 insertions(+), 26 deletions(-) (limited to 'drivers/misc') diff --git a/drivers/misc/habanalabs/common/command_buffer.c b/drivers/misc/habanalabs/common/command_buffer.c index 649380bb189f..3c0ae07a2d80 100644 --- a/drivers/misc/habanalabs/common/command_buffer.c +++ b/drivers/misc/habanalabs/common/command_buffer.c @@ -250,8 +250,7 @@ int hl_cb_create(struct hl_device *hdev, struct hl_cb_mgr *mgr, * Can't use generic function to check this because of special case * where we create a CB as part of the reset process */ - if ((hdev->disabled) || ((atomic_read(&hdev->reset_info.in_reset)) && - (ctx_id != HL_KERNEL_ASID_ID))) { + if ((hdev->disabled) || (hdev->reset_info.in_reset && (ctx_id != HL_KERNEL_ASID_ID))) { dev_warn_ratelimited(hdev->dev, "Device is disabled or in reset. 
Can't create new CBs\n"); rc = -EBUSY; diff --git a/drivers/misc/habanalabs/common/debugfs.c b/drivers/misc/habanalabs/common/debugfs.c index 746d1a18de63..fc084ee5106e 100644 --- a/drivers/misc/habanalabs/common/debugfs.c +++ b/drivers/misc/habanalabs/common/debugfs.c @@ -527,7 +527,7 @@ static int engines_show(struct seq_file *s, void *data) struct hl_dbg_device_entry *dev_entry = entry->dev_entry; struct hl_device *hdev = dev_entry->hdev; - if (atomic_read(&hdev->reset_info.in_reset)) { + if (hdev->reset_info.in_reset) { dev_warn_ratelimited(hdev->dev, "Can't check device idle during reset\n"); return 0; @@ -658,7 +658,7 @@ static ssize_t hl_data_read32(struct file *f, char __user *buf, ssize_t rc; u32 val; - if (atomic_read(&hdev->reset_info.in_reset)) { + if (hdev->reset_info.in_reset) { dev_warn_ratelimited(hdev->dev, "Can't read during reset\n"); return 0; } @@ -694,7 +694,7 @@ static ssize_t hl_data_write32(struct file *f, const char __user *buf, u32 value; ssize_t rc; - if (atomic_read(&hdev->reset_info.in_reset)) { + if (hdev->reset_info.in_reset) { dev_warn_ratelimited(hdev->dev, "Can't write during reset\n"); return 0; } @@ -731,7 +731,7 @@ static ssize_t hl_data_read64(struct file *f, char __user *buf, ssize_t rc; u64 val; - if (atomic_read(&hdev->reset_info.in_reset)) { + if (hdev->reset_info.in_reset) { dev_warn_ratelimited(hdev->dev, "Can't read during reset\n"); return 0; } @@ -767,7 +767,7 @@ static ssize_t hl_data_write64(struct file *f, const char __user *buf, u64 value; ssize_t rc; - if (atomic_read(&hdev->reset_info.in_reset)) { + if (hdev->reset_info.in_reset) { dev_warn_ratelimited(hdev->dev, "Can't write during reset\n"); return 0; } @@ -802,7 +802,7 @@ static ssize_t hl_dma_size_write(struct file *f, const char __user *buf, ssize_t rc; u32 size; - if (atomic_read(&hdev->reset_info.in_reset)) { + if (hdev->reset_info.in_reset) { dev_warn_ratelimited(hdev->dev, "Can't DMA during reset\n"); return 0; } @@ -1077,7 +1077,7 @@ static ssize_t 
hl_clk_gate_write(struct file *f, const char __user *buf, u64 value; ssize_t rc; - if (atomic_read(&hdev->reset_info.in_reset)) { + if (hdev->reset_info.in_reset) { dev_warn_ratelimited(hdev->dev, "Can't change clock gating during reset\n"); return 0; @@ -1119,7 +1119,7 @@ static ssize_t hl_stop_on_err_write(struct file *f, const char __user *buf, u32 value; ssize_t rc; - if (atomic_read(&hdev->reset_info.in_reset)) { + if (hdev->reset_info.in_reset) { dev_warn_ratelimited(hdev->dev, "Can't change stop on error during reset\n"); return 0; diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c index f8f9eb7a934f..84621ad765bc 100644 --- a/drivers/misc/habanalabs/common/device.c +++ b/drivers/misc/habanalabs/common/device.c @@ -17,7 +17,7 @@ enum hl_device_status hl_device_status(struct hl_device *hdev) { enum hl_device_status status; - if (atomic_read(&hdev->reset_info.in_reset)) + if (hdev->reset_info.in_reset) status = HL_DEVICE_STATUS_IN_RESET; else if (hdev->reset_info.needs_reset) status = HL_DEVICE_STATUS_NEEDS_RESET; @@ -448,11 +448,11 @@ static int device_early_init(struct hl_device *hdev) mutex_init(&hdev->debug_lock); INIT_LIST_HEAD(&hdev->cs_mirror_list); spin_lock_init(&hdev->cs_mirror_lock); + spin_lock_init(&hdev->reset_info.lock); INIT_LIST_HEAD(&hdev->fpriv_list); INIT_LIST_HEAD(&hdev->fpriv_ctrl_list); mutex_init(&hdev->fpriv_list_lock); mutex_init(&hdev->fpriv_ctrl_list_lock); - atomic_set(&hdev->reset_info.in_reset, 0); mutex_init(&hdev->clk_throttling.lock); return 0; @@ -544,7 +544,7 @@ reschedule: * status for at least one heartbeat. From this point driver restarts * tracking future consecutive fatal errors. 
*/ - if (!(atomic_read(&hdev->reset_info.in_reset))) + if (!hdev->reset_info.in_reset) hdev->reset_info.prev_reset_trigger = HL_RESET_TRIGGER_DEFAULT; schedule_delayed_work(&hdev->work_heartbeat, @@ -722,11 +722,14 @@ int hl_device_suspend(struct hl_device *hdev) pci_save_state(hdev->pdev); /* Block future CS/VM/JOB completion operations */ - rc = atomic_cmpxchg(&hdev->reset_info.in_reset, 0, 1); - if (rc) { + spin_lock(&hdev->reset_info.lock); + if (hdev->reset_info.in_reset) { + spin_unlock(&hdev->reset_info.lock); dev_err(hdev->dev, "Can't suspend while in reset\n"); return -EIO; } + hdev->reset_info.in_reset = 1; + spin_unlock(&hdev->reset_info.lock); /* This blocks all other stuff that is not blocked by in_reset */ hdev->disabled = true; @@ -776,8 +779,10 @@ int hl_device_resume(struct hl_device *hdev) } - hdev->disabled = false; - atomic_set(&hdev->reset_info.in_reset, 0); + /* 'in_reset' was set to true during suspend, now we must clear it in order + * for hard reset to be performed + */ + hdev->reset_info.in_reset = 0; rc = hl_device_reset(hdev, HL_DRV_RESET_HARD); if (rc) { @@ -1024,9 +1029,13 @@ do_reset: */ if (!from_hard_reset_thread) { /* Block future CS/VM/JOB completion operations */ - rc = atomic_cmpxchg(&hdev->reset_info.in_reset, 0, 1); - if (rc) + spin_lock(&hdev->reset_info.lock); + if (hdev->reset_info.in_reset) { + spin_unlock(&hdev->reset_info.lock); return 0; + } + hdev->reset_info.in_reset = 1; + spin_unlock(&hdev->reset_info.lock); handle_reset_trigger(hdev, flags); @@ -1234,7 +1243,7 @@ kill_processes: } } - atomic_set(&hdev->reset_info.in_reset, 0); + hdev->reset_info.in_reset = 0; hdev->reset_info.needs_reset = false; dev_notice(hdev->dev, "Successfully finished resetting the device\n"); @@ -1272,7 +1281,7 @@ out_err: goto again; } - atomic_set(&hdev->reset_info.in_reset, 0); + hdev->reset_info.in_reset = 0; return rc; } @@ -1583,6 +1592,7 @@ out_disabled: */ void hl_device_fini(struct hl_device *hdev) { + bool device_in_reset; ktime_t 
timeout; u64 reset_sec; int i, rc; @@ -1606,10 +1616,22 @@ void hl_device_fini(struct hl_device *hdev) */ timeout = ktime_add_us(ktime_get(), reset_sec * 1000 * 1000); - rc = atomic_cmpxchg(&hdev->reset_info.in_reset, 0, 1); - while (rc) { + + spin_lock(&hdev->reset_info.lock); + device_in_reset = !!hdev->reset_info.in_reset; + if (!device_in_reset) + hdev->reset_info.in_reset = 1; + spin_unlock(&hdev->reset_info.lock); + + while (device_in_reset) { usleep_range(50, 200); - rc = atomic_cmpxchg(&hdev->reset_info.in_reset, 0, 1); + + spin_lock(&hdev->reset_info.lock); + device_in_reset = !!hdev->reset_info.in_reset; + if (!device_in_reset) + hdev->reset_info.in_reset = 1; + spin_unlock(&hdev->reset_info.lock); + if (ktime_compare(ktime_get(), timeout) > 0) { dev_crit(hdev->dev, "Failed to remove device because reset function did not finish\n"); diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h index 47eaeff9e924..37a3a469b42f 100644 --- a/drivers/misc/habanalabs/common/habanalabs.h +++ b/drivers/misc/habanalabs/common/habanalabs.h @@ -2457,9 +2457,10 @@ struct last_error_session_info { /** * struct hl_reset_info - holds current device reset information. - * @in_reset: is device in reset flow. + * @lock: lock to protect critical reset flows. * @soft_reset_cnt: number of soft reset since the driver was loaded. * @hard_reset_cnt: number of hard reset since the driver was loaded. + * @in_reset: is device in reset flow. * @is_in_soft_reset: Device is currently in soft reset process. * @needs_reset: true if reset_on_lockup is false and device should be reset * due to lockup. @@ -2474,9 +2475,10 @@ struct last_error_session_info { * complete instead. 
*/ struct hl_reset_info { - atomic_t in_reset; + spinlock_t lock; u32 soft_reset_cnt; u32 hard_reset_cnt; + u8 in_reset; u8 is_in_soft_reset; u8 needs_reset; u8 hard_reset_pending; -- cgit v1.2.3 From ce80098db2439ee44403ec6fccd3a10be21c7aff Mon Sep 17 00:00:00 2001 From: Ofir Bitton Date: Tue, 23 Nov 2021 16:34:28 +0200 Subject: habanalabs: support hard-reset scheduling during soft-reset As hard-reset can be requested during soft-reset, driver must allow it or else critical events received during soft-reset will be ignored. Signed-off-by: Ofir Bitton Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/common/device.c | 31 ++++++++++++++++++++++++++--- drivers/misc/habanalabs/common/habanalabs.h | 3 +++ 2 files changed, 31 insertions(+), 3 deletions(-) (limited to 'drivers/misc') diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c index 84621ad765bc..733338ab6f1d 100644 --- a/drivers/misc/habanalabs/common/device.c +++ b/drivers/misc/habanalabs/common/device.c @@ -978,7 +978,7 @@ static void handle_reset_trigger(struct hl_device *hdev, u32 flags) int hl_device_reset(struct hl_device *hdev, u32 flags) { bool hard_reset, from_hard_reset_thread, fw_reset, hard_instead_soft = false, - reset_upon_device_release = false; + reset_upon_device_release = false, schedule_hard_reset = false; u64 idle_mask[HL_BUSY_ENGINES_MASK_EXT_SIZE] = {0}; struct hl_ctx *ctx; int i, rc; @@ -1031,6 +1031,9 @@ do_reset: /* Block future CS/VM/JOB completion operations */ spin_lock(&hdev->reset_info.lock); if (hdev->reset_info.in_reset) { + /* We only allow scheduling of a hard reset during soft reset */ + if (hard_reset && hdev->reset_info.is_in_soft_reset) + hdev->reset_info.hard_reset_schedule_flags = flags; spin_unlock(&hdev->reset_info.lock); return 0; } @@ -1193,7 +1196,6 @@ kill_processes: * is required for the initialization itself */ hdev->disabled = false; - hdev->reset_info.is_in_soft_reset = false; rc = 
hdev->asic_funcs->hw_init(hdev); if (rc) { @@ -1243,7 +1245,20 @@ kill_processes: } } - hdev->reset_info.in_reset = 0; + spin_lock(&hdev->reset_info.lock); + hdev->reset_info.is_in_soft_reset = false; + + /* Schedule hard reset only if requested and if not already in hard reset. + * We keep 'in_reset' enabled, so no other reset can go in during the hard + * reset schedule + */ + if (!hard_reset && hdev->reset_info.hard_reset_schedule_flags) + schedule_hard_reset = true; + else + hdev->reset_info.in_reset = 0; + + spin_unlock(&hdev->reset_info.lock); + hdev->reset_info.needs_reset = false; dev_notice(hdev->dev, "Successfully finished resetting the device\n"); @@ -1261,6 +1276,16 @@ kill_processes: hdev->reset_info.soft_reset_cnt++; } + if (schedule_hard_reset) { + dev_info(hdev->dev, "Performing hard reset scheduled during soft reset\n"); + flags = hdev->reset_info.hard_reset_schedule_flags; + hdev->reset_info.hard_reset_schedule_flags = 0; + hdev->disabled = true; + hard_reset = true; + handle_reset_trigger(hdev, flags); + goto again; + } + return 0; out_err: diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h index 37a3a469b42f..cb710fd478b6 100644 --- a/drivers/misc/habanalabs/common/habanalabs.h +++ b/drivers/misc/habanalabs/common/habanalabs.h @@ -2460,6 +2460,8 @@ struct last_error_session_info { * @lock: lock to protect critical reset flows. * @soft_reset_cnt: number of soft reset since the driver was loaded. * @hard_reset_cnt: number of hard reset since the driver was loaded. + * @hard_reset_schedule_flags: hard reset is scheduled to after current soft reset, + * here we hold the hard reset flags. * @in_reset: is device in reset flow. * @is_in_soft_reset: Device is currently in soft reset process. 
* @needs_reset: true if reset_on_lockup is false and device should be reset @@ -2478,6 +2480,7 @@ struct hl_reset_info { spinlock_t lock; u32 soft_reset_cnt; u32 hard_reset_cnt; + u32 hard_reset_schedule_flags; u8 in_reset; u8 is_in_soft_reset; u8 needs_reset; -- cgit v1.2.3 From 38be5687da839bfcafeabb34a5a21e8396613ce3 Mon Sep 17 00:00:00 2001 From: Alexander Usyskin Date: Thu, 23 Dec 2021 11:47:04 +0200 Subject: mei: add POWERING_DOWN into device state print The POWERING_DOWN state string was missing from the device states list, add it. Signed-off-by: Alexander Usyskin Signed-off-by: Tomas Winkler Link: https://lore.kernel.org/r/20211223094705.204624-1-tomas.winkler@intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/misc/mei/init.c | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers/misc') diff --git a/drivers/misc/mei/init.c b/drivers/misc/mei/init.c index 5c8cb679b997..f79076c67256 100644 --- a/drivers/misc/mei/init.c +++ b/drivers/misc/mei/init.c @@ -24,6 +24,7 @@ const char *mei_dev_state_str(int state) MEI_DEV_STATE(ENABLED); MEI_DEV_STATE(RESETTING); MEI_DEV_STATE(DISABLED); + MEI_DEV_STATE(POWERING_DOWN); MEI_DEV_STATE(POWER_DOWN); MEI_DEV_STATE(POWER_UP); default: -- cgit v1.2.3 From 43aa323e315bec40779fe2899f7b531773d7b733 Mon Sep 17 00:00:00 2001 From: Alexander Usyskin Date: Thu, 23 Dec 2021 11:47:05 +0200 Subject: mei: cleanup status before client dma setup call The upper layer may retry call to mei_cl_dma_alloc_and_map(), in that case the client status may be non-zero after the previous call and the wait condition will be true immediately. Set cl->status to zero to allow waiting for an actual result from the firmware. 
Signed-off-by: Alexander Usyskin Signed-off-by: Tomas Winkler Link: https://lore.kernel.org/r/20211223094705.204624-2-tomas.winkler@intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/misc/mei/client.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'drivers/misc') diff --git a/drivers/misc/mei/client.c b/drivers/misc/mei/client.c index 96f4e59c32a5..22be86a205bf 100644 --- a/drivers/misc/mei/client.c +++ b/drivers/misc/mei/client.c @@ -2327,6 +2327,8 @@ int mei_cl_dma_alloc_and_map(struct mei_cl *cl, const struct file *fp, list_move_tail(&cb->list, &dev->ctrl_rd_list); } + cl->status = 0; + mutex_unlock(&dev->device_lock); wait_event_timeout(cl->wait, cl->dma_mapped || cl->status, @@ -2404,6 +2406,8 @@ int mei_cl_dma_unmap(struct mei_cl *cl, const struct file *fp) list_move_tail(&cb->list, &dev->ctrl_rd_list); } + cl->status = 0; + mutex_unlock(&dev->device_lock); wait_event_timeout(cl->wait, !cl->dma_mapped || cl->status, -- cgit v1.2.3 From 63064451d0b8359999e7e8c4fd92951d96f5a057 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Tue, 28 Dec 2021 14:13:50 +0100 Subject: cxl: use default_groups in kobj_type There are currently 2 ways to create a set of sysfs files for a kobj_type, through the default_attrs field, and the default_groups field. Move the cxl code to use default_groups field which has been the preferred way since aa30f47cf666 ("kobject: Add support for default attribute groups to kobj_type") so that we can soon get rid of the obsolete default_attrs field. 
Cc: Frederic Barrat Cc: Andrew Donnellan Cc: Arnd Bergmann Link: https://lore.kernel.org/r/20211228131350.249532-1-gregkh@linuxfoundation.org Signed-off-by: Greg Kroah-Hartman --- drivers/misc/cxl/sysfs.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'drivers/misc') diff --git a/drivers/misc/cxl/sysfs.c b/drivers/misc/cxl/sysfs.c index c173a5e88c91..315c43f17dd3 100644 --- a/drivers/misc/cxl/sysfs.c +++ b/drivers/misc/cxl/sysfs.c @@ -570,6 +570,7 @@ static struct attribute *afu_cr_attrs[] = { &class_attribute.attr, NULL, }; +ATTRIBUTE_GROUPS(afu_cr); static void release_afu_config_record(struct kobject *kobj) { @@ -581,7 +582,7 @@ static void release_afu_config_record(struct kobject *kobj) static struct kobj_type afu_config_record_type = { .sysfs_ops = &kobj_sysfs_ops, .release = release_afu_config_record, - .default_attrs = afu_cr_attrs, + .default_groups = afu_cr_groups, }; static struct afu_config_record *cxl_sysfs_afu_new_cr(struct cxl_afu *afu, int cr_idx) -- cgit v1.2.3 From fcee5ce50bdb21116711e38635e3865594af907e Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Tue, 28 Dec 2021 12:55:22 +0000 Subject: misc: lattice-ecp3-config: Fix task hung when firmware load failed When the firmware load fails, the kernel reports a hung task as follows: INFO: task xrun:5191 blocked for more than 147 seconds. Tainted: G W 5.16.0-rc5-next-20211220+ #11 "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. task:xrun state:D stack: 0 pid: 5191 ppid: 270 flags:0x00000004 Call Trace: __schedule+0xc12/0x4b50 kernel/sched/core.c:4986 schedule+0xd7/0x260 kernel/sched/core.c:6369 (discriminator 1) schedule_timeout+0x7aa/0xa80 kernel/time/timer.c:1857 wait_for_completion+0x181/0x290 kernel/sched/completion.c:85 lattice_ecp3_remove+0x32/0x40 drivers/misc/lattice-ecp3-config.c:221 spi_remove+0x72/0xb0 drivers/spi/spi.c:409 lattice_ecp3_remove() waits for a signal from firmware loading, but when the load fails, firmware_load() does not send this signal.
This causes device removal to hang. Fix it by sending the signal even if the load failed. Fixes: 781551df57c7 ("misc: Add Lattice ECP3 FPGA configuration via SPI") Reported-by: Hulk Robot Signed-off-by: Wei Yongjun Link: https://lore.kernel.org/r/20211228125522.3122284-1-weiyongjun1@huawei.com Signed-off-by: Greg Kroah-Hartman --- drivers/misc/lattice-ecp3-config.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'drivers/misc') diff --git a/drivers/misc/lattice-ecp3-config.c b/drivers/misc/lattice-ecp3-config.c index 0f54730c7ed5..98828030b5a4 100644 --- a/drivers/misc/lattice-ecp3-config.c +++ b/drivers/misc/lattice-ecp3-config.c @@ -76,12 +76,12 @@ static void firmware_load(const struct firmware *fw, void *context) if (fw == NULL) { dev_err(&spi->dev, "Cannot load firmware, aborting\n"); - return; + goto out; } if (fw->size == 0) { dev_err(&spi->dev, "Error: Firmware size is 0!\n"); - return; + goto out; } /* Fill dummy data (24 stuffing bits for commands) */ @@ -103,7 +103,7 @@ static void firmware_load(const struct firmware *fw, void *context) dev_err(&spi->dev, "Error: No supported FPGA detected (JEDEC_ID=%08x)!\n", jedec_id); - return; + goto out; } dev_info(&spi->dev, "FPGA %s detected\n", ecp3_dev[i].name); @@ -116,7 +116,7 @@ static void firmware_load(const struct firmware *fw, void *context) buffer = kzalloc(fw->size + 8, GFP_KERNEL); if (!buffer) { dev_err(&spi->dev, "Error: Can't allocate memory!\n"); - return; + goto out; } /* @@ -155,7 +155,7 @@ static void firmware_load(const struct firmware *fw, void *context) "Error: Timeout waiting for FPGA to clear (status=%08x)!\n", status); kfree(buffer); - return; + goto out; } dev_info(&spi->dev, "Configuring the FPGA...\n"); @@ -181,7 +181,7 @@ static void firmware_load(const struct firmware *fw, void *context) release_firmware(fw); kfree(buffer); - +out: complete(&data->fw_loaded); } -- cgit v1.2.3 From 6b0b80ac103b2a40c72a47c301745fd1f4ef4697 Mon Sep 17 00:00:00 2001 From: Alexander
Usyskin Date: Tue, 28 Dec 2021 10:20:47 +0200 Subject: mei: hbm: fix client dma reply status Don't blindly copy the status value received from the firmware into the internal client status field. It may be positive, and ERR_PTR(ret) will then translate it into an invalid address and the caller will crash. Put the error code into the client status on failure. Fixes: 369aea845951 ("mei: implement client dma setup.") Cc: # v5.11+ Reported-by: Emmanuel Grumbach Tested-by: Emmanuel Grumbach Acked-by: Tomas Winkler Signed-off-by: Alexander Usyskin Signed-off-by: Tomas Winkler Link: https://lore.kernel.org/r/20211228082047.378115-1-tomas.winkler@intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/misc/mei/hbm.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) (limited to 'drivers/misc') diff --git a/drivers/misc/mei/hbm.c b/drivers/misc/mei/hbm.c index be41843df75b..cebcca6d6d3e 100644 --- a/drivers/misc/mei/hbm.c +++ b/drivers/misc/mei/hbm.c @@ -672,10 +672,14 @@ static void mei_hbm_cl_dma_map_res(struct mei_device *dev, if (!cl) return; - dev_dbg(dev->dev, "cl dma map result = %d\n", res->status); - cl->status = res->status; - if (!cl->status) + if (res->status) { + dev_err(dev->dev, "cl dma map failed %d\n", res->status); + cl->status = -EFAULT; + } else { + dev_dbg(dev->dev, "cl dma map succeeded\n"); cl->dma_mapped = 1; + cl->status = 0; + } wake_up(&cl->wait); } @@ -698,10 +702,14 @@ static void mei_hbm_cl_dma_unmap_res(struct mei_device *dev, if (!cl) return; - dev_dbg(dev->dev, "cl dma unmap result = %d\n", res->status); - cl->status = res->status; - if (!cl->status) + if (res->status) { + dev_err(dev->dev, "cl dma unmap failed %d\n", res->status); + cl->status = -EFAULT; + } else { + dev_dbg(dev->dev, "cl dma unmap succeeded\n"); cl->dma_mapped = 0; + cl->status = 0; + } wake_up(&cl->wait); } -- cgit v1.2.3