summaryrefslogtreecommitdiff
path: root/drivers/pci
diff options
context:
space:
mode:
authorLukas Wunner <lukas@wunner.de>2023-01-20 10:19:02 +0100
committerBjorn Helgaas <bhelgaas@google.com>2023-02-15 15:01:01 -0600
commit74ff8864cc842be994853095dba6db48e716400a (patch)
treeff56f04498915fd6b296bf175f3d5f01ff2cc0e1 /drivers/pci
parent82b34b0800af8c9fc9988c290cdc813e0ca0df31 (diff)
PCI: hotplug: Allow marking devices as disconnected during bind/unbind
On surprise removal, pciehp_unconfigure_device() and acpiphp's trim_stale_devices() call pci_dev_set_disconnected() to mark removed devices as permanently offline. Thereby, the PCI core and drivers know to skip device accesses. However pci_dev_set_disconnected() takes the device_lock and thus waits for a concurrent driver bind or unbind to complete. As a result, the driver's ->probe and ->remove hooks have no chance to learn that the device is gone. That doesn't make any sense, so drop the device_lock and instead use atomic xchg() and cmpxchg() operations to update the device state. As a byproduct, an AB-BA deadlock reported by Anatoli is fixed which occurs on surprise removal with AER concurrently performing a bus reset. AER bus reset: INFO: task irq/26-aerdrv:95 blocked for more than 120 seconds. Tainted: G W 6.2.0-rc3-custom-norework-jan11+ schedule rwsem_down_write_slowpath down_write_nested pciehp_reset_slot # acquires reset_lock pci_reset_hotplug_slot pci_slot_reset # acquires device_lock pci_bus_error_reset aer_root_reset pcie_do_recovery aer_process_err_devices aer_isr pciehp surprise removal: INFO: task irq/26-pciehp:96 blocked for more than 120 seconds. Tainted: G W 6.2.0-rc3-custom-norework-jan11+ schedule_preempt_disabled __mutex_lock mutex_lock_nested pci_dev_set_disconnected # acquires device_lock pci_walk_bus pciehp_unconfigure_device pciehp_disable_slot pciehp_handle_presence_or_link_change pciehp_ist # acquires reset_lock Link: https://bugzilla.kernel.org/show_bug.cgi?id=215590 Fixes: a6bd101b8f84 ("PCI: Unify device inaccessible") Link: https://lore.kernel.org/r/3dc88ea82bdc0e37d9000e413d5ebce481cbd629.1674205689.git.lukas@wunner.de Reported-by: Anatoli Antonovitch <anatoli.antonovitch@amd.com> Signed-off-by: Lukas Wunner <lukas@wunner.de> Signed-off-by: Bjorn Helgaas <bhelgaas@google.com> Cc: stable@vger.kernel.org # v4.20+ Cc: Keith Busch <kbusch@kernel.org>
Diffstat (limited to 'drivers/pci')
-rw-r--r--drivers/pci/pci.h43
1 files changed, 13 insertions, 30 deletions
diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index 9ed3b5550043..5d5a44aaafe8 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -310,53 +310,36 @@ struct pci_sriov {
* @dev: PCI device to set new error_state
* @new: the state we want dev to be in
*
- * Must be called with device_lock held.
+ * If the device is experiencing perm_failure, it has to remain in that state.
+ * Any other transition is allowed.
*
* Returns true if state has been changed to the requested state.
*/
static inline bool pci_dev_set_io_state(struct pci_dev *dev,
pci_channel_state_t new)
{
- bool changed = false;
+ pci_channel_state_t old;
- device_lock_assert(&dev->dev);
switch (new) {
case pci_channel_io_perm_failure:
- switch (dev->error_state) {
- case pci_channel_io_frozen:
- case pci_channel_io_normal:
- case pci_channel_io_perm_failure:
- changed = true;
- break;
- }
- break;
+ xchg(&dev->error_state, pci_channel_io_perm_failure);
+ return true;
case pci_channel_io_frozen:
- switch (dev->error_state) {
- case pci_channel_io_frozen:
- case pci_channel_io_normal:
- changed = true;
- break;
- }
- break;
+ old = cmpxchg(&dev->error_state, pci_channel_io_normal,
+ pci_channel_io_frozen);
+ return old != pci_channel_io_perm_failure;
case pci_channel_io_normal:
- switch (dev->error_state) {
- case pci_channel_io_frozen:
- case pci_channel_io_normal:
- changed = true;
- break;
- }
- break;
+ old = cmpxchg(&dev->error_state, pci_channel_io_frozen,
+ pci_channel_io_normal);
+ return old != pci_channel_io_perm_failure;
+ default:
+ return false;
}
- if (changed)
- dev->error_state = new;
- return changed;
}
static inline int pci_dev_set_disconnected(struct pci_dev *dev, void *unused)
{
- device_lock(&dev->dev);
pci_dev_set_io_state(dev, pci_channel_io_perm_failure);
- device_unlock(&dev->dev);
return 0;
}