summaryrefslogtreecommitdiff
path: root/drivers/net/ethernet/mellanox/mlx4/en_tx.c
diff options
context:
space:
mode:
authorMoshe Shemesh <moshe@mellanox.com>2020-12-09 15:03:39 +0200
committerDavid S. Miller <davem@davemloft.net>2020-12-09 16:44:35 -0800
commitba603d9d7b1215c72513d7c7aa02b6775fd4891b (patch)
tree38a412511fb4bde057c1bd7ecd70177abfc09cb6 /drivers/net/ethernet/mellanox/mlx4/en_tx.c
parentfed91613c9dd455dd154b22fa8e11b8526466082 (diff)
net/mlx4_en: Handle TX error CQE
In case error CQE was found while polling TX CQ, the QP is in error state and all posted WQEs will generate error CQEs without any data transmitted. Fix it by reopening the channels, via same method used for TX timeout handling. In addition add some more info on error CQE and WQE for debug. Fixes: bd2f631d7c60 ("net/mlx4_en: Notify user when TX ring in error state") Signed-off-by: Moshe Shemesh <moshe@mellanox.com> Signed-off-by: Tariq Toukan <tariqt@nvidia.com> Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'drivers/net/ethernet/mellanox/mlx4/en_tx.c')
-rw-r--r--drivers/net/ethernet/mellanox/mlx4/en_tx.c40
1 files changed, 33 insertions, 7 deletions
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_tx.c b/drivers/net/ethernet/mellanox/mlx4/en_tx.c
index 3ddb7268e415..59b097cda327 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_tx.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_tx.c
@@ -392,6 +392,35 @@ int mlx4_en_free_tx_buf(struct net_device *dev, struct mlx4_en_tx_ring *ring)
return cnt;
}
+static void mlx4_en_handle_err_cqe(struct mlx4_en_priv *priv, struct mlx4_err_cqe *err_cqe,
+ u16 cqe_index, struct mlx4_en_tx_ring *ring)
+{
+ struct mlx4_en_dev *mdev = priv->mdev;
+ struct mlx4_en_tx_info *tx_info;
+ struct mlx4_en_tx_desc *tx_desc;
+ u16 wqe_index;
+ int desc_size;
+
+ en_err(priv, "CQE error - cqn 0x%x, ci 0x%x, vendor syndrome: 0x%x syndrome: 0x%x\n",
+ ring->sp_cqn, cqe_index, err_cqe->vendor_err_syndrome, err_cqe->syndrome);
+ print_hex_dump(KERN_WARNING, "", DUMP_PREFIX_OFFSET, 16, 1, err_cqe, sizeof(*err_cqe),
+ false);
+
+ wqe_index = be16_to_cpu(err_cqe->wqe_index) & ring->size_mask;
+ tx_info = &ring->tx_info[wqe_index];
+ desc_size = tx_info->nr_txbb << LOG_TXBB_SIZE;
+ en_err(priv, "Related WQE - qpn 0x%x, wqe index 0x%x, wqe size 0x%x\n", ring->qpn,
+ wqe_index, desc_size);
+ tx_desc = ring->buf + (wqe_index << LOG_TXBB_SIZE);
+ print_hex_dump(KERN_WARNING, "", DUMP_PREFIX_OFFSET, 16, 1, tx_desc, desc_size, false);
+
+ if (test_and_set_bit(MLX4_EN_STATE_FLAG_RESTARTING, &priv->state))
+ return;
+
+ en_err(priv, "Scheduling port restart\n");
+ queue_work(mdev->workqueue, &priv->restart_task);
+}
+
int mlx4_en_process_tx_cq(struct net_device *dev,
struct mlx4_en_cq *cq, int napi_budget)
{
@@ -438,13 +467,10 @@ int mlx4_en_process_tx_cq(struct net_device *dev,
dma_rmb();
if (unlikely((cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) ==
- MLX4_CQE_OPCODE_ERROR)) {
- struct mlx4_err_cqe *cqe_err = (struct mlx4_err_cqe *)cqe;
-
- en_err(priv, "CQE error - vendor syndrome: 0x%x syndrome: 0x%x\n",
- cqe_err->vendor_err_syndrome,
- cqe_err->syndrome);
- }
+ MLX4_CQE_OPCODE_ERROR))
+ if (!test_and_set_bit(MLX4_EN_TX_RING_STATE_RECOVERING, &ring->state))
+ mlx4_en_handle_err_cqe(priv, (struct mlx4_err_cqe *)cqe, index,
+ ring);
/* Skip over last polled CQE */
new_index = be16_to_cpu(cqe->wqe_index) & size_mask;