105 files changed, 2698 insertions, 1260 deletions
diff --git a/Documentation/core-api/dma-api.rst b/Documentation/core-api/dma-api.rst
index 8e3cce3d0a23..2ad08517e626 100644
--- a/Documentation/core-api/dma-api.rst
+++ b/Documentation/core-api/dma-api.rst
@@ -530,6 +530,77 @@ routines, e.g.:::
 	....
 	}
 
+Part Ie - IOVA-based DMA mappings
+---------------------------------
+
+These APIs allow a very efficient mapping when using an IOMMU. They are an
+optional path that requires extra code and are only recommended for drivers
+where DMA mapping performance, or the space usage for storing the DMA
+addresses, matters. All the considerations from the previous section apply
+here as well.
+
+::
+
+	bool dma_iova_try_alloc(struct device *dev, struct dma_iova_state *state,
+			phys_addr_t phys, size_t size);
+
+Is used to try to allocate IOVA space for a mapping operation. If it returns
+false this API can't be used for the given device and the normal streaming
+DMA mapping API should be used. The ``struct dma_iova_state`` is allocated
+by the driver and must be kept around until unmap time.
+
+::
+
+	static inline bool dma_use_iova(struct dma_iova_state *state)
+
+Can be used by the driver to check if the IOVA-based API is used after a
+call to dma_iova_try_alloc. This can be useful in the unmap path.
+
+::
+
+	int dma_iova_link(struct device *dev, struct dma_iova_state *state,
+		phys_addr_t phys, size_t offset, size_t size,
+		enum dma_data_direction dir, unsigned long attrs);
+
+Is used to link ranges to the IOVA previously allocated. The start of all
+but the first call to dma_iova_link for a given state must be aligned
+to the DMA merge boundary returned by ``dma_get_merge_boundary()``, and
+the size of all but the last range must be aligned to the DMA merge boundary
+as well.
+
+::
+
+	int dma_iova_sync(struct device *dev, struct dma_iova_state *state,
+		size_t offset, size_t size);
+
+Must be called to sync the IOMMU page tables for the IOVA range mapped by one
+or more calls to ``dma_iova_link()``.
+
+For drivers that use a one-shot mapping, all ranges can be unmapped and the
+IOVA freed by calling:
+
+::
+
+	void dma_iova_destroy(struct device *dev, struct dma_iova_state *state,
+		size_t mapped_len, enum dma_data_direction dir,
+		unsigned long attrs);
+
+Alternatively, drivers can dynamically manage the IOVA space by unmapping
+and mapping individual regions. In that case
+
+::
+
+	void dma_iova_unlink(struct device *dev, struct dma_iova_state *state,
+		size_t offset, size_t size, enum dma_data_direction dir,
+		unsigned long attrs);
+
+is used to unmap a range previously mapped, and
+
+::
+
+	void dma_iova_free(struct device *dev, struct dma_iova_state *state);
+
+is used to free the IOVA space. All regions must have been unmapped using
+``dma_iova_unlink()`` before calling ``dma_iova_free()``.
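The call sequence documented above can be strung together into a short sketch. This is not part of the patch: the driver fragment below is hypothetical (``my_map_buffer()``/``my_unmap_buffer()`` and the one-shot ``DMA_TO_DEVICE`` usage are assumptions), and it only illustrates how the documented calls fit together for a single physically contiguous buffer:

::

	#include <linux/dma-mapping.h>

	/* Sketch: map one physically contiguous buffer via the IOVA-based API. */
	static int my_map_buffer(struct device *dev, struct dma_iova_state *state,
				 phys_addr_t phys, size_t size)
	{
		int ret;

		/* Fall back to the streaming API if IOVA allocation is unavailable. */
		if (!dma_iova_try_alloc(dev, state, phys, size))
			return -EOPNOTSUPP;

		ret = dma_iova_link(dev, state, phys, 0, size, DMA_TO_DEVICE, 0);
		if (ret) {
			dma_iova_free(dev, state);	/* nothing linked yet */
			return ret;
		}

		ret = dma_iova_sync(dev, state, 0, size);
		if (ret) {
			dma_iova_destroy(dev, state, size, DMA_TO_DEVICE, 0);
			return ret;
		}

		return 0;
	}

	/* One-shot teardown: unmap every linked range and release the IOVA. */
	static void my_unmap_buffer(struct device *dev, struct dma_iova_state *state,
				    size_t size)
	{
		if (dma_use_iova(state))
			dma_iova_destroy(dev, state, size, DMA_TO_DEVICE, 0);
		/* otherwise undo whatever streaming mapping was used as the fallback */
	}

A driver that manages the IOVA space dynamically would instead pair ``dma_iova_link()`` with ``dma_iova_unlink()`` per region and call ``dma_iova_free()`` only once every region has been unlinked.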
Part II - Non-coherent DMA allocations -------------------------------------- diff --git a/MAINTAINERS b/MAINTAINERS index dd844ac8d910..4cf394cf4bff 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -11924,7 +11924,7 @@ F: Documentation/networking/device_drivers/ethernet/intel/ F: drivers/net/ethernet/intel/ F: drivers/net/ethernet/intel/*/ F: include/linux/avf/virtchnl.h -F: include/linux/net/intel/iidc.h +F: include/linux/net/intel/*/ INTEL ETHERNET PROTOCOL DRIVER FOR RDMA M: Mustafa Ismail <mustafa.ismail@intel.com> diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c index 142170473e75..8670e58675c6 100644 --- a/drivers/infiniband/core/cm.c +++ b/drivers/infiniband/core/cm.c @@ -36,6 +36,7 @@ MODULE_LICENSE("Dual BSD/GPL"); #define CM_DESTROY_ID_WAIT_TIMEOUT 10000 /* msecs */ #define CM_DIRECT_RETRY_CTX ((void *) 1UL) +#define CM_MRA_SETTING 24 /* 4.096us * 2^24 = ~68.7 seconds */ static const char * const ibcm_rej_reason_strs[] = { [IB_CM_REJ_NO_QP] = "no QP", @@ -167,7 +168,7 @@ struct cm_port { struct cm_device { struct kref kref; struct list_head list; - spinlock_t mad_agent_lock; + rwlock_t mad_agent_lock; struct ib_device *ib_device; u8 ack_delay; int going_down; @@ -241,7 +242,6 @@ struct cm_id_private { u8 initiator_depth; u8 retry_count; u8 rnr_retry_count; - u8 service_timeout; u8 target_ack_delay; struct list_head work_list; @@ -285,7 +285,7 @@ static struct ib_mad_send_buf *cm_alloc_msg(struct cm_id_private *cm_id_priv) if (!cm_id_priv->av.port) return ERR_PTR(-EINVAL); - spin_lock(&cm_id_priv->av.port->cm_dev->mad_agent_lock); + read_lock(&cm_id_priv->av.port->cm_dev->mad_agent_lock); mad_agent = cm_id_priv->av.port->mad_agent; if (!mad_agent) { m = ERR_PTR(-EINVAL); @@ -311,7 +311,7 @@ static struct ib_mad_send_buf *cm_alloc_msg(struct cm_id_private *cm_id_priv) m->ah = ah; out: - spin_unlock(&cm_id_priv->av.port->cm_dev->mad_agent_lock); + read_unlock(&cm_id_priv->av.port->cm_dev->mad_agent_lock); return m; } @@ -1297,10 +1297,10 @@ static __be64 cm_form_tid(struct cm_id_private *cm_id_priv) if (!cm_id_priv->av.port) return cpu_to_be64(low_tid); - spin_lock(&cm_id_priv->av.port->cm_dev->mad_agent_lock); + read_lock(&cm_id_priv->av.port->cm_dev->mad_agent_lock); if (cm_id_priv->av.port->mad_agent) hi_tid = ((u64)cm_id_priv->av.port->mad_agent->hi_tid) << 32; - spin_unlock(&cm_id_priv->av.port->cm_dev->mad_agent_lock); + read_unlock(&cm_id_priv->av.port->cm_dev->mad_agent_lock); return cpu_to_be64(hi_tid | low_tid); } @@ -1872,7 +1872,7 @@ static void cm_process_work(struct cm_id_private *cm_id_priv, static void cm_format_mra(struct cm_mra_msg *mra_msg, struct cm_id_private *cm_id_priv, - enum cm_msg_response msg_mraed, u8 service_timeout, + enum cm_msg_response msg_mraed, const void *private_data, u8 private_data_len) { cm_format_mad_hdr(&mra_msg->hdr, CM_MRA_ATTR_ID, cm_id_priv->tid); @@ -1881,7 +1881,7 @@ static void cm_format_mra(struct cm_mra_msg *mra_msg, be32_to_cpu(cm_id_priv->id.local_id)); IBA_SET(CM_MRA_REMOTE_COMM_ID, mra_msg, be32_to_cpu(cm_id_priv->id.remote_id)); - IBA_SET(CM_MRA_SERVICE_TIMEOUT, mra_msg, service_timeout); + IBA_SET(CM_MRA_SERVICE_TIMEOUT, mra_msg, CM_MRA_SETTING); if (private_data && private_data_len) IBA_SET_MEM(CM_MRA_PRIVATE_DATA, mra_msg, private_data, @@ -1960,7 +1960,7 @@ static void cm_dup_req_handler(struct cm_work *work, switch (cm_id_priv->id.state) { case IB_CM_MRA_REQ_SENT: cm_format_mra((struct cm_mra_msg *) msg->mad, cm_id_priv, - CM_MSG_RESPONSE_REQ, cm_id_priv->service_timeout, + CM_MSG_RESPONSE_REQ, 
cm_id_priv->private_data, cm_id_priv->private_data_len); break; @@ -2454,7 +2454,7 @@ static void cm_dup_rep_handler(struct cm_work *work) cm_id_priv->private_data_len); else if (cm_id_priv->id.state == IB_CM_MRA_REP_SENT) cm_format_mra((struct cm_mra_msg *) msg->mad, cm_id_priv, - CM_MSG_RESPONSE_REP, cm_id_priv->service_timeout, + CM_MSG_RESPONSE_REP, cm_id_priv->private_data, cm_id_priv->private_data_len); else @@ -3094,26 +3094,13 @@ out: return -EINVAL; } -int ib_send_cm_mra(struct ib_cm_id *cm_id, - u8 service_timeout, - const void *private_data, - u8 private_data_len) +int ib_prepare_cm_mra(struct ib_cm_id *cm_id) { struct cm_id_private *cm_id_priv; - struct ib_mad_send_buf *msg; enum ib_cm_state cm_state; enum ib_cm_lap_state lap_state; - enum cm_msg_response msg_response; - void *data; unsigned long flags; - int ret; - - if (private_data && private_data_len > IB_CM_MRA_PRIVATE_DATA_SIZE) - return -EINVAL; - - data = cm_copy_private_data(private_data, private_data_len); - if (IS_ERR(data)) - return PTR_ERR(data); + int ret = 0; cm_id_priv = container_of(cm_id, struct cm_id_private, id); @@ -3122,58 +3109,33 @@ int ib_send_cm_mra(struct ib_cm_id *cm_id, case IB_CM_REQ_RCVD: cm_state = IB_CM_MRA_REQ_SENT; lap_state = cm_id->lap_state; - msg_response = CM_MSG_RESPONSE_REQ; break; case IB_CM_REP_RCVD: cm_state = IB_CM_MRA_REP_SENT; lap_state = cm_id->lap_state; - msg_response = CM_MSG_RESPONSE_REP; break; case IB_CM_ESTABLISHED: if (cm_id->lap_state == IB_CM_LAP_RCVD) { cm_state = cm_id->state; lap_state = IB_CM_MRA_LAP_SENT; - msg_response = CM_MSG_RESPONSE_OTHER; break; } fallthrough; default: - trace_icm_send_mra_unknown_err(&cm_id_priv->id); + trace_icm_prepare_mra_unknown_err(&cm_id_priv->id); ret = -EINVAL; goto error_unlock; } - if (!(service_timeout & IB_CM_MRA_FLAG_DELAY)) { - msg = cm_alloc_msg(cm_id_priv); - if (IS_ERR(msg)) { - ret = PTR_ERR(msg); - goto error_unlock; - } - - cm_format_mra((struct cm_mra_msg *) msg->mad, cm_id_priv, - msg_response, service_timeout, - private_data, private_data_len); - trace_icm_send_mra(cm_id); - ret = ib_post_send_mad(msg, NULL); - if (ret) - goto error_free_msg; - } - cm_id->state = cm_state; cm_id->lap_state = lap_state; - cm_id_priv->service_timeout = service_timeout; - cm_set_private_data(cm_id_priv, data, private_data_len); - spin_unlock_irqrestore(&cm_id_priv->lock, flags); - return 0; + cm_set_private_data(cm_id_priv, NULL, 0); -error_free_msg: - cm_free_msg(msg); error_unlock: spin_unlock_irqrestore(&cm_id_priv->lock, flags); - kfree(data); return ret; } -EXPORT_SYMBOL(ib_send_cm_mra); +EXPORT_SYMBOL(ib_prepare_cm_mra); static struct cm_id_private *cm_acquire_mraed_id(struct cm_mra_msg *mra_msg) { @@ -3377,7 +3339,6 @@ static int cm_lap_handler(struct cm_work *work) cm_format_mra((struct cm_mra_msg *) msg->mad, cm_id_priv, CM_MSG_RESPONSE_OTHER, - cm_id_priv->service_timeout, cm_id_priv->private_data, cm_id_priv->private_data_len); spin_unlock_irq(&cm_id_priv->lock); @@ -3786,7 +3747,8 @@ static void cm_process_send_error(struct cm_id_private *cm_id_priv, spin_lock_irq(&cm_id_priv->lock); if (msg != cm_id_priv->msg) { spin_unlock_irq(&cm_id_priv->lock); - cm_free_priv_msg(msg); + cm_free_msg(msg); + cm_deref_id(cm_id_priv); return; } cm_free_priv_msg(msg); @@ -4378,7 +4340,7 @@ static int cm_add_one(struct ib_device *ib_device) return -ENOMEM; kref_init(&cm_dev->kref); - spin_lock_init(&cm_dev->mad_agent_lock); + rwlock_init(&cm_dev->mad_agent_lock); cm_dev->ib_device = ib_device; cm_dev->ack_delay = 
ib_device->attrs.local_ca_ack_delay; cm_dev->going_down = 0; @@ -4494,9 +4456,9 @@ static void cm_remove_one(struct ib_device *ib_device, void *client_data) * The above ensures no call paths from the work are running, * the remaining paths all take the mad_agent_lock. */ - spin_lock(&cm_dev->mad_agent_lock); + write_lock(&cm_dev->mad_agent_lock); port->mad_agent = NULL; - spin_unlock(&cm_dev->mad_agent_lock); + write_unlock(&cm_dev->mad_agent_lock); ib_unregister_mad_agent(mad_agent); ib_port_unregister_client_groups(ib_device, i, cm_counter_groups); diff --git a/drivers/infiniband/core/cm_trace.h b/drivers/infiniband/core/cm_trace.h index 944d9071245d..4a4987da69d4 100644 --- a/drivers/infiniband/core/cm_trace.h +++ b/drivers/infiniband/core/cm_trace.h @@ -229,7 +229,7 @@ DEFINE_CM_ERR_EVENT(send_drep); DEFINE_CM_ERR_EVENT(dreq_unknown); DEFINE_CM_ERR_EVENT(send_unknown_rej); DEFINE_CM_ERR_EVENT(rej_unknown); -DEFINE_CM_ERR_EVENT(send_mra_unknown); +DEFINE_CM_ERR_EVENT(prepare_mra_unknown); DEFINE_CM_ERR_EVENT(mra_unknown); DEFINE_CM_ERR_EVENT(qp_init); DEFINE_CM_ERR_EVENT(qp_rtr); diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index ab31eefa916b..ce942d300f1e 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -46,7 +46,6 @@ MODULE_LICENSE("Dual BSD/GPL"); #define CMA_CM_RESPONSE_TIMEOUT 20 #define CMA_MAX_CM_RETRIES 15 -#define CMA_CM_MRA_SETTING (IB_CM_MRA_FLAG_DELAY | 24) #define CMA_IBOE_PACKET_LIFETIME 16 #define CMA_PREFERRED_ROCE_GID_TYPE IB_GID_TYPE_ROCE_UDP_ENCAP @@ -146,19 +145,6 @@ struct iw_cm_id *rdma_iw_cm_id(struct rdma_cm_id *id) } EXPORT_SYMBOL(rdma_iw_cm_id); -/** - * rdma_res_to_id() - return the rdma_cm_id pointer for this restrack. - * @res: rdma resource tracking entry pointer - */ -struct rdma_cm_id *rdma_res_to_id(struct rdma_restrack_entry *res) -{ - struct rdma_id_private *id_priv = - container_of(res, struct rdma_id_private, res); - - return &id_priv->id; -} -EXPORT_SYMBOL(rdma_res_to_id); - static int cma_add_one(struct ib_device *device); static void cma_remove_one(struct ib_device *device, void *client_data); @@ -2214,8 +2200,8 @@ static int cma_ib_handler(struct ib_cm_id *cm_id, case IB_CM_REP_RECEIVED: if (state == RDMA_CM_CONNECT && (id_priv->id.qp_type != IB_QPT_UD)) { - trace_cm_send_mra(id_priv); - ib_send_cm_mra(cm_id, CMA_CM_MRA_SETTING, NULL, 0); + trace_cm_prepare_mra(id_priv); + ib_prepare_cm_mra(cm_id); } if (id_priv->id.qp) { event.status = cma_rep_recv(id_priv); @@ -2476,8 +2462,8 @@ static int cma_ib_req_handler(struct ib_cm_id *cm_id, if (READ_ONCE(conn_id->state) == RDMA_CM_CONNECT && conn_id->id.qp_type != IB_QPT_UD) { - trace_cm_send_mra(cm_id->context); - ib_send_cm_mra(cm_id, CMA_CM_MRA_SETTING, NULL, 0); + trace_cm_prepare_mra(cm_id->context); + ib_prepare_cm_mra(cm_id); } mutex_unlock(&conn_id->handler_mutex); diff --git a/drivers/infiniband/core/cma_trace.h b/drivers/infiniband/core/cma_trace.h index dc622f3778be..3456d5f3aa47 100644 --- a/drivers/infiniband/core/cma_trace.h +++ b/drivers/infiniband/core/cma_trace.h @@ -55,7 +55,7 @@ DECLARE_EVENT_CLASS(cma_fsm_class, DEFINE_CMA_FSM_EVENT(send_rtu); DEFINE_CMA_FSM_EVENT(send_rej); -DEFINE_CMA_FSM_EVENT(send_mra); +DEFINE_CMA_FSM_EVENT(prepare_mra); DEFINE_CMA_FSM_EVENT(send_sidr_req); DEFINE_CMA_FSM_EVENT(send_sidr_rep); DEFINE_CMA_FSM_EVENT(disconnect); diff --git a/drivers/infiniband/core/iwcm.c b/drivers/infiniband/core/iwcm.c index f4486cbd8f45..62410578dec3 100644 --- a/drivers/infiniband/core/iwcm.c +++ 
b/drivers/infiniband/core/iwcm.c @@ -368,12 +368,9 @@ EXPORT_SYMBOL(iw_cm_disconnect); /* * CM_ID <-- DESTROYING * - * Clean up all resources associated with the connection and release - * the initial reference taken by iw_create_cm_id. - * - * Returns true if and only if the last cm_id_priv reference has been dropped. + * Clean up all resources associated with the connection. */ -static bool destroy_cm_id(struct iw_cm_id *cm_id) +static void destroy_cm_id(struct iw_cm_id *cm_id) { struct iwcm_id_private *cm_id_priv; struct ib_qp *qp; @@ -442,20 +439,22 @@ static bool destroy_cm_id(struct iw_cm_id *cm_id) iwpm_remove_mapinfo(&cm_id->local_addr, &cm_id->m_local_addr); iwpm_remove_mapping(&cm_id->local_addr, RDMA_NL_IWCM); } - - return iwcm_deref_id(cm_id_priv); } /* - * This function is only called by the application thread and cannot - * be called by the event thread. The function will wait for all - * references to be released on the cm_id and then kfree the cm_id - * object. + * Destroy cm_id. If the cm_id still has other references, wait for all + * references to be released on the cm_id and then release the initial + * reference taken by iw_create_cm_id. */ void iw_destroy_cm_id(struct iw_cm_id *cm_id) { - if (!destroy_cm_id(cm_id)) + struct iwcm_id_private *cm_id_priv; + + cm_id_priv = container_of(cm_id, struct iwcm_id_private, id); + destroy_cm_id(cm_id); + if (refcount_read(&cm_id_priv->refcount) > 1) flush_workqueue(iwcm_wq); + iwcm_deref_id(cm_id_priv); } EXPORT_SYMBOL(iw_destroy_cm_id); @@ -1035,8 +1034,10 @@ static void cm_work_handler(struct work_struct *_work) if (!test_bit(IWCM_F_DROP_EVENTS, &cm_id_priv->flags)) { ret = process_event(cm_id_priv, &levent); - if (ret) - WARN_ON_ONCE(destroy_cm_id(&cm_id_priv->id)); + if (ret) { + destroy_cm_id(&cm_id_priv->id); + WARN_ON_ONCE(iwcm_deref_id(cm_id_priv)); + } } else pr_debug("dropping event %d\n", levent.event); if (iwcm_deref_id(cm_id_priv)) diff --git a/drivers/infiniband/core/mad_rmpp.c b/drivers/infiniband/core/mad_rmpp.c index 8af0619a39cd..b4b10e8a6495 100644 --- a/drivers/infiniband/core/mad_rmpp.c +++ b/drivers/infiniband/core/mad_rmpp.c @@ -158,7 +158,7 @@ static struct ib_mad_send_buf *alloc_response_msg(struct ib_mad_agent *agent, ah = ib_create_ah_from_wc(agent->qp->pd, recv_wc->wc, recv_wc->recv_buf.grh, agent->port_num); if (IS_ERR(ah)) - return (void *) ah; + return ERR_CAST(ah); hdr_len = ib_get_mad_data_offset(recv_wc->recv_buf.mad->mad_hdr.mgmt_class); msg = ib_create_send_mad(agent, recv_wc->wc->src_qp, diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c index c48ef6083020..c752ae9fad6c 100644 --- a/drivers/infiniband/core/umem_odp.c +++ b/drivers/infiniband/core/umem_odp.c @@ -41,67 +41,72 @@ #include <linux/hugetlb.h> #include <linux/interval_tree.h> #include <linux/hmm.h> +#include <linux/hmm-dma.h> #include <linux/pagemap.h> #include <rdma/ib_umem_odp.h> #include "uverbs.h" -static inline int ib_init_umem_odp(struct ib_umem_odp *umem_odp, - const struct mmu_interval_notifier_ops *ops) +static void ib_init_umem_implicit_odp(struct ib_umem_odp *umem_odp) { - int ret; + umem_odp->is_implicit_odp = 1; + umem_odp->umem.is_odp = 1; + mutex_init(&umem_odp->umem_mutex); +} + +static int ib_init_umem_odp(struct ib_umem_odp *umem_odp, + const struct mmu_interval_notifier_ops *ops) +{ + struct ib_device *dev = umem_odp->umem.ibdev; + size_t page_size = 1UL << umem_odp->page_shift; + struct hmm_dma_map *map; + unsigned long start; + unsigned long end; + size_t nr_entries; + int ret = 
0; umem_odp->umem.is_odp = 1; mutex_init(&umem_odp->umem_mutex); - if (!umem_odp->is_implicit_odp) { - size_t page_size = 1UL << umem_odp->page_shift; - unsigned long start; - unsigned long end; - size_t ndmas, npfns; - - start = ALIGN_DOWN(umem_odp->umem.address, page_size); - if (check_add_overflow(umem_odp->umem.address, - (unsigned long)umem_odp->umem.length, - &end)) - return -EOVERFLOW; - end = ALIGN(end, page_size); - if (unlikely(end < page_size)) - return -EOVERFLOW; - - ndmas = (end - start) >> umem_odp->page_shift; - if (!ndmas) - return -EINVAL; - - npfns = (end - start) >> PAGE_SHIFT; - umem_odp->pfn_list = kvcalloc( - npfns, sizeof(*umem_odp->pfn_list), - GFP_KERNEL | __GFP_NOWARN); - if (!umem_odp->pfn_list) - return -ENOMEM; - - umem_odp->dma_list = kvcalloc( - ndmas, sizeof(*umem_odp->dma_list), - GFP_KERNEL | __GFP_NOWARN); - if (!umem_odp->dma_list) { + start = ALIGN_DOWN(umem_odp->umem.address, page_size); + if (check_add_overflow(umem_odp->umem.address, + (unsigned long)umem_odp->umem.length, &end)) + return -EOVERFLOW; + end = ALIGN(end, page_size); + if (unlikely(end < page_size)) + return -EOVERFLOW; + + nr_entries = (end - start) >> PAGE_SHIFT; + if (!(nr_entries * PAGE_SIZE / page_size)) + return -EINVAL; + + map = &umem_odp->map; + if (ib_uses_virt_dma(dev)) { + map->pfn_list = kvcalloc(nr_entries, sizeof(*map->pfn_list), + GFP_KERNEL | __GFP_NOWARN); + if (!map->pfn_list) ret = -ENOMEM; - goto out_pfn_list; - } + } else + ret = hmm_dma_map_alloc(dev->dma_device, map, + (end - start) >> PAGE_SHIFT, + 1 << umem_odp->page_shift); + if (ret) + return ret; - ret = mmu_interval_notifier_insert(&umem_odp->notifier, - umem_odp->umem.owning_mm, - start, end - start, ops); - if (ret) - goto out_dma_list; - } + ret = mmu_interval_notifier_insert(&umem_odp->notifier, + umem_odp->umem.owning_mm, start, + end - start, ops); + if (ret) + goto out_free_map; return 0; -out_dma_list: - kvfree(umem_odp->dma_list); -out_pfn_list: - kvfree(umem_odp->pfn_list); +out_free_map: + if (ib_uses_virt_dma(dev)) + kfree(map->pfn_list); + else + hmm_dma_map_free(dev->dma_device, map); return ret; } @@ -120,7 +125,6 @@ struct ib_umem_odp *ib_umem_odp_alloc_implicit(struct ib_device *device, { struct ib_umem *umem; struct ib_umem_odp *umem_odp; - int ret; if (access & IB_ACCESS_HUGETLB) return ERR_PTR(-EINVAL); @@ -132,16 +136,10 @@ struct ib_umem_odp *ib_umem_odp_alloc_implicit(struct ib_device *device, umem->ibdev = device; umem->writable = ib_access_writable(access); umem->owning_mm = current->mm; - umem_odp->is_implicit_odp = 1; umem_odp->page_shift = PAGE_SHIFT; umem_odp->tgid = get_task_pid(current->group_leader, PIDTYPE_PID); - ret = ib_init_umem_odp(umem_odp, NULL); - if (ret) { - put_pid(umem_odp->tgid); - kfree(umem_odp); - return ERR_PTR(ret); - } + ib_init_umem_implicit_odp(umem_odp); return umem_odp; } EXPORT_SYMBOL(ib_umem_odp_alloc_implicit); @@ -262,74 +260,41 @@ err_put_pid: } EXPORT_SYMBOL(ib_umem_odp_get); -void ib_umem_odp_release(struct ib_umem_odp *umem_odp) +static void ib_umem_odp_free(struct ib_umem_odp *umem_odp) { + struct ib_device *dev = umem_odp->umem.ibdev; + /* * Ensure that no more pages are mapped in the umem. * * It is the driver's responsibility to ensure, before calling us, * that the hardware will not attempt to access the MR any more. 
*/ - if (!umem_odp->is_implicit_odp) { - mutex_lock(&umem_odp->umem_mutex); - ib_umem_odp_unmap_dma_pages(umem_odp, ib_umem_start(umem_odp), - ib_umem_end(umem_odp)); - mutex_unlock(&umem_odp->umem_mutex); - mmu_interval_notifier_remove(&umem_odp->notifier); - kvfree(umem_odp->dma_list); - kvfree(umem_odp->pfn_list); - } - put_pid(umem_odp->tgid); - kfree(umem_odp); + mutex_lock(&umem_odp->umem_mutex); + ib_umem_odp_unmap_dma_pages(umem_odp, ib_umem_start(umem_odp), + ib_umem_end(umem_odp)); + mutex_unlock(&umem_odp->umem_mutex); + mmu_interval_notifier_remove(&umem_odp->notifier); + if (ib_uses_virt_dma(dev)) + kfree(umem_odp->map.pfn_list); + else + hmm_dma_map_free(dev->dma_device, &umem_odp->map); } -EXPORT_SYMBOL(ib_umem_odp_release); -/* - * Map for DMA and insert a single page into the on-demand paging page tables. - * - * @umem: the umem to insert the page to. - * @dma_index: index in the umem to add the dma to. - * @page: the page struct to map and add. - * @access_mask: access permissions needed for this page. - * - * The function returns -EFAULT if the DMA mapping operation fails. - * - */ -static int ib_umem_odp_map_dma_single_page( - struct ib_umem_odp *umem_odp, - unsigned int dma_index, - struct page *page, - u64 access_mask) +void ib_umem_odp_release(struct ib_umem_odp *umem_odp) { - struct ib_device *dev = umem_odp->umem.ibdev; - dma_addr_t *dma_addr = &umem_odp->dma_list[dma_index]; - - if (*dma_addr) { - /* - * If the page is already dma mapped it means it went through - * a non-invalidating trasition, like read-only to writable. - * Resync the flags. - */ - *dma_addr = (*dma_addr & ODP_DMA_ADDR_MASK) | access_mask; - return 0; - } + if (!umem_odp->is_implicit_odp) + ib_umem_odp_free(umem_odp); - *dma_addr = ib_dma_map_page(dev, page, 0, 1 << umem_odp->page_shift, - DMA_BIDIRECTIONAL); - if (ib_dma_mapping_error(dev, *dma_addr)) { - *dma_addr = 0; - return -EFAULT; - } - umem_odp->npages++; - *dma_addr |= access_mask; - return 0; + put_pid(umem_odp->tgid); + kfree(umem_odp); } +EXPORT_SYMBOL(ib_umem_odp_release); /** * ib_umem_odp_map_dma_and_lock - DMA map userspace memory in an ODP MR and lock it. * * Maps the range passed in the argument to DMA addresses. - * The DMA addresses of the mapped pages is updated in umem_odp->dma_list. * Upon success the ODP MR will be locked to let caller complete its device * page table update. * @@ -357,9 +322,6 @@ int ib_umem_odp_map_dma_and_lock(struct ib_umem_odp *umem_odp, u64 user_virt, struct hmm_range range = {}; unsigned long timeout; - if (access_mask == 0) - return -EINVAL; - if (user_virt < ib_umem_start(umem_odp) || user_virt + bcnt > ib_umem_end(umem_odp)) return -EFAULT; @@ -385,11 +347,11 @@ int ib_umem_odp_map_dma_and_lock(struct ib_umem_odp *umem_odp, u64 user_virt, if (fault) { range.default_flags = HMM_PFN_REQ_FAULT; - if (access_mask & ODP_WRITE_ALLOWED_BIT) + if (access_mask & HMM_PFN_WRITE) range.default_flags |= HMM_PFN_REQ_WRITE; } - range.hmm_pfns = &(umem_odp->pfn_list[pfn_start_idx]); + range.hmm_pfns = &(umem_odp->map.pfn_list[pfn_start_idx]); timeout = jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT); retry: @@ -417,22 +379,17 @@ retry: for (pfn_index = 0; pfn_index < num_pfns; pfn_index += 1 << (page_shift - PAGE_SHIFT), dma_index++) { - if (fault) { - /* - * Since we asked for hmm_range_fault() to populate - * pages it shouldn't return an error entry on success. 
- */ - WARN_ON(range.hmm_pfns[pfn_index] & HMM_PFN_ERROR); - WARN_ON(!(range.hmm_pfns[pfn_index] & HMM_PFN_VALID)); - } else { - if (!(range.hmm_pfns[pfn_index] & HMM_PFN_VALID)) { - WARN_ON(umem_odp->dma_list[dma_index]); - continue; - } - access_mask = ODP_READ_ALLOWED_BIT; - if (range.hmm_pfns[pfn_index] & HMM_PFN_WRITE) - access_mask |= ODP_WRITE_ALLOWED_BIT; - } + /* + * Since we asked for hmm_range_fault() to populate + * pages it shouldn't return an error entry on success. + */ + WARN_ON(fault && range.hmm_pfns[pfn_index] & HMM_PFN_ERROR); + WARN_ON(fault && !(range.hmm_pfns[pfn_index] & HMM_PFN_VALID)); + if (!(range.hmm_pfns[pfn_index] & HMM_PFN_VALID)) + continue; + + if (range.hmm_pfns[pfn_index] & HMM_PFN_DMA_MAPPED) + continue; hmm_order = hmm_pfn_to_map_order(range.hmm_pfns[pfn_index]); /* If a hugepage was detected and ODP wasn't set for, the umem @@ -445,15 +402,6 @@ retry: __func__, hmm_order, page_shift); break; } - - ret = ib_umem_odp_map_dma_single_page( - umem_odp, dma_index, hmm_pfn_to_page(range.hmm_pfns[pfn_index]), - access_mask); - if (ret < 0) { - ibdev_dbg(umem_odp->umem.ibdev, - "ib_umem_odp_map_dma_single_page failed with error %d\n", ret); - break; - } } /* upon success lock should stay on hold for the callee */ if (!ret) @@ -473,45 +421,38 @@ EXPORT_SYMBOL(ib_umem_odp_map_dma_and_lock); void ib_umem_odp_unmap_dma_pages(struct ib_umem_odp *umem_odp, u64 virt, u64 bound) { - dma_addr_t dma_addr; - dma_addr_t dma; - int idx; - u64 addr; struct ib_device *dev = umem_odp->umem.ibdev; + u64 addr; lockdep_assert_held(&umem_odp->umem_mutex); virt = max_t(u64, virt, ib_umem_start(umem_odp)); bound = min_t(u64, bound, ib_umem_end(umem_odp)); for (addr = virt; addr < bound; addr += BIT(umem_odp->page_shift)) { - idx = (addr - ib_umem_start(umem_odp)) >> umem_odp->page_shift; - dma = umem_odp->dma_list[idx]; - - /* The access flags guaranteed a valid DMA address in case was NULL */ - if (dma) { - unsigned long pfn_idx = (addr - ib_umem_start(umem_odp)) >> PAGE_SHIFT; - struct page *page = hmm_pfn_to_page(umem_odp->pfn_list[pfn_idx]); - - dma_addr = dma & ODP_DMA_ADDR_MASK; - ib_dma_unmap_page(dev, dma_addr, - BIT(umem_odp->page_shift), - DMA_BIDIRECTIONAL); - if (dma & ODP_WRITE_ALLOWED_BIT) { - struct page *head_page = compound_head(page); - /* - * set_page_dirty prefers being called with - * the page lock. However, MMU notifiers are - * called sometimes with and sometimes without - * the lock. We rely on the umem_mutex instead - * to prevent other mmu notifiers from - * continuing and allowing the page mapping to - * be removed. - */ - set_page_dirty(head_page); - } - umem_odp->dma_list[idx] = 0; - umem_odp->npages--; + u64 offset = addr - ib_umem_start(umem_odp); + size_t idx = offset >> umem_odp->page_shift; + unsigned long pfn = umem_odp->map.pfn_list[idx]; + + if (!hmm_dma_unmap_pfn(dev->dma_device, &umem_odp->map, idx)) + goto clear; + + if (pfn & HMM_PFN_WRITE) { + struct page *page = hmm_pfn_to_page(pfn); + struct page *head_page = compound_head(page); + /* + * set_page_dirty prefers being called with + * the page lock. However, MMU notifiers are + * called sometimes with and sometimes without + * the lock. We rely on the umem_mutex instead + * to prevent other mmu notifiers from + * continuing and allowing the page mapping to + * be removed. 
+ */ + set_page_dirty(head_page); } + umem_odp->npages--; +clear: + umem_odp->map.pfn_list[idx] &= ~HMM_PFN_FLAGS; } } EXPORT_SYMBOL(ib_umem_odp_unmap_dma_pages); diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 3c3bb670c805..bc9fe3ceca4d 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -193,7 +193,7 @@ _ib_uverbs_lookup_comp_file(s32 fd, struct uverbs_attr_bundle *attrs) fd, attrs); if (IS_ERR(uobj)) - return (void *)uobj; + return ERR_CAST(uobj); uverbs_uobject_get(uobj); uobj_put_read(uobj); diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index c5e78bbefbd0..75fde0fe9989 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -572,7 +572,7 @@ struct ib_ah *rdma_create_ah(struct ib_pd *pd, struct rdma_ah_attr *ah_attr, GFP_KERNEL : GFP_ATOMIC); if (IS_ERR(slave)) { rdma_unfill_sgid_attr(ah_attr, old_sgid_attr); - return (void *)slave; + return ERR_CAST(slave); } ah = _rdma_create_ah(pd, ah_attr, flags, NULL, slave); rdma_lag_put_ah_roce_slave(slave); diff --git a/drivers/infiniband/hw/bnxt_re/debugfs.c b/drivers/infiniband/hw/bnxt_re/debugfs.c index af91d16c3c77..e632f1661b92 100644 --- a/drivers/infiniband/hw/bnxt_re/debugfs.c +++ b/drivers/infiniband/hw/bnxt_re/debugfs.c @@ -170,6 +170,9 @@ static int map_cc_config_offset_gen0_ext0(u32 offset, struct bnxt_qplib_cc_param case CMDQ_MODIFY_ROCE_CC_MODIFY_MASK_TCP_CP: *val = ccparam->tcp_cp; break; + case CMDQ_MODIFY_ROCE_CC_MODIFY_MASK_INACTIVITY_CP: + *val = ccparam->inact_th; + break; default: return -EINVAL; } @@ -203,7 +206,7 @@ static ssize_t bnxt_re_cc_config_get(struct file *filp, char __user *buffer, return simple_read_from_buffer(buffer, usr_buf_len, ppos, (u8 *)(buf), rc); } -static void bnxt_re_fill_gen0_ext0(struct bnxt_qplib_cc_param *ccparam, u32 offset, u32 val) +static int bnxt_re_fill_gen0_ext0(struct bnxt_qplib_cc_param *ccparam, u32 offset, u32 val) { u32 modify_mask; @@ -247,7 +250,9 @@ static void bnxt_re_fill_gen0_ext0(struct bnxt_qplib_cc_param *ccparam, u32 offs ccparam->tcp_cp = val; break; case CMDQ_MODIFY_ROCE_CC_MODIFY_MASK_TX_QUEUE: + return -EOPNOTSUPP; case CMDQ_MODIFY_ROCE_CC_MODIFY_MASK_INACTIVITY_CP: + ccparam->inact_th = val; break; case CMDQ_MODIFY_ROCE_CC_MODIFY_MASK_TIME_PER_PHASE: ccparam->time_pph = val; @@ -258,17 +263,20 @@ static void bnxt_re_fill_gen0_ext0(struct bnxt_qplib_cc_param *ccparam, u32 offs } ccparam->mask = modify_mask; + return 0; } static int bnxt_re_configure_cc(struct bnxt_re_dev *rdev, u32 gen_ext, u32 offset, u32 val) { struct bnxt_qplib_cc_param ccparam = { }; + int rc; - /* Supporting only Gen 0 now */ - if (gen_ext == CC_CONFIG_GEN0_EXT0) - bnxt_re_fill_gen0_ext0(&ccparam, offset, val); - else - return -EINVAL; + if (gen_ext != CC_CONFIG_GEN0_EXT0) + return -EOPNOTSUPP; + + rc = bnxt_re_fill_gen0_ext0(&ccparam, offset, val); + if (rc) + return rc; bnxt_qplib_modify_cc(&rdev->qplib_res, &ccparam); return 0; diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.c b/drivers/infiniband/hw/bnxt_re/qplib_fp.c index 457eecb99f96..be34c605d516 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_fp.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_fp.c @@ -1113,7 +1113,7 @@ int bnxt_qplib_create_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp) qp_flags |= CMDQ_CREATE_QP_QP_FLAGS_FORCE_COMPLETION; if (qp->wqe_mode == BNXT_QPLIB_WQE_MODE_VARIABLE) qp_flags |= CMDQ_CREATE_QP_QP_FLAGS_VARIABLE_SIZED_WQE_ENABLED; - if 
(_is_ext_stats_supported(res->dattr->dev_cap_flags) && !res->is_vf) + if (bnxt_ext_stats_supported(res->cctx, res->dattr->dev_cap_flags, res->is_vf)) qp_flags |= CMDQ_CREATE_QP_QP_FLAGS_EXT_STATS_ENABLED; req.qp_flags = cpu_to_le32(qp_flags); diff --git a/drivers/infiniband/hw/bnxt_re/qplib_sp.c b/drivers/infiniband/hw/bnxt_re/qplib_sp.c index f231e886ad9d..9efd32a3dc55 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_sp.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_sp.c @@ -846,7 +846,12 @@ int bnxt_qplib_qext_stat(struct bnxt_qplib_rcfw *rcfw, u32 fid, req.resp_size = sbuf.size / BNXT_QPLIB_CMDQE_UNITS; req.resp_addr = cpu_to_le64(sbuf.dma_addr); - req.function_id = cpu_to_le32(fid); + if (bnxt_qplib_is_chip_gen_p7(rcfw->res->cctx) && rcfw->res->is_vf) + req.function_id = + cpu_to_le32(CMDQ_QUERY_ROCE_STATS_EXT_VF_VALID | + (fid << CMDQ_QUERY_ROCE_STATS_EXT_VF_NUM_SFT)); + else + req.function_id = cpu_to_le32(fid); req.flags = cpu_to_le16(CMDQ_QUERY_ROCE_STATS_EXT_FLAGS_FUNCTION_ID); bnxt_qplib_fill_cmdqmsg(&msg, &req, &resp, &sbuf, sizeof(req), diff --git a/drivers/infiniband/hw/hfi1/mad.h b/drivers/infiniband/hw/hfi1/mad.h index b6e3141253c4..d6dde762921a 100644 --- a/drivers/infiniband/hw/hfi1/mad.h +++ b/drivers/infiniband/hw/hfi1/mad.h @@ -124,7 +124,6 @@ struct opa_mad_notice_attr { } __packed ntc_2048; }; - u8 class_data[]; }; #define IB_VLARB_LOWPRI_0_31 1 diff --git a/drivers/infiniband/hw/hfi1/pio.c b/drivers/infiniband/hw/hfi1/pio.c index 5a91cbda4aee..764286da2ce8 100644 --- a/drivers/infiniband/hw/hfi1/pio.c +++ b/drivers/infiniband/hw/hfi1/pio.c @@ -1361,16 +1361,6 @@ void sc_flush(struct send_context *sc) sc_wait_for_packet_egress(sc, 1); } -/* drop all packets on the context, no waiting until they are sent */ -void sc_drop(struct send_context *sc) -{ - if (!sc) - return; - - dd_dev_info(sc->dd, "%s: context %u(%u) - not implemented\n", - __func__, sc->sw_index, sc->hw_context); -} - /* * Start the software reaction to a context halt or SPC freeze: * - mark the context as halted or frozen diff --git a/drivers/infiniband/hw/hfi1/pio.h b/drivers/infiniband/hw/hfi1/pio.h index d07cc6ea7c63..ab0f9a3a8d12 100644 --- a/drivers/infiniband/hw/hfi1/pio.h +++ b/drivers/infiniband/hw/hfi1/pio.h @@ -246,7 +246,6 @@ void sc_disable(struct send_context *sc); int sc_restart(struct send_context *sc); void sc_return_credits(struct send_context *sc); void sc_flush(struct send_context *sc); -void sc_drop(struct send_context *sc); void sc_stop(struct send_context *sc, int bit); struct pio_buf *sc_buffer_alloc(struct send_context *sc, u32 dw_len, pio_release_cb cb, void *arg); diff --git a/drivers/infiniband/hw/hfi1/sdma.c b/drivers/infiniband/hw/hfi1/sdma.c index 0d2b39b7c8b5..16a749d16ee9 100644 --- a/drivers/infiniband/hw/hfi1/sdma.c +++ b/drivers/infiniband/hw/hfi1/sdma.c @@ -1521,24 +1521,6 @@ void sdma_all_running(struct hfi1_devdata *dd) } /** - * sdma_all_idle() - called when the link goes down - * @dd: hfi1_devdata - * - * This routine moves all engines to the idle state. 
- */ -void sdma_all_idle(struct hfi1_devdata *dd) -{ - struct sdma_engine *sde; - unsigned int i; - - /* idle all engines */ - for (i = 0; i < dd->num_sdma; ++i) { - sde = &dd->per_sdma[i]; - sdma_process_event(sde, sdma_event_e70_go_idle); - } -} - -/** * sdma_start() - called to kick off state processing for all engines * @dd: hfi1_devdata * diff --git a/drivers/infiniband/hw/hfi1/sdma.h b/drivers/infiniband/hw/hfi1/sdma.h index d77246b48434..91dfd5d0c419 100644 --- a/drivers/infiniband/hw/hfi1/sdma.h +++ b/drivers/infiniband/hw/hfi1/sdma.h @@ -373,7 +373,6 @@ void sdma_start(struct hfi1_devdata *dd); void sdma_exit(struct hfi1_devdata *dd); void sdma_clean(struct hfi1_devdata *dd, size_t num_engines); void sdma_all_running(struct hfi1_devdata *dd); -void sdma_all_idle(struct hfi1_devdata *dd); void sdma_freeze_notify(struct hfi1_devdata *dd, int go_idle); void sdma_freeze(struct hfi1_devdata *dd); void sdma_unfreeze(struct hfi1_devdata *dd); diff --git a/drivers/infiniband/hw/hfi1/user_exp_rcv.c b/drivers/infiniband/hw/hfi1/user_exp_rcv.c index cf2d29098406..62b4f16dab27 100644 --- a/drivers/infiniband/hw/hfi1/user_exp_rcv.c +++ b/drivers/infiniband/hw/hfi1/user_exp_rcv.c @@ -53,7 +53,7 @@ int hfi1_user_exp_rcv_init(struct hfi1_filedata *fd, int ret = 0; fd->entry_to_rb = kcalloc(uctxt->expected_count, - sizeof(struct rb_node *), + sizeof(*fd->entry_to_rb), GFP_KERNEL); if (!fd->entry_to_rb) return -ENOMEM; diff --git a/drivers/infiniband/hw/hns/Makefile b/drivers/infiniband/hw/hns/Makefile index 7917af8e6380..baf592e6f21b 100644 --- a/drivers/infiniband/hw/hns/Makefile +++ b/drivers/infiniband/hw/hns/Makefile @@ -4,6 +4,7 @@ # ccflags-y := -I $(srctree)/drivers/net/ethernet/hisilicon/hns3 +ccflags-y += -I $(src) hns-roce-hw-v2-objs := hns_roce_main.o hns_roce_cmd.o hns_roce_pd.o \ hns_roce_ah.o hns_roce_hem.o hns_roce_mr.o hns_roce_qp.o \ diff --git a/drivers/infiniband/hw/hns/hns_roce_ah.c b/drivers/infiniband/hw/hns/hns_roce_ah.c index 4fc5b9d5fea8..307c35888b30 100644 --- a/drivers/infiniband/hw/hns/hns_roce_ah.c +++ b/drivers/infiniband/hw/hns/hns_roce_ah.c @@ -33,7 +33,6 @@ #include <linux/pci.h> #include <rdma/ib_addr.h> #include <rdma/ib_cache.h> -#include "hnae3.h" #include "hns_roce_device.h" #include "hns_roce_hw_v2.h" diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h index 560a1d9de408..1dcc9cbb4678 100644 --- a/drivers/infiniband/hw/hns/hns_roce_device.h +++ b/drivers/infiniband/hw/hns/hns_roce_device.h @@ -1027,6 +1027,26 @@ struct hns_roce_dev { atomic64_t *dfx_cnt; }; +enum hns_roce_trace_type { + TRACE_SQ, + TRACE_RQ, + TRACE_SRQ, +}; + +static inline const char *trace_type_to_str(enum hns_roce_trace_type type) +{ + switch (type) { + case TRACE_SQ: + return "SQ"; + case TRACE_RQ: + return "RQ"; + case TRACE_SRQ: + return "SRQ"; + default: + return "UNKNOWN"; + } +} + static inline struct hns_roce_dev *to_hr_dev(struct ib_device *ib_dev) { return container_of(ib_dev, struct hns_roce_dev, ib_dev); diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index 160e8927d364..fa8747656f25 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -43,13 +43,15 @@ #include <rdma/ib_umem.h> #include <rdma/uverbs_ioctl.h> -#include "hnae3.h" #include "hns_roce_common.h" #include "hns_roce_device.h" #include "hns_roce_cmd.h" #include "hns_roce_hem.h" #include "hns_roce_hw_v2.h" +#define CREATE_TRACE_POINTS +#include "hns_roce_trace.h" 
+ enum { CMD_RST_PRC_OTHERS, CMD_RST_PRC_SUCCESS, @@ -738,6 +740,8 @@ static int hns_roce_v2_post_send(struct ib_qp *ibqp, else ret = set_ud_wqe(qp, wr, wqe, &sge_idx, owner_bit); + trace_hns_sq_wqe(qp->qpn, wqe_idx, wqe, 1 << qp->sq.wqe_shift, + wr->wr_id, TRACE_SQ); if (unlikely(ret)) { *bad_wr = wr; goto out; @@ -807,6 +811,9 @@ static void fill_rq_wqe(struct hns_roce_qp *hr_qp, const struct ib_recv_wr *wr, wqe = hns_roce_get_recv_wqe(hr_qp, wqe_idx); fill_recv_sge_to_wqe(wr, wqe, max_sge, hr_qp->rq.rsv_sge); + + trace_hns_rq_wqe(hr_qp->qpn, wqe_idx, wqe, 1 << hr_qp->rq.wqe_shift, + wr->wr_id, TRACE_RQ); } static int hns_roce_v2_post_recv(struct ib_qp *ibqp, @@ -943,7 +950,7 @@ static void fill_wqe_idx(struct hns_roce_srq *srq, unsigned int wqe_idx) static void update_srq_db(struct hns_roce_srq *srq) { struct hns_roce_dev *hr_dev = to_hr_dev(srq->ibsrq.device); - struct hns_roce_v2_db db; + struct hns_roce_v2_db db = {}; hr_reg_write(&db, DB_TAG, srq->srqn); hr_reg_write(&db, DB_CMD, HNS_ROCE_V2_SRQ_DB); @@ -984,6 +991,9 @@ static int hns_roce_v2_post_srq_recv(struct ib_srq *ibsrq, fill_recv_sge_to_wqe(wr, wqe, max_sge, srq->rsv_sge); fill_wqe_idx(srq, wqe_idx); srq->wrid[wqe_idx] = wr->wr_id; + + trace_hns_srq_wqe(srq->srqn, wqe_idx, wqe, 1 << srq->wqe_shift, + wr->wr_id, TRACE_SRQ); } if (likely(nreq)) { @@ -1311,6 +1321,8 @@ static int __hns_roce_cmq_send_one(struct hns_roce_dev *hr_dev, tail = csq->head; for (i = 0; i < num; i++) { + trace_hns_cmdq_req(hr_dev, &desc[i]); + csq->desc[csq->head++] = desc[i]; if (csq->head == csq->desc_num) csq->head = 0; @@ -1325,6 +1337,8 @@ static int __hns_roce_cmq_send_one(struct hns_roce_dev *hr_dev, if (hns_roce_cmq_csq_done(hr_dev)) { ret = 0; for (i = 0; i < num; i++) { + trace_hns_cmdq_resp(hr_dev, &csq->desc[tail]); + /* check the result of hardware write back */ desc_ret = le16_to_cpu(csq->desc[tail++].retval); if (tail == csq->desc_num) @@ -4302,8 +4316,7 @@ static inline int get_pdn(struct ib_pd *ib_pd) } static void modify_qp_reset_to_init(struct ib_qp *ibqp, - struct hns_roce_v2_qp_context *context, - struct hns_roce_v2_qp_context *qpc_mask) + struct hns_roce_v2_qp_context *context) { struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device); struct hns_roce_qp *hr_qp = to_hr_qp(ibqp); @@ -5122,7 +5135,7 @@ static int hns_roce_v2_set_abs_fields(struct ib_qp *ibqp, if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) { memset(qpc_mask, 0, hr_dev->caps.qpc_sz); - modify_qp_reset_to_init(ibqp, context, qpc_mask); + modify_qp_reset_to_init(ibqp, context); } else if (cur_state == IB_QPS_INIT && new_state == IB_QPS_INIT) { modify_qp_init_to_init(ibqp, context, qpc_mask); } else if (cur_state == IB_QPS_INIT && new_state == IB_QPS_RTR) { @@ -5313,6 +5326,7 @@ static void v2_set_flushed_fields(struct ib_qp *ibqp, return; spin_lock_irqsave(&hr_qp->sq.lock, sq_flag); + trace_hns_sq_flush_cqe(hr_qp->qpn, hr_qp->sq.head, TRACE_SQ); hr_reg_write(context, QPC_SQ_PRODUCER_IDX, hr_qp->sq.head); hr_reg_clear(qpc_mask, QPC_SQ_PRODUCER_IDX); hr_qp->state = IB_QPS_ERR; @@ -5322,6 +5336,7 @@ static void v2_set_flushed_fields(struct ib_qp *ibqp, return; spin_lock_irqsave(&hr_qp->rq.lock, rq_flag); + trace_hns_rq_flush_cqe(hr_qp->qpn, hr_qp->rq.head, TRACE_RQ); hr_reg_write(context, QPC_RQ_PRODUCER_IDX, hr_qp->rq.head); hr_reg_clear(qpc_mask, QPC_RQ_PRODUCER_IDX); spin_unlock_irqrestore(&hr_qp->rq.lock, rq_flag); @@ -6248,6 +6263,7 @@ static irqreturn_t hns_roce_v2_aeq_int(struct hns_roce_dev *hr_dev, eq->sub_type = sub_type; ++eq->cons_index; aeqe_found = 
IRQ_HANDLED; + trace_hns_ae_info(event_type, aeqe, eq->eqe_size); atomic64_inc(&hr_dev->dfx_cnt[HNS_ROCE_DFX_AEQE_CNT]); diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h index 91a5665465ff..bc7466830eaf 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h @@ -34,6 +34,7 @@ #define _HNS_ROCE_HW_V2_H #include <linux/bitops.h> +#include "hnae3.h" #define HNS_ROCE_V2_MAX_RC_INL_INN_SZ 32 #define HNS_ROCE_V2_MTT_ENTRY_SZ 64 diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c index 8d0b63d4b50a..e7a497cc125c 100644 --- a/drivers/infiniband/hw/hns/hns_roce_main.c +++ b/drivers/infiniband/hw/hns/hns_roce_main.c @@ -37,7 +37,6 @@ #include <rdma/ib_smi.h> #include <rdma/ib_user_verbs.h> #include <rdma/ib_cache.h> -#include "hnae3.h" #include "hns_roce_common.h" #include "hns_roce_device.h" #include "hns_roce_hem.h" diff --git a/drivers/infiniband/hw/hns/hns_roce_mr.c b/drivers/infiniband/hw/hns/hns_roce_mr.c index 09da3496843b..93a48b41955b 100644 --- a/drivers/infiniband/hw/hns/hns_roce_mr.c +++ b/drivers/infiniband/hw/hns/hns_roce_mr.c @@ -38,6 +38,7 @@ #include "hns_roce_device.h" #include "hns_roce_cmd.h" #include "hns_roce_hem.h" +#include "hns_roce_trace.h" static u32 hw_index_to_key(int ind) { @@ -159,6 +160,7 @@ static int hns_roce_mr_enable(struct hns_roce_dev *hr_dev, if (IS_ERR(mailbox)) return PTR_ERR(mailbox); + trace_hns_mr(mr); if (mr->type != MR_TYPE_FRMR) ret = hr_dev->hw->write_mtpt(hr_dev, mailbox->buf, mr); else @@ -1146,6 +1148,7 @@ int hns_roce_mtr_create(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr, struct ib_device *ibdev = &hr_dev->ib_dev; int ret; + trace_hns_buf_attr(buf_attr); /* The caller has its own buffer list and invokes the hns_roce_mtr_map() * to finish the MTT configuration. */ diff --git a/drivers/infiniband/hw/hns/hns_roce_restrack.c b/drivers/infiniband/hw/hns/hns_roce_restrack.c index 356d98816949..f637b73b946e 100644 --- a/drivers/infiniband/hw/hns/hns_roce_restrack.c +++ b/drivers/infiniband/hw/hns/hns_roce_restrack.c @@ -4,7 +4,6 @@ #include <rdma/rdma_cm.h> #include <rdma/restrack.h> #include <uapi/rdma/rdma_netlink.h> -#include "hnae3.h" #include "hns_roce_common.h" #include "hns_roce_device.h" #include "hns_roce_hw_v2.h" diff --git a/drivers/infiniband/hw/hns/hns_roce_trace.h b/drivers/infiniband/hw/hns/hns_roce_trace.h new file mode 100644 index 000000000000..59ceb591b3a1 --- /dev/null +++ b/drivers/infiniband/hw/hns/hns_roce_trace.h @@ -0,0 +1,216 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +/* + * Copyright (c) 2025 Hisilicon Limited. 
+ */ + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM hns_roce + +#if !defined(__HNS_ROCE_TRACE_H) || defined(TRACE_HEADER_MULTI_READ) +#define __HNS_ROCE_TRACE_H + +#include <linux/tracepoint.h> +#include <linux/string_choices.h> +#include "hns_roce_device.h" +#include "hns_roce_hw_v2.h" + +DECLARE_EVENT_CLASS(flush_head_template, + TP_PROTO(unsigned long qpn, u32 pi, + enum hns_roce_trace_type type), + TP_ARGS(qpn, pi, type), + + TP_STRUCT__entry(__field(unsigned long, qpn) + __field(u32, pi) + __field(enum hns_roce_trace_type, type) + ), + + TP_fast_assign(__entry->qpn = qpn; + __entry->pi = pi; + __entry->type = type; + ), + + TP_printk("%s 0x%lx flush head 0x%x.", + trace_type_to_str(__entry->type), + __entry->qpn, __entry->pi) +); + +DEFINE_EVENT(flush_head_template, hns_sq_flush_cqe, + TP_PROTO(unsigned long qpn, u32 pi, + enum hns_roce_trace_type type), + TP_ARGS(qpn, pi, type)); +DEFINE_EVENT(flush_head_template, hns_rq_flush_cqe, + TP_PROTO(unsigned long qpn, u32 pi, + enum hns_roce_trace_type type), + TP_ARGS(qpn, pi, type)); + +#define MAX_SGE_PER_WQE 64 +#define MAX_WQE_SIZE (MAX_SGE_PER_WQE * HNS_ROCE_SGE_SIZE) +DECLARE_EVENT_CLASS(wqe_template, + TP_PROTO(unsigned long qpn, u32 idx, void *wqe, u32 len, + u64 id, enum hns_roce_trace_type type), + TP_ARGS(qpn, idx, wqe, len, id, type), + + TP_STRUCT__entry(__field(unsigned long, qpn) + __field(u32, idx) + __array(u32, wqe, + MAX_WQE_SIZE / sizeof(__le32)) + __field(u32, len) + __field(u64, id) + __field(enum hns_roce_trace_type, type) + ), + + TP_fast_assign(__entry->qpn = qpn; + __entry->idx = idx; + __entry->id = id; + __entry->len = len / sizeof(__le32); + __entry->type = type; + for (int i = 0; i < __entry->len; i++) + __entry->wqe[i] = le32_to_cpu(((__le32 *)wqe)[i]); + ), + + TP_printk("%s 0x%lx wqe(0x%x/0x%llx): %s", + trace_type_to_str(__entry->type), + __entry->qpn, __entry->idx, __entry->id, + __print_array(__entry->wqe, __entry->len, + sizeof(__le32))) +); + +DEFINE_EVENT(wqe_template, hns_sq_wqe, + TP_PROTO(unsigned long qpn, u32 idx, void *wqe, u32 len, u64 id, + enum hns_roce_trace_type type), + TP_ARGS(qpn, idx, wqe, len, id, type)); +DEFINE_EVENT(wqe_template, hns_rq_wqe, + TP_PROTO(unsigned long qpn, u32 idx, void *wqe, u32 len, u64 id, + enum hns_roce_trace_type type), + TP_ARGS(qpn, idx, wqe, len, id, type)); +DEFINE_EVENT(wqe_template, hns_srq_wqe, + TP_PROTO(unsigned long qpn, u32 idx, void *wqe, u32 len, u64 id, + enum hns_roce_trace_type type), + TP_ARGS(qpn, idx, wqe, len, id, type)); + +TRACE_EVENT(hns_ae_info, + TP_PROTO(int event_type, void *aeqe, unsigned int len), + TP_ARGS(event_type, aeqe, len), + + TP_STRUCT__entry(__field(int, event_type) + __array(u32, aeqe, + HNS_ROCE_V3_EQE_SIZE / sizeof(__le32)) + __field(u32, len) + ), + + TP_fast_assign(__entry->event_type = event_type; + __entry->len = len / sizeof(__le32); + for (int i = 0; i < __entry->len; i++) + __entry->aeqe[i] = le32_to_cpu(((__le32 *)aeqe)[i]); + ), + + TP_printk("event %2d aeqe: %s", __entry->event_type, + __print_array(__entry->aeqe, __entry->len, sizeof(__le32))) +); + +TRACE_EVENT(hns_mr, + TP_PROTO(struct hns_roce_mr *mr), + TP_ARGS(mr), + + TP_STRUCT__entry(__field(u64, iova) + __field(u64, size) + __field(u32, key) + __field(u32, pd) + __field(u32, pbl_hop_num) + __field(u32, npages) + __field(int, type) + __field(int, enabled) + ), + + TP_fast_assign(__entry->iova = mr->iova; + __entry->size = mr->size; + __entry->key = mr->key; + __entry->pd = mr->pd; + __entry->pbl_hop_num = mr->pbl_hop_num; + __entry->npages = mr->npages; + 
__entry->type = mr->type; + __entry->enabled = mr->enabled; + ), + + TP_printk("iova:0x%llx, size:%llu, key:%u, pd:%u, pbl_hop:%u, npages:%u, type:%d, status:%d", + __entry->iova, __entry->size, __entry->key, + __entry->pd, __entry->pbl_hop_num, __entry->npages, + __entry->type, __entry->enabled) +); + +TRACE_EVENT(hns_buf_attr, + TP_PROTO(struct hns_roce_buf_attr *attr), + TP_ARGS(attr), + + TP_STRUCT__entry(__field(unsigned int, region_count) + __field(unsigned int, region0_size) + __field(int, region0_hopnum) + __field(unsigned int, region1_size) + __field(int, region1_hopnum) + __field(unsigned int, region2_size) + __field(int, region2_hopnum) + __field(unsigned int, page_shift) + __field(bool, mtt_only) + ), + + TP_fast_assign(__entry->region_count = attr->region_count; + __entry->region0_size = attr->region[0].size; + __entry->region0_hopnum = attr->region[0].hopnum; + __entry->region1_size = attr->region[1].size; + __entry->region1_hopnum = attr->region[1].hopnum; + __entry->region2_size = attr->region[2].size; + __entry->region2_hopnum = attr->region[2].hopnum; + __entry->page_shift = attr->page_shift; + __entry->mtt_only = attr->mtt_only; + ), + + TP_printk("rg cnt:%u, pg_sft:0x%x, mtt_only:%s, rg 0 (sz:%u, hop:%u), rg 1 (sz:%u, hop:%u), rg 2 (sz:%u, hop:%u)\n", + __entry->region_count, __entry->page_shift, + str_yes_no(__entry->mtt_only), + __entry->region0_size, __entry->region0_hopnum, + __entry->region1_size, __entry->region1_hopnum, + __entry->region2_size, __entry->region2_hopnum) +); + +DECLARE_EVENT_CLASS(cmdq, + TP_PROTO(struct hns_roce_dev *hr_dev, + struct hns_roce_cmq_desc *desc), + TP_ARGS(hr_dev, desc), + + TP_STRUCT__entry(__string(dev_name, dev_name(hr_dev->dev)) + __field(u16, opcode) + __field(u16, flag) + __field(u16, retval) + __array(u32, data, 6) + ), + + TP_fast_assign(__assign_str(dev_name); + __entry->opcode = le16_to_cpu(desc->opcode); + __entry->flag = le16_to_cpu(desc->flag); + __entry->retval = le16_to_cpu(desc->retval); + for (int i = 0; i < 6; i++) + __entry->data[i] = le32_to_cpu(desc->data[i]); + ), + + TP_printk("%s cmdq opcode:0x%x, flag:0x%x, retval:0x%x, data:%s\n", + __get_str(dev_name), __entry->opcode, + __entry->flag, __entry->retval, + __print_array(__entry->data, 6, sizeof(__le32))) +); + +DEFINE_EVENT(cmdq, hns_cmdq_req, + TP_PROTO(struct hns_roce_dev *hr_dev, + struct hns_roce_cmq_desc *desc), + TP_ARGS(hr_dev, desc)); +DEFINE_EVENT(cmdq, hns_cmdq_resp, + TP_PROTO(struct hns_roce_dev *hr_dev, + struct hns_roce_cmq_desc *desc), + TP_ARGS(hr_dev, desc)); + +#endif /* __HNS_ROCE_TRACE_H */ + +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_FILE hns_roce_trace +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . 
+#include <trace/define_trace.h> diff --git a/drivers/infiniband/hw/irdma/ctrl.c b/drivers/infiniband/hw/irdma/ctrl.c index 6aed6169c07d..99a7f1a6c0b5 100644 --- a/drivers/infiniband/hw/irdma/ctrl.c +++ b/drivers/infiniband/hw/irdma/ctrl.c @@ -3131,7 +3131,7 @@ int irdma_sc_cqp_init(struct irdma_sc_cqp *cqp, writel(0, cqp->dev->hw_regs[IRDMA_CCQPSTATUS]); ibdev_dbg(to_ibdev(cqp->dev), - "WQE: sq_size[%04d] hw_sq_size[%04d] sq_base[%p] sq_pa[%pK] cqp[%p] polarity[x%04x]\n", + "WQE: sq_size[%04d] hw_sq_size[%04d] sq_base[%p] sq_pa[%p] cqp[%p] polarity[x%04x]\n", cqp->sq_size, cqp->hw_sq_size, cqp->sq_base, (u64 *)(uintptr_t)cqp->sq_pa, cqp, cqp->polarity); return 0; diff --git a/drivers/infiniband/hw/irdma/main.c b/drivers/infiniband/hw/irdma/main.c index 7599e31b5743..1e840bbd619d 100644 --- a/drivers/infiniband/hw/irdma/main.c +++ b/drivers/infiniband/hw/irdma/main.c @@ -1,10 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB /* Copyright (c) 2015 - 2021 Intel Corporation */ #include "main.h" -#include "../../../net/ethernet/intel/ice/ice.h" MODULE_ALIAS("i40iw"); -MODULE_AUTHOR("Intel Corporation, <e1000-rdma@lists.sourceforge.net>"); MODULE_DESCRIPTION("Intel(R) Ethernet Protocol Driver for RDMA"); MODULE_LICENSE("Dual BSD/GPL"); @@ -61,7 +59,7 @@ static void irdma_log_invalid_mtu(u16 mtu, struct irdma_sc_dev *dev) } static void irdma_fill_qos_info(struct irdma_l2params *l2params, - struct iidc_qos_params *qos_info) + struct iidc_rdma_qos_params *qos_info) { int i; @@ -85,12 +83,13 @@ static void irdma_fill_qos_info(struct irdma_l2params *l2params, } } -static void irdma_iidc_event_handler(struct ice_pf *pf, struct iidc_event *event) +static void irdma_iidc_event_handler(struct iidc_rdma_core_dev_info *cdev_info, + struct iidc_rdma_event *event) { - struct irdma_device *iwdev = dev_get_drvdata(&pf->adev->dev); + struct irdma_device *iwdev = dev_get_drvdata(&cdev_info->adev->dev); struct irdma_l2params l2params = {}; - if (*event->type & BIT(IIDC_EVENT_AFTER_MTU_CHANGE)) { + if (*event->type & BIT(IIDC_RDMA_EVENT_AFTER_MTU_CHANGE)) { ibdev_dbg(&iwdev->ibdev, "CLNT: new MTU = %d\n", iwdev->netdev->mtu); if (iwdev->vsi.mtu != iwdev->netdev->mtu) { l2params.mtu = iwdev->netdev->mtu; @@ -98,25 +97,26 @@ static void irdma_iidc_event_handler(struct ice_pf *pf, struct iidc_event *event irdma_log_invalid_mtu(l2params.mtu, &iwdev->rf->sc_dev); irdma_change_l2params(&iwdev->vsi, &l2params); } - } else if (*event->type & BIT(IIDC_EVENT_BEFORE_TC_CHANGE)) { + } else if (*event->type & BIT(IIDC_RDMA_EVENT_BEFORE_TC_CHANGE)) { if (iwdev->vsi.tc_change_pending) return; irdma_prep_tc_change(iwdev); - } else if (*event->type & BIT(IIDC_EVENT_AFTER_TC_CHANGE)) { - struct iidc_qos_params qos_info = {}; + } else if (*event->type & BIT(IIDC_RDMA_EVENT_AFTER_TC_CHANGE)) { + struct iidc_rdma_priv_dev_info *iidc_priv = cdev_info->iidc_priv; if (!iwdev->vsi.tc_change_pending) return; l2params.tc_changed = true; ibdev_dbg(&iwdev->ibdev, "CLNT: TC Change\n"); - ice_get_qos_params(pf, &qos_info); - irdma_fill_qos_info(&l2params, &qos_info); + + irdma_fill_qos_info(&l2params, &iidc_priv->qos_info); if (iwdev->rf->protocol_used != IRDMA_IWARP_PROTOCOL_ONLY) - iwdev->dcb_vlan_mode = qos_info.num_tc > 1 && !l2params.dscp_mode; + iwdev->dcb_vlan_mode = + l2params.num_tc > 1 && !l2params.dscp_mode; irdma_change_l2params(&iwdev->vsi, &l2params); - } else if (*event->type & BIT(IIDC_EVENT_CRIT_ERR)) { + } else if (*event->type & BIT(IIDC_RDMA_EVENT_CRIT_ERR)) { ibdev_warn(&iwdev->ibdev, "ICE OICR event 
notification: oicr = 0x%08x\n", event->reg); if (event->reg & IRDMAPFINT_OICR_PE_CRITERR_M) { @@ -151,10 +151,8 @@ static void irdma_iidc_event_handler(struct ice_pf *pf, struct iidc_event *event */ static void irdma_request_reset(struct irdma_pci_f *rf) { - struct ice_pf *pf = rf->cdev; - ibdev_warn(&rf->iwdev->ibdev, "Requesting a reset\n"); - ice_rdma_request_reset(pf, IIDC_PFR); + ice_rdma_request_reset(rf->cdev, IIDC_FUNC_RESET); } /** @@ -166,14 +164,15 @@ static int irdma_lan_register_qset(struct irdma_sc_vsi *vsi, struct irdma_ws_node *tc_node) { struct irdma_device *iwdev = vsi->back_vsi; - struct ice_pf *pf = iwdev->rf->cdev; + struct iidc_rdma_core_dev_info *cdev_info; struct iidc_rdma_qset_params qset = {}; int ret; + cdev_info = iwdev->rf->cdev; qset.qs_handle = tc_node->qs_handle; qset.tc = tc_node->traffic_class; qset.vport_id = vsi->vsi_idx; - ret = ice_add_rdma_qset(pf, &qset); + ret = ice_add_rdma_qset(cdev_info, &qset); if (ret) { ibdev_dbg(&iwdev->ibdev, "WS: LAN alloc_res for rdma qset failed.\n"); return ret; @@ -194,19 +193,20 @@ static void irdma_lan_unregister_qset(struct irdma_sc_vsi *vsi, struct irdma_ws_node *tc_node) { struct irdma_device *iwdev = vsi->back_vsi; - struct ice_pf *pf = iwdev->rf->cdev; + struct iidc_rdma_core_dev_info *cdev_info; struct iidc_rdma_qset_params qset = {}; + cdev_info = iwdev->rf->cdev; qset.qs_handle = tc_node->qs_handle; qset.tc = tc_node->traffic_class; qset.vport_id = vsi->vsi_idx; qset.teid = tc_node->l2_sched_node_id; - if (ice_del_rdma_qset(pf, &qset)) + if (ice_del_rdma_qset(cdev_info, &qset)) ibdev_dbg(&iwdev->ibdev, "WS: LAN free_res for rdma qset failed.\n"); } -static int irdma_init_interrupts(struct irdma_pci_f *rf, struct ice_pf *pf) +static int irdma_init_interrupts(struct irdma_pci_f *rf, struct iidc_rdma_core_dev_info *cdev) { int i; @@ -217,12 +217,12 @@ static int irdma_init_interrupts(struct irdma_pci_f *rf, struct ice_pf *pf) return -ENOMEM; for (i = 0; i < rf->msix_count; i++) - if (ice_alloc_rdma_qvector(pf, &rf->msix_entries[i])) + if (ice_alloc_rdma_qvector(cdev, &rf->msix_entries[i])) break; if (i < IRDMA_MIN_MSIX) { while (--i >= 0) - ice_free_rdma_qvector(pf, &rf->msix_entries[i]); + ice_free_rdma_qvector(cdev, &rf->msix_entries[i]); kfree(rf->msix_entries); return -ENOMEM; @@ -233,56 +233,65 @@ static int irdma_init_interrupts(struct irdma_pci_f *rf, struct ice_pf *pf) return 0; } -static void irdma_deinit_interrupts(struct irdma_pci_f *rf, struct ice_pf *pf) +static void irdma_deinit_interrupts(struct irdma_pci_f *rf, struct iidc_rdma_core_dev_info *cdev) { int i; for (i = 0; i < rf->msix_count; i++) - ice_free_rdma_qvector(pf, &rf->msix_entries[i]); + ice_free_rdma_qvector(cdev, &rf->msix_entries[i]); kfree(rf->msix_entries); } static void irdma_remove(struct auxiliary_device *aux_dev) { - struct iidc_auxiliary_dev *iidc_adev = container_of(aux_dev, - struct iidc_auxiliary_dev, - adev); - struct ice_pf *pf = iidc_adev->pf; struct irdma_device *iwdev = auxiliary_get_drvdata(aux_dev); + struct iidc_rdma_core_auxiliary_dev *iidc_adev; + struct iidc_rdma_core_dev_info *cdev_info; + + iidc_adev = container_of(aux_dev, struct iidc_rdma_core_auxiliary_dev, adev); + cdev_info = iidc_adev->cdev_info; + ice_rdma_update_vsi_filter(cdev_info, iwdev->vsi_num, false); irdma_ib_unregister_device(iwdev); - ice_rdma_update_vsi_filter(pf, iwdev->vsi_num, false); - irdma_deinit_interrupts(iwdev->rf, pf); + irdma_deinit_interrupts(iwdev->rf, cdev_info); kfree(iwdev->rf); - pr_debug("INIT: Gen2 PF[%d] device remove 
success\n", PCI_FUNC(pf->pdev->devfn)); + pr_debug("INIT: Gen2 PF[%d] device remove success\n", PCI_FUNC(cdev_info->pdev->devfn)); } -static void irdma_fill_device_info(struct irdma_device *iwdev, struct ice_pf *pf, - struct ice_vsi *vsi) +static void irdma_fill_device_info(struct irdma_device *iwdev, + struct iidc_rdma_core_dev_info *cdev_info) { + struct iidc_rdma_priv_dev_info *iidc_priv = cdev_info->iidc_priv; struct irdma_pci_f *rf = iwdev->rf; - rf->cdev = pf; + rf->sc_dev.hw = &rf->hw; + rf->iwdev = iwdev; + rf->cdev = cdev_info; + rf->hw.hw_addr = iidc_priv->hw_addr; + rf->pcidev = cdev_info->pdev; + rf->hw.device = &rf->pcidev->dev; + rf->pf_id = iidc_priv->pf_id; rf->gen_ops.register_qset = irdma_lan_register_qset; rf->gen_ops.unregister_qset = irdma_lan_unregister_qset; - rf->hw.hw_addr = pf->hw.hw_addr; - rf->pcidev = pf->pdev; - rf->pf_id = pf->hw.pf_id; - rf->default_vsi.vsi_idx = vsi->vsi_num; - rf->protocol_used = pf->rdma_mode & IIDC_RDMA_PROTOCOL_ROCEV2 ? - IRDMA_ROCE_PROTOCOL_ONLY : IRDMA_IWARP_PROTOCOL_ONLY; + + rf->default_vsi.vsi_idx = iidc_priv->vport_id; + rf->protocol_used = + cdev_info->rdma_protocol == IIDC_RDMA_PROTOCOL_ROCEV2 ? + IRDMA_ROCE_PROTOCOL_ONLY : IRDMA_IWARP_PROTOCOL_ONLY; rf->rdma_ver = IRDMA_GEN_2; rf->rsrc_profile = IRDMA_HMC_PROFILE_DEFAULT; rf->rst_to = IRDMA_RST_TIMEOUT_HZ; rf->gen_ops.request_reset = irdma_request_reset; rf->limits_sel = 7; rf->iwdev = iwdev; + mutex_init(&iwdev->ah_tbl_lock); - iwdev->netdev = vsi->netdev; - iwdev->vsi_num = vsi->vsi_num; + + iwdev->netdev = iidc_priv->netdev; + iwdev->vsi_num = iidc_priv->vport_id; iwdev->init_state = INITIAL_STATE; iwdev->roce_cwnd = IRDMA_ROCE_CWND_DEFAULT; iwdev->roce_ackcreds = IRDMA_ROCE_ACKCREDS_DEFAULT; @@ -294,19 +303,18 @@ static void irdma_fill_device_info(struct irdma_device *iwdev, struct ice_pf *pf static int irdma_probe(struct auxiliary_device *aux_dev, const struct auxiliary_device_id *id) { - struct iidc_auxiliary_dev *iidc_adev = container_of(aux_dev, - struct iidc_auxiliary_dev, - adev); - struct ice_pf *pf = iidc_adev->pf; - struct ice_vsi *vsi = ice_get_main_vsi(pf); - struct iidc_qos_params qos_info = {}; + struct iidc_rdma_core_auxiliary_dev *iidc_adev; + struct iidc_rdma_core_dev_info *cdev_info; + struct iidc_rdma_priv_dev_info *iidc_priv; + struct irdma_l2params l2params = {}; struct irdma_device *iwdev; struct irdma_pci_f *rf; - struct irdma_l2params l2params = {}; int err; - if (!vsi) - return -EIO; + iidc_adev = container_of(aux_dev, struct iidc_rdma_core_auxiliary_dev, adev); + cdev_info = iidc_adev->cdev_info; + iidc_priv = cdev_info->iidc_priv; + iwdev = ib_alloc_device(irdma_device, ibdev); if (!iwdev) return -ENOMEM; @@ -316,10 +324,10 @@ static int irdma_probe(struct auxiliary_device *aux_dev, const struct auxiliary_ return -ENOMEM; } - irdma_fill_device_info(iwdev, pf, vsi); + irdma_fill_device_info(iwdev, cdev_info); rf = iwdev->rf; - err = irdma_init_interrupts(rf, pf); + err = irdma_init_interrupts(rf, cdev_info); if (err) goto err_init_interrupts; @@ -328,8 +336,7 @@ static int irdma_probe(struct auxiliary_device *aux_dev, const struct auxiliary_ goto err_ctrl_init; l2params.mtu = iwdev->netdev->mtu; - ice_get_qos_params(pf, &qos_info); - irdma_fill_qos_info(&l2params, &qos_info); + irdma_fill_qos_info(&l2params, &iidc_priv->qos_info); if (iwdev->rf->protocol_used != IRDMA_IWARP_PROTOCOL_ONLY) iwdev->dcb_vlan_mode = l2params.num_tc > 1 && !l2params.dscp_mode; @@ -341,7 +348,7 @@ static int irdma_probe(struct auxiliary_device *aux_dev, const struct 
auxiliary_ if (err) goto err_ibreg; - ice_rdma_update_vsi_filter(pf, iwdev->vsi_num, true); + ice_rdma_update_vsi_filter(cdev_info, iwdev->vsi_num, true); ibdev_dbg(&iwdev->ibdev, "INIT: Gen2 PF[%d] device probe success\n", PCI_FUNC(rf->pcidev->devfn)); auxiliary_set_drvdata(aux_dev, iwdev); @@ -353,7 +360,7 @@ err_ibreg: err_rt_init: irdma_ctrl_deinit_hw(rf); err_ctrl_init: - irdma_deinit_interrupts(rf, pf); + irdma_deinit_interrupts(rf, cdev_info); err_init_interrupts: kfree(iwdev->rf); ib_dealloc_device(&iwdev->ibdev); @@ -369,7 +376,7 @@ static const struct auxiliary_device_id irdma_auxiliary_id_table[] = { MODULE_DEVICE_TABLE(auxiliary, irdma_auxiliary_id_table); -static struct iidc_auxiliary_drv irdma_auxiliary_drv = { +static struct iidc_rdma_core_auxiliary_drv irdma_auxiliary_drv = { .adrv = { .id_table = irdma_auxiliary_id_table, .probe = irdma_probe, diff --git a/drivers/infiniband/hw/irdma/main.h b/drivers/infiniband/hw/irdma/main.h index bb0b6494ccb2..674acc952168 100644 --- a/drivers/infiniband/hw/irdma/main.h +++ b/drivers/infiniband/hw/irdma/main.h @@ -29,7 +29,8 @@ #include <linux/io-64-nonatomic-lo-hi.h> #endif #include <linux/auxiliary_bus.h> -#include <linux/net/intel/iidc.h> +#include <linux/net/intel/iidc_rdma.h> +#include <linux/net/intel/iidc_rdma_ice.h> #include <rdma/ib_smi.h> #include <rdma/ib_verbs.h> #include <rdma/ib_pack.h> diff --git a/drivers/infiniband/hw/irdma/osdep.h b/drivers/infiniband/hw/irdma/osdep.h index 4b4f78288d12..3f73ceacccb6 100644 --- a/drivers/infiniband/hw/irdma/osdep.h +++ b/drivers/infiniband/hw/irdma/osdep.h @@ -5,8 +5,8 @@ #include <linux/pci.h> #include <linux/bitfield.h> -#include <linux/net/intel/iidc.h> #include <rdma/ib_verbs.h> +#include <net/dscp.h> #define STATS_TIMER_DELAY 60000 diff --git a/drivers/infiniband/hw/irdma/pble.c b/drivers/infiniband/hw/irdma/pble.c index e7ce6840755f..37ce35cb10e7 100644 --- a/drivers/infiniband/hw/irdma/pble.c +++ b/drivers/infiniband/hw/irdma/pble.c @@ -108,7 +108,7 @@ static int add_sd_direct(struct irdma_hmc_pble_rsrc *pble_rsrc, chunk->vaddr = sd_entry->u.bp.addr.va + offset; chunk->fpm_addr = pble_rsrc->next_fpm_addr; ibdev_dbg(to_ibdev(dev), - "PBLE: chunk_size[%lld] = 0x%llx vaddr=0x%pK fpm_addr = %llx\n", + "PBLE: chunk_size[%lld] = 0x%llx vaddr=0x%p fpm_addr = %llx\n", chunk->size, chunk->size, chunk->vaddr, chunk->fpm_addr); return 0; diff --git a/drivers/infiniband/hw/irdma/type.h b/drivers/infiniband/hw/irdma/type.h index 59b34afa867b..527c6da2c1ac 100644 --- a/drivers/infiniband/hw/irdma/type.h +++ b/drivers/infiniband/hw/irdma/type.h @@ -567,7 +567,7 @@ struct irdma_sc_vsi { u8 qos_rel_bw; u8 qos_prio_type; u8 stats_idx; - u8 dscp_map[IIDC_MAX_DSCP_MAPPING]; + u8 dscp_map[DSCP_MAX]; struct irdma_qos qos[IRDMA_MAX_USER_PRIORITY]; u64 hw_stats_regs[IRDMA_HW_STAT_INDEX_MAX_GEN_1]; bool dscp_mode:1; @@ -695,7 +695,7 @@ struct irdma_l2params { u16 qs_handle_list[IRDMA_MAX_USER_PRIORITY]; u16 mtu; u8 up2tc[IRDMA_MAX_USER_PRIORITY]; - u8 dscp_map[IIDC_MAX_DSCP_MAPPING]; + u8 dscp_map[DSCP_MAX]; u8 num_tc; u8 vsi_rel_bw; u8 vsi_prio_type; diff --git a/drivers/infiniband/hw/mana/cq.c b/drivers/infiniband/hw/mana/cq.c index 0fc4e2679218..28e154bbb50f 100644 --- a/drivers/infiniband/hw/mana/cq.c +++ b/drivers/infiniband/hw/mana/cq.c @@ -15,14 +15,12 @@ int mana_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, struct ib_device *ibdev = ibcq->device; struct mana_ib_create_cq ucmd = {}; struct mana_ib_dev *mdev; - struct gdma_context *gc; bool is_rnic_cq; u32 doorbell; u32 
buf_size; int err; mdev = container_of(ibdev, struct mana_ib_dev, ib_dev); - gc = mdev_to_gc(mdev); cq->comp_vector = attr->comp_vector % ibdev->num_comp_vectors; cq->cq_handle = INVALID_MANA_HANDLE; @@ -65,7 +63,7 @@ int mana_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, ibdev_dbg(ibdev, "Failed to create kernel queue for create cq, %d\n", err); return err; } - doorbell = gc->mana_ib.doorbell; + doorbell = mdev->gdma_dev->doorbell; } if (is_rnic_cq) { diff --git a/drivers/infiniband/hw/mana/device.c b/drivers/infiniband/hw/mana/device.c index b31089320aa5..165c0a1e67d1 100644 --- a/drivers/infiniband/hw/mana/device.c +++ b/drivers/infiniband/hw/mana/device.c @@ -101,103 +101,95 @@ static int mana_ib_probe(struct auxiliary_device *adev, const struct auxiliary_device_id *id) { struct mana_adev *madev = container_of(adev, struct mana_adev, adev); + struct gdma_context *gc = madev->mdev->gdma_context; + struct mana_context *mc = gc->mana.driver_data; struct gdma_dev *mdev = madev->mdev; struct net_device *ndev; - struct mana_context *mc; struct mana_ib_dev *dev; u8 mac_addr[ETH_ALEN]; int ret; - mc = mdev->driver_data; - dev = ib_alloc_device(mana_ib_dev, ib_dev); if (!dev) return -ENOMEM; ib_set_device_ops(&dev->ib_dev, &mana_ib_dev_ops); - - dev->ib_dev.phys_port_cnt = mc->num_ports; - - ibdev_dbg(&dev->ib_dev, "mdev=%p id=%d num_ports=%d\n", mdev, - mdev->dev_id.as_uint32, dev->ib_dev.phys_port_cnt); - dev->ib_dev.node_type = RDMA_NODE_IB_CA; - - /* - * num_comp_vectors needs to set to the max MSIX index - * when interrupts and event queues are implemented - */ - dev->ib_dev.num_comp_vectors = mdev->gdma_context->max_num_queues; - dev->ib_dev.dev.parent = mdev->gdma_context->dev; - - ndev = mana_get_primary_netdev(mc, 0, &dev->dev_tracker); - if (!ndev) { - ret = -ENODEV; - ibdev_err(&dev->ib_dev, "Failed to get netdev for IB port 1"); - goto free_ib_device; - } - ether_addr_copy(mac_addr, ndev->dev_addr); - addrconf_addr_eui48((u8 *)&dev->ib_dev.node_guid, ndev->dev_addr); - ret = ib_device_set_netdev(&dev->ib_dev, ndev, 1); - /* mana_get_primary_netdev() returns ndev with refcount held */ - netdev_put(ndev, &dev->dev_tracker); - if (ret) { - ibdev_err(&dev->ib_dev, "Failed to set ib netdev, ret %d", ret); - goto free_ib_device; - } - - ret = mana_gd_register_device(&mdev->gdma_context->mana_ib); - if (ret) { - ibdev_err(&dev->ib_dev, "Failed to register device, ret %d", - ret); - goto free_ib_device; - } - dev->gdma_dev = &mdev->gdma_context->mana_ib; - - dev->nb.notifier_call = mana_ib_netdev_event; - ret = register_netdevice_notifier(&dev->nb); - if (ret) { - ibdev_err(&dev->ib_dev, "Failed to register net notifier, %d", - ret); - goto deregister_device; - } - - ret = mana_ib_gd_query_adapter_caps(dev); - if (ret) { - ibdev_err(&dev->ib_dev, "Failed to query device caps, ret %d", - ret); - goto deregister_net_notifier; - } - - ib_set_device_ops(&dev->ib_dev, &mana_ib_stats_ops); - - ret = mana_ib_create_eqs(dev); - if (ret) { - ibdev_err(&dev->ib_dev, "Failed to create EQs, ret %d", ret); - goto deregister_net_notifier; - } - - ret = mana_ib_gd_create_rnic_adapter(dev); - if (ret) - goto destroy_eqs; - + dev->ib_dev.num_comp_vectors = gc->max_num_queues; + dev->ib_dev.dev.parent = gc->dev; + dev->gdma_dev = mdev; xa_init_flags(&dev->qp_table_wq, XA_FLAGS_LOCK_IRQ); - ret = mana_ib_gd_config_mac(dev, ADDR_OP_ADD, mac_addr); - if (ret) { - ibdev_err(&dev->ib_dev, "Failed to add Mac address, ret %d", - ret); - goto destroy_rnic; + + if (mana_ib_is_rnic(dev)) { + 
dev->ib_dev.phys_port_cnt = 1; + ndev = mana_get_primary_netdev(mc, 0, &dev->dev_tracker); + if (!ndev) { + ret = -ENODEV; + ibdev_err(&dev->ib_dev, "Failed to get netdev for IB port 1"); + goto free_ib_device; + } + ether_addr_copy(mac_addr, ndev->dev_addr); + addrconf_addr_eui48((u8 *)&dev->ib_dev.node_guid, ndev->dev_addr); + ret = ib_device_set_netdev(&dev->ib_dev, ndev, 1); + /* mana_get_primary_netdev() returns ndev with refcount held */ + netdev_put(ndev, &dev->dev_tracker); + if (ret) { + ibdev_err(&dev->ib_dev, "Failed to set ib netdev, ret %d", ret); + goto free_ib_device; + } + + dev->nb.notifier_call = mana_ib_netdev_event; + ret = register_netdevice_notifier(&dev->nb); + if (ret) { + ibdev_err(&dev->ib_dev, "Failed to register net notifier, %d", + ret); + goto free_ib_device; + } + + ret = mana_ib_gd_query_adapter_caps(dev); + if (ret) { + ibdev_err(&dev->ib_dev, "Failed to query device caps, ret %d", ret); + goto deregister_net_notifier; + } + + ib_set_device_ops(&dev->ib_dev, &mana_ib_stats_ops); + + ret = mana_ib_create_eqs(dev); + if (ret) { + ibdev_err(&dev->ib_dev, "Failed to create EQs, ret %d", ret); + goto deregister_net_notifier; + } + + ret = mana_ib_gd_create_rnic_adapter(dev); + if (ret) + goto destroy_eqs; + + ret = mana_ib_gd_config_mac(dev, ADDR_OP_ADD, mac_addr); + if (ret) { + ibdev_err(&dev->ib_dev, "Failed to add Mac address, ret %d", ret); + goto destroy_rnic; + } + } else { + dev->ib_dev.phys_port_cnt = mc->num_ports; + ret = mana_eth_query_adapter_caps(dev); + if (ret) { + ibdev_err(&dev->ib_dev, "Failed to query ETH device caps, ret %d", ret); + goto free_ib_device; + } } - dev->av_pool = dma_pool_create("mana_ib_av", mdev->gdma_context->dev, - MANA_AV_BUFFER_SIZE, MANA_AV_BUFFER_SIZE, 0); + dev->av_pool = dma_pool_create("mana_ib_av", gc->dev, MANA_AV_BUFFER_SIZE, + MANA_AV_BUFFER_SIZE, 0); if (!dev->av_pool) { ret = -ENOMEM; goto destroy_rnic; } - ret = ib_register_device(&dev->ib_dev, "mana_%d", - mdev->gdma_context->dev); + ibdev_dbg(&dev->ib_dev, "mdev=%p id=%d num_ports=%d\n", mdev, + mdev->dev_id.as_uint32, dev->ib_dev.phys_port_cnt); + + ret = ib_register_device(&dev->ib_dev, mana_ib_is_rnic(dev) ? 
"mana_%d" : "manae_%d", + gc->dev); if (ret) goto deallocate_pool; @@ -208,15 +200,16 @@ static int mana_ib_probe(struct auxiliary_device *adev, deallocate_pool: dma_pool_destroy(dev->av_pool); destroy_rnic: - xa_destroy(&dev->qp_table_wq); - mana_ib_gd_destroy_rnic_adapter(dev); + if (mana_ib_is_rnic(dev)) + mana_ib_gd_destroy_rnic_adapter(dev); destroy_eqs: - mana_ib_destroy_eqs(dev); + if (mana_ib_is_rnic(dev)) + mana_ib_destroy_eqs(dev); deregister_net_notifier: - unregister_netdevice_notifier(&dev->nb); -deregister_device: - mana_gd_deregister_device(dev->gdma_dev); + if (mana_ib_is_rnic(dev)) + unregister_netdevice_notifier(&dev->nb); free_ib_device: + xa_destroy(&dev->qp_table_wq); ib_dealloc_device(&dev->ib_dev); return ret; } @@ -227,25 +220,24 @@ static void mana_ib_remove(struct auxiliary_device *adev) ib_unregister_device(&dev->ib_dev); dma_pool_destroy(dev->av_pool); + if (mana_ib_is_rnic(dev)) { + mana_ib_gd_destroy_rnic_adapter(dev); + mana_ib_destroy_eqs(dev); + unregister_netdevice_notifier(&dev->nb); + } xa_destroy(&dev->qp_table_wq); - mana_ib_gd_destroy_rnic_adapter(dev); - mana_ib_destroy_eqs(dev); - unregister_netdevice_notifier(&dev->nb); - mana_gd_deregister_device(dev->gdma_dev); ib_dealloc_device(&dev->ib_dev); } static const struct auxiliary_device_id mana_id_table[] = { - { - .name = "mana.rdma", - }, + { .name = "mana.rdma", }, + { .name = "mana.eth", }, {}, }; MODULE_DEVICE_TABLE(auxiliary, mana_id_table); static struct auxiliary_driver mana_driver = { - .name = "rdma", .probe = mana_ib_probe, .remove = mana_ib_remove, .id_table = mana_id_table, diff --git a/drivers/infiniband/hw/mana/main.c b/drivers/infiniband/hw/mana/main.c index eda9c5b971de..41a24a186f9d 100644 --- a/drivers/infiniband/hw/mana/main.c +++ b/drivers/infiniband/hw/mana/main.c @@ -4,6 +4,7 @@ */ #include "mana_ib.h" +#include "linux/pci.h" void mana_ib_uncfg_vport(struct mana_ib_dev *dev, struct mana_ib_pd *pd, u32 port) @@ -243,7 +244,6 @@ void mana_ib_dealloc_ucontext(struct ib_ucontext *ibcontext) int mana_ib_create_kernel_queue(struct mana_ib_dev *mdev, u32 size, enum gdma_queue_type type, struct mana_ib_queue *queue) { - struct gdma_context *gc = mdev_to_gc(mdev); struct gdma_queue_spec spec = {}; int err; @@ -252,7 +252,7 @@ int mana_ib_create_kernel_queue(struct mana_ib_dev *mdev, u32 size, enum gdma_qu spec.type = type; spec.monitor_avl_buf = false; spec.queue_size = size; - err = mana_gd_create_mana_wq_cq(&gc->mana_ib, &spec, &queue->kmem); + err = mana_gd_create_mana_wq_cq(mdev->gdma_dev, &spec, &queue->kmem); if (err) return err; /* take ownership into mana_ib from mana */ @@ -479,7 +479,7 @@ int mana_ib_create_dma_region(struct mana_ib_dev *dev, struct ib_umem *umem, { unsigned long page_sz; - page_sz = ib_umem_find_best_pgsz(umem, PAGE_SZ_BM, virt); + page_sz = ib_umem_find_best_pgsz(umem, dev->adapter_caps.page_size_cap, virt); if (!page_sz) { ibdev_dbg(&dev->ib_dev, "Failed to find page size.\n"); return -EINVAL; @@ -494,7 +494,7 @@ int mana_ib_create_zero_offset_dma_region(struct mana_ib_dev *dev, struct ib_ume unsigned long page_sz; /* Hardware requires dma region to align to chosen page size */ - page_sz = ib_umem_find_best_pgoff(umem, PAGE_SZ_BM, 0); + page_sz = ib_umem_find_best_pgoff(umem, dev->adapter_caps.page_size_cap, 0); if (!page_sz) { ibdev_dbg(&dev->ib_dev, "Failed to find page size.\n"); return -EINVAL; @@ -551,6 +551,7 @@ int mana_ib_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vma) int mana_ib_get_port_immutable(struct ib_device *ibdev, u32 
port_num, struct ib_port_immutable *immutable) { + struct mana_ib_dev *dev = container_of(ibdev, struct mana_ib_dev, ib_dev); struct ib_port_attr attr; int err; @@ -560,10 +561,12 @@ int mana_ib_get_port_immutable(struct ib_device *ibdev, u32 port_num, immutable->pkey_tbl_len = attr.pkey_tbl_len; immutable->gid_tbl_len = attr.gid_tbl_len; - immutable->core_cap_flags = RDMA_CORE_PORT_RAW_PACKET; - if (port_num == 1) { - immutable->core_cap_flags |= RDMA_CORE_PORT_IBA_ROCE_UDP_ENCAP; + + if (mana_ib_is_rnic(dev)) { + immutable->core_cap_flags = RDMA_CORE_PORT_IBA_ROCE_UDP_ENCAP; immutable->max_mad_size = IB_MGMT_MAD_SIZE; + } else { + immutable->core_cap_flags = RDMA_CORE_PORT_RAW_PACKET; } return 0; @@ -572,12 +575,14 @@ int mana_ib_get_port_immutable(struct ib_device *ibdev, u32 port_num, int mana_ib_query_device(struct ib_device *ibdev, struct ib_device_attr *props, struct ib_udata *uhw) { - struct mana_ib_dev *dev = container_of(ibdev, - struct mana_ib_dev, ib_dev); + struct mana_ib_dev *dev = container_of(ibdev, struct mana_ib_dev, ib_dev); + struct pci_dev *pdev = to_pci_dev(mdev_to_gc(dev)->dev); memset(props, 0, sizeof(*props)); + props->vendor_id = pdev->vendor; + props->vendor_part_id = dev->gdma_dev->dev_id.type; props->max_mr_size = MANA_IB_MAX_MR_SIZE; - props->page_size_cap = PAGE_SZ_BM; + props->page_size_cap = dev->adapter_caps.page_size_cap; props->max_qp = dev->adapter_caps.max_qp_count; props->max_qp_wr = dev->adapter_caps.max_qp_wr; props->device_cap_flags = IB_DEVICE_RC_RNR_NAK_GEN; @@ -596,6 +601,8 @@ int mana_ib_query_device(struct ib_device *ibdev, struct ib_device_attr *props, props->max_ah = INT_MAX; props->max_pkeys = 1; props->local_ca_ack_delay = MANA_CA_ACK_DELAY; + if (!mana_ib_is_rnic(dev)) + props->raw_packet_caps = IB_RAW_PACKET_CAP_IP_CSUM; return 0; } @@ -603,6 +610,7 @@ int mana_ib_query_device(struct ib_device *ibdev, struct ib_device_attr *props, int mana_ib_query_port(struct ib_device *ibdev, u32 port, struct ib_port_attr *props) { + struct mana_ib_dev *dev = container_of(ibdev, struct mana_ib_dev, ib_dev); struct net_device *ndev = mana_ib_get_netdev(ibdev, port); if (!ndev) @@ -623,7 +631,7 @@ int mana_ib_query_port(struct ib_device *ibdev, u32 port, props->active_width = IB_WIDTH_4X; props->active_speed = IB_SPEED_EDR; props->pkey_tbl_len = 1; - if (port == 1) { + if (mana_ib_is_rnic(dev)) { props->gid_tbl_len = 16; props->port_cap_flags = IB_PORT_CM_SUP; props->ip_gids = true; @@ -696,6 +704,41 @@ int mana_ib_gd_query_adapter_caps(struct mana_ib_dev *dev) caps->max_recv_sge_count = resp.max_recv_sge_count; caps->feature_flags = resp.feature_flags; + caps->page_size_cap = PAGE_SZ_BM; + if (mdev_to_gc(dev)->pf_cap_flags1 & GDMA_DRV_CAP_FLAG_1_GDMA_PAGES_4MB_1GB_2GB) + caps->page_size_cap |= (SZ_4M | SZ_1G | SZ_2G); + + return 0; +} + +int mana_eth_query_adapter_caps(struct mana_ib_dev *dev) +{ + struct mana_ib_adapter_caps *caps = &dev->adapter_caps; + struct gdma_query_max_resources_resp resp = {}; + struct gdma_general_req req = {}; + int err; + + mana_gd_init_req_hdr(&req.hdr, GDMA_QUERY_MAX_RESOURCES, + sizeof(req), sizeof(resp)); + + err = mana_gd_send_request(mdev_to_gc(dev), sizeof(req), &req, sizeof(resp), &resp); + if (err) { + ibdev_err(&dev->ib_dev, + "Failed to query adapter caps err %d", err); + return err; + } + + caps->max_qp_count = min_t(u32, resp.max_sq, resp.max_rq); + caps->max_cq_count = resp.max_cq; + caps->max_mr_count = resp.max_mst; + caps->max_pd_count = 0x6000; + caps->max_qp_wr = min_t(u32, + 0x100000 / GDMA_MAX_SQE_SIZE, 
+ 0x100000 / GDMA_MAX_RQE_SIZE); + caps->max_send_sge_count = 30; + caps->max_recv_sge_count = 15; + caps->page_size_cap = PAGE_SZ_BM; + return 0; } @@ -740,7 +783,7 @@ int mana_ib_create_eqs(struct mana_ib_dev *mdev) spec.eq.log2_throttle_limit = LOG2_EQ_THROTTLE; spec.eq.msix_index = 0; - err = mana_gd_create_mana_eq(&gc->mana_ib, &spec, &mdev->fatal_err_eq); + err = mana_gd_create_mana_eq(mdev->gdma_dev, &spec, &mdev->fatal_err_eq); if (err) return err; @@ -791,7 +834,7 @@ int mana_ib_gd_create_rnic_adapter(struct mana_ib_dev *mdev) mana_gd_init_req_hdr(&req.hdr, MANA_IB_CREATE_ADAPTER, sizeof(req), sizeof(resp)); req.hdr.req.msg_version = GDMA_MESSAGE_V2; - req.hdr.dev_id = gc->mana_ib.dev_id; + req.hdr.dev_id = mdev->gdma_dev->dev_id; req.notify_eq_id = mdev->fatal_err_eq->id; if (mdev->adapter_caps.feature_flags & MANA_IB_FEATURE_CLIENT_ERROR_CQE_SUPPORT) @@ -816,7 +859,7 @@ int mana_ib_gd_destroy_rnic_adapter(struct mana_ib_dev *mdev) gc = mdev_to_gc(mdev); mana_gd_init_req_hdr(&req.hdr, MANA_IB_DESTROY_ADAPTER, sizeof(req), sizeof(resp)); - req.hdr.dev_id = gc->mana_ib.dev_id; + req.hdr.dev_id = mdev->gdma_dev->dev_id; req.adapter = mdev->adapter_handle; err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp); @@ -843,7 +886,7 @@ int mana_ib_gd_add_gid(const struct ib_gid_attr *attr, void **context) } mana_gd_init_req_hdr(&req.hdr, MANA_IB_CONFIG_IP_ADDR, sizeof(req), sizeof(resp)); - req.hdr.dev_id = gc->mana_ib.dev_id; + req.hdr.dev_id = mdev->gdma_dev->dev_id; req.adapter = mdev->adapter_handle; req.op = ADDR_OP_ADD; req.sgid_type = (ntype == RDMA_NETWORK_IPV6) ? SGID_TYPE_IPV6 : SGID_TYPE_IPV4; @@ -873,7 +916,7 @@ int mana_ib_gd_del_gid(const struct ib_gid_attr *attr, void **context) } mana_gd_init_req_hdr(&req.hdr, MANA_IB_CONFIG_IP_ADDR, sizeof(req), sizeof(resp)); - req.hdr.dev_id = gc->mana_ib.dev_id; + req.hdr.dev_id = mdev->gdma_dev->dev_id; req.adapter = mdev->adapter_handle; req.op = ADDR_OP_REMOVE; req.sgid_type = (ntype == RDMA_NETWORK_IPV6) ? 
SGID_TYPE_IPV6 : SGID_TYPE_IPV4; @@ -896,7 +939,7 @@ int mana_ib_gd_config_mac(struct mana_ib_dev *mdev, enum mana_ib_addr_op op, u8 int err; mana_gd_init_req_hdr(&req.hdr, MANA_IB_CONFIG_MAC_ADDR, sizeof(req), sizeof(resp)); - req.hdr.dev_id = gc->mana_ib.dev_id; + req.hdr.dev_id = mdev->gdma_dev->dev_id; req.adapter = mdev->adapter_handle; req.op = op; copy_in_reverse(req.mac_addr, mac, ETH_ALEN); @@ -917,8 +960,11 @@ int mana_ib_gd_create_cq(struct mana_ib_dev *mdev, struct mana_ib_cq *cq, u32 do struct mana_rnic_create_cq_req req = {}; int err; + if (!mdev->eqs) + return -EINVAL; + mana_gd_init_req_hdr(&req.hdr, MANA_IB_CREATE_CQ, sizeof(req), sizeof(resp)); - req.hdr.dev_id = gc->mana_ib.dev_id; + req.hdr.dev_id = mdev->gdma_dev->dev_id; req.adapter = mdev->adapter_handle; req.gdma_region = cq->queue.gdma_region; req.eq_id = mdev->eqs[cq->comp_vector]->id; @@ -950,7 +996,7 @@ int mana_ib_gd_destroy_cq(struct mana_ib_dev *mdev, struct mana_ib_cq *cq) return 0; mana_gd_init_req_hdr(&req.hdr, MANA_IB_DESTROY_CQ, sizeof(req), sizeof(resp)); - req.hdr.dev_id = gc->mana_ib.dev_id; + req.hdr.dev_id = mdev->gdma_dev->dev_id; req.adapter = mdev->adapter_handle; req.cq_handle = cq->cq_handle; @@ -976,7 +1022,7 @@ int mana_ib_gd_create_rc_qp(struct mana_ib_dev *mdev, struct mana_ib_qp *qp, int err, i; mana_gd_init_req_hdr(&req.hdr, MANA_IB_CREATE_RC_QP, sizeof(req), sizeof(resp)); - req.hdr.dev_id = gc->mana_ib.dev_id; + req.hdr.dev_id = mdev->gdma_dev->dev_id; req.adapter = mdev->adapter_handle; req.pd_handle = pd->pd_handle; req.send_cq_handle = send_cq->cq_handle; @@ -1012,7 +1058,7 @@ int mana_ib_gd_destroy_rc_qp(struct mana_ib_dev *mdev, struct mana_ib_qp *qp) int err; mana_gd_init_req_hdr(&req.hdr, MANA_IB_DESTROY_RC_QP, sizeof(req), sizeof(resp)); - req.hdr.dev_id = gc->mana_ib.dev_id; + req.hdr.dev_id = mdev->gdma_dev->dev_id; req.adapter = mdev->adapter_handle; req.rc_qp_handle = qp->qp_handle; err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp); @@ -1035,7 +1081,7 @@ int mana_ib_gd_create_ud_qp(struct mana_ib_dev *mdev, struct mana_ib_qp *qp, int err, i; mana_gd_init_req_hdr(&req.hdr, MANA_IB_CREATE_UD_QP, sizeof(req), sizeof(resp)); - req.hdr.dev_id = gc->mana_ib.dev_id; + req.hdr.dev_id = mdev->gdma_dev->dev_id; req.adapter = mdev->adapter_handle; req.pd_handle = pd->pd_handle; req.send_cq_handle = send_cq->cq_handle; @@ -1070,7 +1116,7 @@ int mana_ib_gd_destroy_ud_qp(struct mana_ib_dev *mdev, struct mana_ib_qp *qp) int err; mana_gd_init_req_hdr(&req.hdr, MANA_IB_DESTROY_UD_QP, sizeof(req), sizeof(resp)); - req.hdr.dev_id = gc->mana_ib.dev_id; + req.hdr.dev_id = mdev->gdma_dev->dev_id; req.adapter = mdev->adapter_handle; req.qp_handle = qp->qp_handle; err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp); diff --git a/drivers/infiniband/hw/mana/mana_ib.h b/drivers/infiniband/hw/mana/mana_ib.h index 6903946677e5..42bebd6cd4f7 100644 --- a/drivers/infiniband/hw/mana/mana_ib.h +++ b/drivers/infiniband/hw/mana/mana_ib.h @@ -60,6 +60,7 @@ struct mana_ib_adapter_caps { u32 max_recv_sge_count; u32 max_inline_data_size; u64 feature_flags; + u64 page_size_cap; }; struct mana_ib_queue { @@ -543,6 +544,11 @@ static inline void mana_put_qp_ref(struct mana_ib_qp *qp) complete(&qp->free); } +static inline bool mana_ib_is_rnic(struct mana_ib_dev *mdev) +{ + return mdev->gdma_dev->dev_id.type == GDMA_DEVICE_MANA_IB; +} + static inline struct net_device *mana_ib_get_netdev(struct ib_device *ibdev, u32 port) { struct mana_ib_dev *mdev = container_of(ibdev, struct 
mana_ib_dev, ib_dev); @@ -642,6 +648,7 @@ int mana_ib_query_gid(struct ib_device *ibdev, u32 port, int index, void mana_ib_disassociate_ucontext(struct ib_ucontext *ibcontext); int mana_ib_gd_query_adapter_caps(struct mana_ib_dev *mdev); +int mana_eth_query_adapter_caps(struct mana_ib_dev *mdev); int mana_ib_create_eqs(struct mana_ib_dev *mdev); diff --git a/drivers/infiniband/hw/mana/mr.c b/drivers/infiniband/hw/mana/mr.c index f99557ec7767..6d974d0a8400 100644 --- a/drivers/infiniband/hw/mana/mr.c +++ b/drivers/infiniband/hw/mana/mr.c @@ -5,8 +5,8 @@ #include "mana_ib.h" -#define VALID_MR_FLAGS \ - (IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ) +#define VALID_MR_FLAGS (IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ |\ + IB_ACCESS_REMOTE_ATOMIC | IB_ZERO_BASED) #define VALID_DMA_MR_FLAGS (IB_ACCESS_LOCAL_WRITE) @@ -24,6 +24,9 @@ mana_ib_verbs_to_gdma_access_flags(int access_flags) if (access_flags & IB_ACCESS_REMOTE_READ) flags |= GDMA_ACCESS_FLAG_REMOTE_READ; + if (access_flags & IB_ACCESS_REMOTE_ATOMIC) + flags |= GDMA_ACCESS_FLAG_REMOTE_ATOMIC; + return flags; } @@ -48,7 +51,10 @@ static int mana_ib_gd_create_mr(struct mana_ib_dev *dev, struct mana_ib_mr *mr, req.gva.virtual_address = mr_params->gva.virtual_address; req.gva.access_flags = mr_params->gva.access_flags; break; - + case GDMA_MR_TYPE_ZBVA: + req.zbva.dma_region_handle = mr_params->zbva.dma_region_handle; + req.zbva.access_flags = mr_params->zbva.access_flags; + break; default: ibdev_dbg(&dev->ib_dev, "invalid param (GDMA_MR_TYPE) passed, type %d\n", @@ -144,11 +150,18 @@ struct ib_mr *mana_ib_reg_user_mr(struct ib_pd *ibpd, u64 start, u64 length, dma_region_handle); mr_params.pd_handle = pd->pd_handle; - mr_params.mr_type = GDMA_MR_TYPE_GVA; - mr_params.gva.dma_region_handle = dma_region_handle; - mr_params.gva.virtual_address = iova; - mr_params.gva.access_flags = - mana_ib_verbs_to_gdma_access_flags(access_flags); + if (access_flags & IB_ZERO_BASED) { + mr_params.mr_type = GDMA_MR_TYPE_ZBVA; + mr_params.zbva.dma_region_handle = dma_region_handle; + mr_params.zbva.access_flags = + mana_ib_verbs_to_gdma_access_flags(access_flags); + } else { + mr_params.mr_type = GDMA_MR_TYPE_GVA; + mr_params.gva.dma_region_handle = dma_region_handle; + mr_params.gva.virtual_address = iova; + mr_params.gva.access_flags = + mana_ib_verbs_to_gdma_access_flags(access_flags); + } err = mana_ib_gd_create_mr(dev, mr, &mr_params); if (err) diff --git a/drivers/infiniband/hw/mana/qp.c b/drivers/infiniband/hw/mana/qp.c index c928af58f38b..14fd7d6c54a2 100644 --- a/drivers/infiniband/hw/mana/qp.c +++ b/drivers/infiniband/hw/mana/qp.c @@ -635,7 +635,6 @@ static int mana_ib_create_ud_qp(struct ib_qp *ibqp, struct ib_pd *ibpd, { struct mana_ib_dev *mdev = container_of(ibpd->device, struct mana_ib_dev, ib_dev); struct mana_ib_qp *qp = container_of(ibqp, struct mana_ib_qp, ibqp); - struct gdma_context *gc = mdev_to_gc(mdev); u32 doorbell, queue_size; int i, err; @@ -654,7 +653,7 @@ static int mana_ib_create_ud_qp(struct ib_qp *ibqp, struct ib_pd *ibpd, goto destroy_queues; } } - doorbell = gc->mana_ib.doorbell; + doorbell = mdev->gdma_dev->doorbell; err = create_shadow_queue(&qp->shadow_rq, attr->cap.max_recv_wr, sizeof(struct ud_rq_shadow_wqe)); @@ -736,7 +735,7 @@ static int mana_ib_gd_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int err; mana_gd_init_req_hdr(&req.hdr, MANA_IB_SET_QP_STATE, sizeof(req), sizeof(resp)); - req.hdr.dev_id = gc->mana_ib.dev_id; + req.hdr.dev_id = 
mdev->gdma_dev->dev_id; req.adapter = mdev->adapter_handle; req.qp_handle = qp->qp_handle; req.qp_state = attr->qp_state; diff --git a/drivers/infiniband/hw/mlx4/mcg.c b/drivers/infiniband/hw/mlx4/mcg.c index 33f525b744f2..e279e69b9a51 100644 --- a/drivers/infiniband/hw/mlx4/mcg.c +++ b/drivers/infiniband/hw/mlx4/mcg.c @@ -43,7 +43,7 @@ #define MAX_VFS 80 #define MAX_PEND_REQS_PER_FUNC 4 -#define MAD_TIMEOUT_MS 2000 +#define MAD_TIMEOUT_SEC 2 #define mcg_warn(fmt, arg...) pr_warn("MCG WARNING: " fmt, ##arg) #define mcg_error(fmt, arg...) pr_err(fmt, ##arg) @@ -270,7 +270,7 @@ static int send_join_to_wire(struct mcast_group *group, struct ib_sa_mad *sa_mad if (!ret) { /* calls mlx4_ib_mcg_timeout_handler */ queue_delayed_work(group->demux->mcg_wq, &group->timeout_work, - msecs_to_jiffies(MAD_TIMEOUT_MS)); + secs_to_jiffies(MAD_TIMEOUT_SEC)); } return ret; @@ -309,7 +309,7 @@ static int send_leave_to_wire(struct mcast_group *group, u8 join_state) if (!ret) { /* calls mlx4_ib_mcg_timeout_handler */ queue_delayed_work(group->demux->mcg_wq, &group->timeout_work, - msecs_to_jiffies(MAD_TIMEOUT_MS)); + secs_to_jiffies(MAD_TIMEOUT_SEC)); } return ret; @@ -1091,7 +1091,7 @@ static void _mlx4_ib_mcg_port_cleanup(struct mlx4_ib_demux_ctx *ctx, int destroy for (i = 0; i < MAX_VFS; ++i) clean_vf_mcast(ctx, i); - end = jiffies + msecs_to_jiffies(MAD_TIMEOUT_MS + 3000); + end = jiffies + secs_to_jiffies(MAD_TIMEOUT_SEC + 3); do { count = 0; mutex_lock(&ctx->mcg_table_lock); diff --git a/drivers/infiniband/hw/mlx5/fs.c b/drivers/infiniband/hw/mlx5/fs.c index 0ff9f18a71e8..680627f1de33 100644 --- a/drivers/infiniband/hw/mlx5/fs.c +++ b/drivers/infiniband/hw/mlx5/fs.c @@ -1645,11 +1645,6 @@ static struct mlx5_ib_flow_handler *create_flow_rule(struct mlx5_ib_dev *dev, return _create_flow_rule(dev, ft_prio, flow_attr, dst, 0, NULL); } -enum { - LEFTOVERS_MC, - LEFTOVERS_UC, -}; - static struct mlx5_ib_flow_handler *create_leftovers_rule(struct mlx5_ib_dev *dev, struct mlx5_ib_flow_prio *ft_prio, struct ib_flow_attr *flow_attr, @@ -1659,43 +1654,32 @@ static struct mlx5_ib_flow_handler *create_leftovers_rule(struct mlx5_ib_dev *de struct mlx5_ib_flow_handler *handler = NULL; static struct { - struct ib_flow_attr flow_attr; struct ib_flow_spec_eth eth_flow; - } leftovers_specs[] = { - [LEFTOVERS_MC] = { - .flow_attr = { - .num_of_specs = 1, - .size = sizeof(leftovers_specs[0]) - }, - .eth_flow = { - .type = IB_FLOW_SPEC_ETH, - .size = sizeof(struct ib_flow_spec_eth), - .mask = {.dst_mac = {0x1} }, - .val = {.dst_mac = {0x1} } - } - }, - [LEFTOVERS_UC] = { - .flow_attr = { - .num_of_specs = 1, - .size = sizeof(leftovers_specs[0]) - }, - .eth_flow = { - .type = IB_FLOW_SPEC_ETH, - .size = sizeof(struct ib_flow_spec_eth), - .mask = {.dst_mac = {0x1} }, - .val = {.dst_mac = {} } - } - } - }; + struct ib_flow_attr flow_attr; + } leftovers_wc = { .flow_attr = { .num_of_specs = 1, + .size = sizeof(leftovers_wc) }, + .eth_flow = { + .type = IB_FLOW_SPEC_ETH, + .size = sizeof(struct ib_flow_spec_eth), + .mask = { .dst_mac = { 0x1 } }, + .val = { .dst_mac = { 0x1 } } } }; - handler = create_flow_rule(dev, ft_prio, - &leftovers_specs[LEFTOVERS_MC].flow_attr, - dst); + static struct { + struct ib_flow_spec_eth eth_flow; + struct ib_flow_attr flow_attr; + } leftovers_uc = { .flow_attr = { .num_of_specs = 1, + .size = sizeof(leftovers_uc) }, + .eth_flow = { + .type = IB_FLOW_SPEC_ETH, + .size = sizeof(struct ib_flow_spec_eth), + .mask = { .dst_mac = { 0x1 } }, + .val = { .dst_mac = {} } } }; + + handler = 
create_flow_rule(dev, ft_prio, &leftovers_wc.flow_attr, dst); if (!IS_ERR(handler) && flow_attr->type == IB_FLOW_ATTR_ALL_DEFAULT) { handler_ucast = create_flow_rule(dev, ft_prio, - &leftovers_specs[LEFTOVERS_UC].flow_attr, - dst); + &leftovers_uc.flow_attr, dst); if (IS_ERR(handler_ucast)) { mlx5_del_flow_rules(handler->rule); ft_prio->refcount--; diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index d07cacaa0abd..ce7610740412 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -485,6 +485,10 @@ static int translate_eth_ext_proto_oper(u32 eth_proto_oper, u16 *active_speed, *active_width = IB_WIDTH_2X; *active_speed = IB_SPEED_NDR; break; + case MLX5E_PROT_MASK(MLX5E_200GAUI_1_200GBASE_CR1_KR1): + *active_width = IB_WIDTH_1X; + *active_speed = IB_SPEED_XDR; + break; case MLX5E_PROT_MASK(MLX5E_400GAUI_8_400GBASE_CR8): *active_width = IB_WIDTH_8X; *active_speed = IB_SPEED_HDR; @@ -493,10 +497,18 @@ static int translate_eth_ext_proto_oper(u32 eth_proto_oper, u16 *active_speed, *active_width = IB_WIDTH_4X; *active_speed = IB_SPEED_NDR; break; + case MLX5E_PROT_MASK(MLX5E_400GAUI_2_400GBASE_CR2_KR2): + *active_width = IB_WIDTH_2X; + *active_speed = IB_SPEED_XDR; + break; case MLX5E_PROT_MASK(MLX5E_800GAUI_8_800GBASE_CR8_KR8): *active_width = IB_WIDTH_8X; *active_speed = IB_SPEED_NDR; break; + case MLX5E_PROT_MASK(MLX5E_800GAUI_4_800GBASE_CR4_KR4): + *active_width = IB_WIDTH_4X; + *active_speed = IB_SPEED_XDR; + break; default: return -EINVAL; } @@ -4422,17 +4434,6 @@ static void mlx5_ib_stage_cong_debugfs_cleanup(struct mlx5_ib_dev *dev) mlx5_core_native_port_num(dev->mdev) - 1); } -static int mlx5_ib_stage_uar_init(struct mlx5_ib_dev *dev) -{ - dev->mdev->priv.uar = mlx5_get_uars_page(dev->mdev); - return PTR_ERR_OR_ZERO(dev->mdev->priv.uar); -} - -static void mlx5_ib_stage_uar_cleanup(struct mlx5_ib_dev *dev) -{ - mlx5_put_uars_page(dev->mdev, dev->mdev->priv.uar); -} - static int mlx5_ib_stage_bfrag_init(struct mlx5_ib_dev *dev) { int err; @@ -4662,9 +4663,6 @@ static const struct mlx5_ib_profile pf_profile = { STAGE_CREATE(MLX5_IB_STAGE_CONG_DEBUGFS, mlx5_ib_stage_cong_debugfs_init, mlx5_ib_stage_cong_debugfs_cleanup), - STAGE_CREATE(MLX5_IB_STAGE_UAR, - mlx5_ib_stage_uar_init, - mlx5_ib_stage_uar_cleanup), STAGE_CREATE(MLX5_IB_STAGE_BFREG, mlx5_ib_stage_bfrag_init, mlx5_ib_stage_bfrag_cleanup), @@ -4722,9 +4720,6 @@ const struct mlx5_ib_profile raw_eth_profile = { STAGE_CREATE(MLX5_IB_STAGE_CONG_DEBUGFS, mlx5_ib_stage_cong_debugfs_init, mlx5_ib_stage_cong_debugfs_cleanup), - STAGE_CREATE(MLX5_IB_STAGE_UAR, - mlx5_ib_stage_uar_init, - mlx5_ib_stage_uar_cleanup), STAGE_CREATE(MLX5_IB_STAGE_BFREG, mlx5_ib_stage_bfrag_init, mlx5_ib_stage_bfrag_cleanup), diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index ace2df3e1d9f..fde859d207ae 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -351,6 +351,7 @@ struct mlx5_ib_flow_db { #define MLX5_IB_UPD_XLT_PD BIT(4) #define MLX5_IB_UPD_XLT_ACCESS BIT(5) #define MLX5_IB_UPD_XLT_INDIRECT BIT(6) +#define MLX5_IB_UPD_XLT_DOWNGRADE BIT(7) /* Private QP creation flags to be passed in ib_qp_init_attr.create_flags. 
* @@ -1005,7 +1006,6 @@ enum mlx5_ib_stages { MLX5_IB_STAGE_ODP, MLX5_IB_STAGE_COUNTERS, MLX5_IB_STAGE_CONG_DEBUGFS, - MLX5_IB_STAGE_UAR, MLX5_IB_STAGE_BFREG, MLX5_IB_STAGE_PRE_IB_REG_UMR, MLX5_IB_STAGE_WHITELIST_UID, @@ -1473,8 +1473,8 @@ void mlx5_ib_odp_cleanup_one(struct mlx5_ib_dev *ibdev); int __init mlx5_ib_odp_init(void); void mlx5_ib_odp_cleanup(void); int mlx5_odp_init_mkey_cache(struct mlx5_ib_dev *dev); -void mlx5_odp_populate_xlt(void *xlt, size_t idx, size_t nentries, - struct mlx5_ib_mr *mr, int flags); +int mlx5_odp_populate_xlt(void *xlt, size_t idx, size_t nentries, + struct mlx5_ib_mr *mr, int flags); int mlx5_ib_advise_mr_prefetch(struct ib_pd *pd, enum ib_uverbs_advise_mr_advice advice, @@ -1495,8 +1495,11 @@ static inline int mlx5_odp_init_mkey_cache(struct mlx5_ib_dev *dev) { return 0; } -static inline void mlx5_odp_populate_xlt(void *xlt, size_t idx, size_t nentries, - struct mlx5_ib_mr *mr, int flags) {} +static inline int mlx5_odp_populate_xlt(void *xlt, size_t idx, size_t nentries, + struct mlx5_ib_mr *mr, int flags) +{ + return -EOPNOTSUPP; +} static inline int mlx5_ib_advise_mr_prefetch(struct ib_pd *pd, diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index 5fbebafc8774..6dd813bac5b2 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -525,7 +525,7 @@ static void queue_adjust_cache_locked(struct mlx5_cache_ent *ent) ent->fill_to_high_water = false; if (ent->pending) queue_delayed_work(ent->dev->cache.wq, &ent->dwork, - msecs_to_jiffies(1000)); + secs_to_jiffies(1)); else mod_delayed_work(ent->dev->cache.wq, &ent->dwork, 0); } @@ -576,7 +576,7 @@ static void __cache_work_func(struct mlx5_cache_ent *ent) "add keys command failed, err %d\n", err); queue_delayed_work(cache->wq, &ent->dwork, - msecs_to_jiffies(1000)); + secs_to_jiffies(1)); } } } else if (ent->mkeys_queue.ci > 2 * ent->limit) { @@ -2051,7 +2051,7 @@ static int mlx5_revoke_mr(struct mlx5_ib_mr *mr) ent->in_use--; if (ent->is_tmp && !ent->tmp_cleanup_scheduled) { mod_delayed_work(ent->dev->cache.wq, &ent->dwork, - msecs_to_jiffies(30 * 1000)); + secs_to_jiffies(30)); ent->tmp_cleanup_scheduled = true; } spin_unlock_irq(&ent->mkeys_queue.lock); diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c index 86d8fa63bf69..eaa2f9f5f3a9 100644 --- a/drivers/infiniband/hw/mlx5/odp.c +++ b/drivers/infiniband/hw/mlx5/odp.c @@ -34,6 +34,9 @@ #include <linux/kernel.h> #include <linux/dma-buf.h> #include <linux/dma-resv.h> +#include <linux/hmm.h> +#include <linux/hmm-dma.h> +#include <linux/pci-p2pdma.h> #include "mlx5_ib.h" #include "cmd.h" @@ -158,41 +161,50 @@ static void populate_klm(struct mlx5_klm *pklm, size_t idx, size_t nentries, } } -static u64 umem_dma_to_mtt(dma_addr_t umem_dma) -{ - u64 mtt_entry = umem_dma & ODP_DMA_ADDR_MASK; - - if (umem_dma & ODP_READ_ALLOWED_BIT) - mtt_entry |= MLX5_IB_MTT_READ; - if (umem_dma & ODP_WRITE_ALLOWED_BIT) - mtt_entry |= MLX5_IB_MTT_WRITE; - - return mtt_entry; -} - -static void populate_mtt(__be64 *pas, size_t idx, size_t nentries, - struct mlx5_ib_mr *mr, int flags) +static int populate_mtt(__be64 *pas, size_t start, size_t nentries, + struct mlx5_ib_mr *mr, int flags) { struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem); - dma_addr_t pa; + bool downgrade = flags & MLX5_IB_UPD_XLT_DOWNGRADE; + struct pci_p2pdma_map_state p2pdma_state = {}; + struct ib_device *dev = odp->umem.ibdev; size_t i; if (flags & MLX5_IB_UPD_XLT_ZAP) - return; + return 0; for (i = 0; i < nentries; i++) 
{ - pa = odp->dma_list[idx + i]; - pas[i] = cpu_to_be64(umem_dma_to_mtt(pa)); + unsigned long pfn = odp->map.pfn_list[start + i]; + dma_addr_t dma_addr; + + pfn = odp->map.pfn_list[start + i]; + if (!(pfn & HMM_PFN_VALID)) + /* ODP initialization */ + continue; + + dma_addr = hmm_dma_map_pfn(dev->dma_device, &odp->map, + start + i, &p2pdma_state); + if (ib_dma_mapping_error(dev, dma_addr)) + return -EFAULT; + + dma_addr |= MLX5_IB_MTT_READ; + if ((pfn & HMM_PFN_WRITE) && !downgrade) + dma_addr |= MLX5_IB_MTT_WRITE; + + pas[i] = cpu_to_be64(dma_addr); + odp->npages++; } + return 0; } -void mlx5_odp_populate_xlt(void *xlt, size_t idx, size_t nentries, - struct mlx5_ib_mr *mr, int flags) +int mlx5_odp_populate_xlt(void *xlt, size_t idx, size_t nentries, + struct mlx5_ib_mr *mr, int flags) { if (flags & MLX5_IB_UPD_XLT_INDIRECT) { populate_klm(xlt, idx, nentries, mr, flags); + return 0; } else { - populate_mtt(xlt, idx, nentries, mr, flags); + return populate_mtt(xlt, idx, nentries, mr, flags); } } @@ -303,8 +315,7 @@ static bool mlx5_ib_invalidate_range(struct mmu_interval_notifier *mni, * estimate the cost of another UMR vs. the cost of bigger * UMR. */ - if (umem_odp->dma_list[idx] & - (ODP_READ_ALLOWED_BIT | ODP_WRITE_ALLOWED_BIT)) { + if (umem_odp->map.pfn_list[idx] & HMM_PFN_VALID) { if (!in_block) { blk_start_idx = idx; in_block = 1; @@ -687,7 +698,7 @@ static int pagefault_real_mr(struct mlx5_ib_mr *mr, struct ib_umem_odp *odp, { int page_shift, ret, np; bool downgrade = flags & MLX5_PF_FLAGS_DOWNGRADE; - u64 access_mask; + u64 access_mask = 0; u64 start_idx; bool fault = !(flags & MLX5_PF_FLAGS_SNAPSHOT); u32 xlt_flags = MLX5_IB_UPD_XLT_ATOMIC; @@ -695,12 +706,14 @@ static int pagefault_real_mr(struct mlx5_ib_mr *mr, struct ib_umem_odp *odp, if (flags & MLX5_PF_FLAGS_ENABLE) xlt_flags |= MLX5_IB_UPD_XLT_ENABLE; + if (flags & MLX5_PF_FLAGS_DOWNGRADE) + xlt_flags |= MLX5_IB_UPD_XLT_DOWNGRADE; + page_shift = odp->page_shift; start_idx = (user_va - ib_umem_start(odp)) >> page_shift; - access_mask = ODP_READ_ALLOWED_BIT; if (odp->umem.writable && !downgrade) - access_mask |= ODP_WRITE_ALLOWED_BIT; + access_mask |= HMM_PFN_WRITE; np = ib_umem_odp_map_dma_and_lock(odp, user_va, bcnt, access_mask, fault); if (np < 0) diff --git a/drivers/infiniband/hw/mlx5/qpc.c b/drivers/infiniband/hw/mlx5/qpc.c index d3dcc272200a..146d03ae40bd 100644 --- a/drivers/infiniband/hw/mlx5/qpc.c +++ b/drivers/infiniband/hw/mlx5/qpc.c @@ -21,8 +21,10 @@ mlx5_get_rsc(struct mlx5_qp_table *table, u32 rsn) spin_lock_irqsave(&table->lock, flags); common = radix_tree_lookup(&table->tree, rsn); - if (common) + if (common && !common->invalid) refcount_inc(&common->refcount); + else + common = NULL; spin_unlock_irqrestore(&table->lock, flags); @@ -178,6 +180,18 @@ static int create_resource_common(struct mlx5_ib_dev *dev, return 0; } +static void modify_resource_common_state(struct mlx5_ib_dev *dev, + struct mlx5_core_qp *qp, + bool invalid) +{ + struct mlx5_qp_table *table = &dev->qp_table; + unsigned long flags; + + spin_lock_irqsave(&table->lock, flags); + qp->common.invalid = invalid; + spin_unlock_irqrestore(&table->lock, flags); +} + static void destroy_resource_common(struct mlx5_ib_dev *dev, struct mlx5_core_qp *qp) { @@ -609,8 +623,20 @@ err_destroy_rq: int mlx5_core_destroy_rq_tracked(struct mlx5_ib_dev *dev, struct mlx5_core_qp *rq) { + int ret; + + /* The rq destruction can be called again in case it fails, hence we + * mark the common resource as invalid and only once FW destruction + * is completed 
successfully we actually destroy the resources. + */ + modify_resource_common_state(dev, rq, true); + ret = destroy_rq_tracked(dev, rq->qpn, rq->uid); + if (ret) { + modify_resource_common_state(dev, rq, false); + return ret; + } destroy_resource_common(dev, rq); - return destroy_rq_tracked(dev, rq->qpn, rq->uid); + return 0; } static void destroy_sq_tracked(struct mlx5_ib_dev *dev, u32 sqn, u16 uid) diff --git a/drivers/infiniband/hw/mlx5/umr.c b/drivers/infiniband/hw/mlx5/umr.c index 793f3c5c4d01..5be4426a2884 100644 --- a/drivers/infiniband/hw/mlx5/umr.c +++ b/drivers/infiniband/hw/mlx5/umr.c @@ -840,7 +840,17 @@ int mlx5r_umr_update_xlt(struct mlx5_ib_mr *mr, u64 idx, int npages, size_to_map = npages * desc_size; dma_sync_single_for_cpu(ddev, sg.addr, sg.length, DMA_TO_DEVICE); - mlx5_odp_populate_xlt(xlt, idx, npages, mr, flags); + /* + * npages is the maximum number of pages to map, but we + * can't guarantee that all pages are actually mapped. + * + * For example, if page is p2p of type which is not supported + * for mapping, the number of pages mapped will be less than + * requested. + */ + err = mlx5_odp_populate_xlt(xlt, idx, npages, mr, flags); + if (err) + return err; dma_sync_single_for_device(ddev, sg.addr, sg.length, DMA_TO_DEVICE); sg.length = ALIGN(size_to_map, MLX5_UMR_FLEX_ALIGNMENT); diff --git a/drivers/infiniband/hw/mthca/mthca_mr.c b/drivers/infiniband/hw/mthca/mthca_mr.c index 192f83fd7c8a..dacb8ceeebe0 100644 --- a/drivers/infiniband/hw/mthca/mthca_mr.c +++ b/drivers/infiniband/hw/mthca/mthca_mr.c @@ -144,7 +144,7 @@ static int mthca_buddy_init(struct mthca_buddy *buddy, int max_order) buddy->max_order = max_order; spin_lock_init(&buddy->lock); - buddy->bits = kcalloc(buddy->max_order + 1, sizeof(long *), + buddy->bits = kcalloc(buddy->max_order + 1, sizeof(*buddy->bits), GFP_KERNEL); buddy->num_free = kcalloc((buddy->max_order + 1), sizeof *buddy->num_free, GFP_KERNEL); diff --git a/drivers/infiniband/hw/usnic/usnic_uiom.c b/drivers/infiniband/hw/usnic/usnic_uiom.c index f948b76f984d..3fbf99757b11 100644 --- a/drivers/infiniband/hw/usnic/usnic_uiom.c +++ b/drivers/infiniband/hw/usnic/usnic_uiom.c @@ -56,7 +56,7 @@ static int usnic_uiom_dma_fault(struct iommu_domain *domain, unsigned long iova, int flags, void *token) { - usnic_err("Device %s iommu fault domain 0x%pK va 0x%lx flags 0x%x\n", + usnic_err("Device %s iommu fault domain 0x%p va 0x%lx flags 0x%x\n", dev_name(dev), domain, iova, flags); return -ENOSYS; diff --git a/drivers/infiniband/sw/rxe/Kconfig b/drivers/infiniband/sw/rxe/Kconfig index c180e7ebcfc5..1ed5b63f8afc 100644 --- a/drivers/infiniband/sw/rxe/Kconfig +++ b/drivers/infiniband/sw/rxe/Kconfig @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only config RDMA_RXE tristate "Software RDMA over Ethernet (RoCE) driver" - depends on INET && PCI && INFINIBAND + depends on INET && PCI && INFINIBAND && 64BIT depends on INFINIBAND_VIRT_DMA select NET_UDP_TUNNEL select CRC32 diff --git a/drivers/infiniband/sw/rxe/rxe.c b/drivers/infiniband/sw/rxe/rxe.c index b248c68bf9b1..3a77d6db1720 100644 --- a/drivers/infiniband/sw/rxe/rxe.c +++ b/drivers/infiniband/sw/rxe/rxe.c @@ -101,6 +101,8 @@ static void rxe_init_device_param(struct rxe_dev *rxe, struct net_device *ndev) rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_READ; rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_ATOMIC; rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_SRQ_RECV; + rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= 
IB_ODP_SUPPORT_FLUSH; + rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_ATOMIC_WRITE; } } diff --git a/drivers/infiniband/sw/rxe/rxe_loc.h b/drivers/infiniband/sw/rxe/rxe_loc.h index 0bc3fbb6554f..876702058c84 100644 --- a/drivers/infiniband/sw/rxe/rxe_loc.h +++ b/drivers/infiniband/sw/rxe/rxe_loc.h @@ -70,9 +70,9 @@ int copy_data(struct rxe_pd *pd, int access, struct rxe_dma_info *dma, void *addr, int length, enum rxe_mr_copy_dir dir); int rxe_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents, unsigned int *sg_offset); -int rxe_mr_do_atomic_op(struct rxe_mr *mr, u64 iova, int opcode, - u64 compare, u64 swap_add, u64 *orig_val); -int rxe_mr_do_atomic_write(struct rxe_mr *mr, u64 iova, u64 value); +enum resp_states rxe_mr_do_atomic_op(struct rxe_mr *mr, u64 iova, int opcode, + u64 compare, u64 swap_add, u64 *orig_val); +enum resp_states rxe_mr_do_atomic_write(struct rxe_mr *mr, u64 iova, u64 value); struct rxe_mr *lookup_mr(struct rxe_pd *pd, int access, u32 key, enum rxe_mr_lookup_type type); int mr_check_range(struct rxe_mr *mr, u64 iova, size_t length); @@ -193,13 +193,16 @@ static inline unsigned int wr_opcode_mask(int opcode, struct rxe_qp *qp) /* rxe_odp.c */ extern const struct mmu_interval_notifier_ops rxe_mn_ops; -#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING +#if defined CONFIG_INFINIBAND_ON_DEMAND_PAGING int rxe_odp_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length, u64 iova, int access_flags, struct rxe_mr *mr); int rxe_odp_mr_copy(struct rxe_mr *mr, u64 iova, void *addr, int length, enum rxe_mr_copy_dir dir); -int rxe_odp_atomic_op(struct rxe_mr *mr, u64 iova, int opcode, - u64 compare, u64 swap_add, u64 *orig_val); +enum resp_states rxe_odp_atomic_op(struct rxe_mr *mr, u64 iova, int opcode, + u64 compare, u64 swap_add, u64 *orig_val); +int rxe_odp_flush_pmem_iova(struct rxe_mr *mr, u64 iova, + unsigned int length); +enum resp_states rxe_odp_do_atomic_write(struct rxe_mr *mr, u64 iova, u64 value); #else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */ static inline int rxe_odp_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length, u64 iova, @@ -212,9 +215,19 @@ static inline int rxe_odp_mr_copy(struct rxe_mr *mr, u64 iova, void *addr, { return -EOPNOTSUPP; } -static inline int +static inline enum resp_states rxe_odp_atomic_op(struct rxe_mr *mr, u64 iova, int opcode, - u64 compare, u64 swap_add, u64 *orig_val) + u64 compare, u64 swap_add, u64 *orig_val) +{ + return RESPST_ERR_UNSUPPORTED_OPCODE; +} +static inline int rxe_odp_flush_pmem_iova(struct rxe_mr *mr, u64 iova, + unsigned int length) +{ + return -EOPNOTSUPP; +} +static inline enum resp_states rxe_odp_do_atomic_write(struct rxe_mr *mr, + u64 iova, u64 value) { return RESPST_ERR_UNSUPPORTED_OPCODE; } diff --git a/drivers/infiniband/sw/rxe/rxe_mr.c b/drivers/infiniband/sw/rxe/rxe_mr.c index 432d864c3ce9..bcb97b3ea58a 100644 --- a/drivers/infiniband/sw/rxe/rxe_mr.c +++ b/drivers/infiniband/sw/rxe/rxe_mr.c @@ -424,7 +424,7 @@ err1: return err; } -int rxe_flush_pmem_iova(struct rxe_mr *mr, u64 iova, unsigned int length) +static int rxe_mr_flush_pmem_iova(struct rxe_mr *mr, u64 iova, unsigned int length) { unsigned int page_offset; unsigned long index; @@ -433,16 +433,6 @@ int rxe_flush_pmem_iova(struct rxe_mr *mr, u64 iova, unsigned int length) int err; u8 *va; - /* mr must be valid even if length is zero */ - if (WARN_ON(!mr)) - return -EINVAL; - - if (length == 0) - return 0; - - if (mr->ibmr.type == IB_MR_TYPE_DMA) - return -EFAULT; - err = mr_check_range(mr, iova, length); if (err) return 
err; @@ -454,7 +444,7 @@ int rxe_flush_pmem_iova(struct rxe_mr *mr, u64 iova, unsigned int length) if (!page) return -EFAULT; bytes = min_t(unsigned int, length, - mr_page_size(mr) - page_offset); + mr_page_size(mr) - page_offset); va = kmap_local_page(page); arch_wb_cache_pmem(va + page_offset, bytes); @@ -468,11 +458,33 @@ int rxe_flush_pmem_iova(struct rxe_mr *mr, u64 iova, unsigned int length) return 0; } +int rxe_flush_pmem_iova(struct rxe_mr *mr, u64 start, unsigned int length) +{ + int err; + + /* mr must be valid even if length is zero */ + if (WARN_ON(!mr)) + return -EINVAL; + + if (length == 0) + return 0; + + if (mr->ibmr.type == IB_MR_TYPE_DMA) + return -EFAULT; + + if (is_odp_mr(mr)) + err = rxe_odp_flush_pmem_iova(mr, start, length); + else + err = rxe_mr_flush_pmem_iova(mr, start, length); + + return err; +} + /* Guarantee atomicity of atomic operations at the machine level. */ DEFINE_SPINLOCK(atomic_ops_lock); -int rxe_mr_do_atomic_op(struct rxe_mr *mr, u64 iova, int opcode, - u64 compare, u64 swap_add, u64 *orig_val) +enum resp_states rxe_mr_do_atomic_op(struct rxe_mr *mr, u64 iova, int opcode, + u64 compare, u64 swap_add, u64 *orig_val) { unsigned int page_offset; struct page *page; @@ -524,27 +536,15 @@ int rxe_mr_do_atomic_op(struct rxe_mr *mr, u64 iova, int opcode, kunmap_local(va); - return 0; + return RESPST_NONE; } -#if defined CONFIG_64BIT -/* only implemented or called for 64 bit architectures */ -int rxe_mr_do_atomic_write(struct rxe_mr *mr, u64 iova, u64 value) +enum resp_states rxe_mr_do_atomic_write(struct rxe_mr *mr, u64 iova, u64 value) { unsigned int page_offset; struct page *page; u64 *va; - /* ODP is not supported right now. WIP. */ - if (is_odp_mr(mr)) - return RESPST_ERR_UNSUPPORTED_OPCODE; - - /* See IBA oA19-28 */ - if (unlikely(mr->state != RXE_MR_STATE_VALID)) { - rxe_dbg_mr(mr, "mr not in valid state\n"); - return RESPST_ERR_RKEY_VIOLATION; - } - if (mr->ibmr.type == IB_MR_TYPE_DMA) { page_offset = iova & (PAGE_SIZE - 1); page = ib_virt_dma_to_page(iova); @@ -572,20 +572,12 @@ int rxe_mr_do_atomic_write(struct rxe_mr *mr, u64 iova, u64 value) } va = kmap_local_page(page); - /* Do atomic write after all prior operations have completed */ smp_store_release(&va[page_offset >> 3], value); - kunmap_local(va); - return 0; -} -#else -int rxe_mr_do_atomic_write(struct rxe_mr *mr, u64 iova, u64 value) -{ - return RESPST_ERR_UNSUPPORTED_OPCODE; + return RESPST_NONE; } -#endif int advance_dma_data(struct rxe_dma_info *dma, unsigned int length) { diff --git a/drivers/infiniband/sw/rxe/rxe_odp.c b/drivers/infiniband/sw/rxe/rxe_odp.c index 9f6e2bb2a269..dbc5a5600eb7 100644 --- a/drivers/infiniband/sw/rxe/rxe_odp.c +++ b/drivers/infiniband/sw/rxe/rxe_odp.c @@ -4,6 +4,7 @@ */ #include <linux/hmm.h> +#include <linux/libnvdimm.h> #include <rdma/ib_umem_odp.h> @@ -26,7 +27,7 @@ static bool rxe_ib_invalidate_range(struct mmu_interval_notifier *mni, start = max_t(u64, ib_umem_start(umem_odp), range->start); end = min_t(u64, ib_umem_end(umem_odp), range->end); - /* update umem_odp->dma_list */ + /* update umem_odp->map.pfn_list */ ib_umem_odp_unmap_dma_pages(umem_odp, start, end); mutex_unlock(&umem_odp->umem_mutex); @@ -44,12 +45,11 @@ static int rxe_odp_do_pagefault_and_lock(struct rxe_mr *mr, u64 user_va, int bcn { struct ib_umem_odp *umem_odp = to_ib_umem_odp(mr->umem); bool fault = !(flags & RXE_PAGEFAULT_SNAPSHOT); - u64 access_mask; + u64 access_mask = 0; int np; - access_mask = ODP_READ_ALLOWED_BIT; if (umem_odp->umem.writable && !(flags & RXE_PAGEFAULT_RDONLY)) 
- access_mask |= ODP_WRITE_ALLOWED_BIT; + access_mask |= HMM_PFN_WRITE; /* * ib_umem_odp_map_dma_and_lock() locks umem_mutex on success. @@ -124,8 +124,8 @@ int rxe_odp_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length, return err; } -static inline bool rxe_check_pagefault(struct ib_umem_odp *umem_odp, - u64 iova, int length, u32 perm) +static inline bool rxe_check_pagefault(struct ib_umem_odp *umem_odp, u64 iova, + int length) { bool need_fault = false; u64 addr; @@ -137,7 +137,7 @@ static inline bool rxe_check_pagefault(struct ib_umem_odp *umem_odp, while (addr < iova + length) { idx = (addr - ib_umem_start(umem_odp)) >> umem_odp->page_shift; - if (!(umem_odp->dma_list[idx] & perm)) { + if (!(umem_odp->map.pfn_list[idx] & HMM_PFN_VALID)) { need_fault = true; break; } @@ -147,23 +147,28 @@ static inline bool rxe_check_pagefault(struct ib_umem_odp *umem_odp, return need_fault; } +static unsigned long rxe_odp_iova_to_index(struct ib_umem_odp *umem_odp, u64 iova) +{ + return (iova - ib_umem_start(umem_odp)) >> umem_odp->page_shift; +} + +static unsigned long rxe_odp_iova_to_page_offset(struct ib_umem_odp *umem_odp, u64 iova) +{ + return iova & (BIT(umem_odp->page_shift) - 1); +} + static int rxe_odp_map_range_and_lock(struct rxe_mr *mr, u64 iova, int length, u32 flags) { struct ib_umem_odp *umem_odp = to_ib_umem_odp(mr->umem); bool need_fault; - u64 perm; int err; if (unlikely(length < 1)) return -EINVAL; - perm = ODP_READ_ALLOWED_BIT; - if (!(flags & RXE_PAGEFAULT_RDONLY)) - perm |= ODP_WRITE_ALLOWED_BIT; - mutex_lock(&umem_odp->umem_mutex); - need_fault = rxe_check_pagefault(umem_odp, iova, length, perm); + need_fault = rxe_check_pagefault(umem_odp, iova, length); if (need_fault) { mutex_unlock(&umem_odp->umem_mutex); @@ -173,7 +178,7 @@ static int rxe_odp_map_range_and_lock(struct rxe_mr *mr, u64 iova, int length, u if (err < 0) return err; - need_fault = rxe_check_pagefault(umem_odp, iova, length, perm); + need_fault = rxe_check_pagefault(umem_odp, iova, length); if (need_fault) return -EFAULT; } @@ -190,13 +195,13 @@ static int __rxe_odp_mr_copy(struct rxe_mr *mr, u64 iova, void *addr, size_t offset; u8 *user_va; - idx = (iova - ib_umem_start(umem_odp)) >> umem_odp->page_shift; - offset = iova & (BIT(umem_odp->page_shift) - 1); + idx = rxe_odp_iova_to_index(umem_odp, iova); + offset = rxe_odp_iova_to_page_offset(umem_odp, iova); while (length > 0) { u8 *src, *dest; - page = hmm_pfn_to_page(umem_odp->pfn_list[idx]); + page = hmm_pfn_to_page(umem_odp->map.pfn_list[idx]); user_va = kmap_local_page(page); if (!user_va) return -EFAULT; @@ -255,8 +260,9 @@ int rxe_odp_mr_copy(struct rxe_mr *mr, u64 iova, void *addr, int length, return err; } -static int rxe_odp_do_atomic_op(struct rxe_mr *mr, u64 iova, int opcode, - u64 compare, u64 swap_add, u64 *orig_val) +static enum resp_states rxe_odp_do_atomic_op(struct rxe_mr *mr, u64 iova, + int opcode, u64 compare, + u64 swap_add, u64 *orig_val) { struct ib_umem_odp *umem_odp = to_ib_umem_odp(mr->umem); unsigned int page_offset; @@ -277,9 +283,9 @@ static int rxe_odp_do_atomic_op(struct rxe_mr *mr, u64 iova, int opcode, return RESPST_ERR_RKEY_VIOLATION; } - idx = (iova - ib_umem_start(umem_odp)) >> umem_odp->page_shift; - page_offset = iova & (BIT(umem_odp->page_shift) - 1); - page = hmm_pfn_to_page(umem_odp->pfn_list[idx]); + idx = rxe_odp_iova_to_index(umem_odp, iova); + page_offset = rxe_odp_iova_to_page_offset(umem_odp, iova); + page = hmm_pfn_to_page(umem_odp->map.pfn_list[idx]); if (!page) return RESPST_ERR_RKEY_VIOLATION; @@ -304,11 
+310,11 @@ static int rxe_odp_do_atomic_op(struct rxe_mr *mr, u64 iova, int opcode, kunmap_local(va); - return 0; + return RESPST_NONE; } -int rxe_odp_atomic_op(struct rxe_mr *mr, u64 iova, int opcode, - u64 compare, u64 swap_add, u64 *orig_val) +enum resp_states rxe_odp_atomic_op(struct rxe_mr *mr, u64 iova, int opcode, + u64 compare, u64 swap_add, u64 *orig_val) { struct ib_umem_odp *umem_odp = to_ib_umem_odp(mr->umem); int err; @@ -324,3 +330,91 @@ int rxe_odp_atomic_op(struct rxe_mr *mr, u64 iova, int opcode, return err; } + +int rxe_odp_flush_pmem_iova(struct rxe_mr *mr, u64 iova, + unsigned int length) +{ + struct ib_umem_odp *umem_odp = to_ib_umem_odp(mr->umem); + unsigned int page_offset; + unsigned long index; + struct page *page; + unsigned int bytes; + int err; + u8 *va; + + err = rxe_odp_map_range_and_lock(mr, iova, length, + RXE_PAGEFAULT_DEFAULT); + if (err) + return err; + + while (length > 0) { + index = rxe_odp_iova_to_index(umem_odp, iova); + page_offset = rxe_odp_iova_to_page_offset(umem_odp, iova); + + page = hmm_pfn_to_page(umem_odp->map.pfn_list[index]); + if (!page) { + mutex_unlock(&umem_odp->umem_mutex); + return -EFAULT; + } + + bytes = min_t(unsigned int, length, + mr_page_size(mr) - page_offset); + + va = kmap_local_page(page); + arch_wb_cache_pmem(va + page_offset, bytes); + kunmap_local(va); + + length -= bytes; + iova += bytes; + page_offset = 0; + } + + mutex_unlock(&umem_odp->umem_mutex); + + return 0; +} + +enum resp_states rxe_odp_do_atomic_write(struct rxe_mr *mr, u64 iova, u64 value) +{ + struct ib_umem_odp *umem_odp = to_ib_umem_odp(mr->umem); + unsigned int page_offset; + unsigned long index; + struct page *page; + int err; + u64 *va; + + /* See IBA oA19-28 */ + err = mr_check_range(mr, iova, sizeof(value)); + if (unlikely(err)) { + rxe_dbg_mr(mr, "iova out of range\n"); + return RESPST_ERR_RKEY_VIOLATION; + } + + err = rxe_odp_map_range_and_lock(mr, iova, sizeof(value), + RXE_PAGEFAULT_DEFAULT); + if (err) + return RESPST_ERR_RKEY_VIOLATION; + + page_offset = rxe_odp_iova_to_page_offset(umem_odp, iova); + index = rxe_odp_iova_to_index(umem_odp, iova); + page = hmm_pfn_to_page(umem_odp->map.pfn_list[index]); + if (!page) { + mutex_unlock(&umem_odp->umem_mutex); + return RESPST_ERR_RKEY_VIOLATION; + } + /* See IBA A19.4.2 */ + if (unlikely(page_offset & 0x7)) { + mutex_unlock(&umem_odp->umem_mutex); + rxe_dbg_mr(mr, "misaligned address\n"); + return RESPST_ERR_MISALIGNED_ATOMIC; + } + + va = kmap_local_page(page); + /* Do atomic write after all prior operations have completed */ + smp_store_release(&va[page_offset >> 3], value); + kunmap_local(va); + + mutex_unlock(&umem_odp->umem_mutex); + + return RESPST_NONE; +} diff --git a/drivers/infiniband/sw/rxe/rxe_param.h b/drivers/infiniband/sw/rxe/rxe_param.h index 003f681e5dc0..767870568372 100644 --- a/drivers/infiniband/sw/rxe/rxe_param.h +++ b/drivers/infiniband/sw/rxe/rxe_param.h @@ -53,12 +53,9 @@ enum rxe_device_param { | IB_DEVICE_MEM_WINDOW | IB_DEVICE_FLUSH_GLOBAL | IB_DEVICE_FLUSH_PERSISTENT -#ifdef CONFIG_64BIT | IB_DEVICE_MEM_WINDOW_TYPE_2B | IB_DEVICE_ATOMIC_WRITE, -#else - | IB_DEVICE_MEM_WINDOW_TYPE_2B, -#endif /* CONFIG_64BIT */ + RXE_MAX_SGE = 32, RXE_MAX_WQE_SIZE = sizeof(struct rxe_send_wqe) + sizeof(struct ib_sge) * RXE_MAX_SGE, diff --git a/drivers/infiniband/sw/rxe/rxe_qp.c b/drivers/infiniband/sw/rxe/rxe_qp.c index 7975fb0e2782..f2af3e0aef35 100644 --- a/drivers/infiniband/sw/rxe/rxe_qp.c +++ b/drivers/infiniband/sw/rxe/rxe_qp.c @@ -811,7 +811,12 @@ static void 
rxe_qp_do_cleanup(struct work_struct *work) spin_unlock_irqrestore(&qp->state_lock, flags); qp->qp_timeout_jiffies = 0; - if (qp_type(qp) == IB_QPT_RC) { + /* timer_setup() initializes .function. If .function is NULL, + * timer_setup() has not been called and the timer is not + * initialized; otherwise the timer is initialized. + */ + if (qp_type(qp) == IB_QPT_RC && qp->retrans_timer.function && + qp->rnr_nak_timer.function) { timer_delete_sync(&qp->retrans_timer); timer_delete_sync(&qp->rnr_nak_timer); } diff --git a/drivers/infiniband/sw/rxe/rxe_resp.c b/drivers/infiniband/sw/rxe/rxe_resp.c index 5d9174e408db..711f73e0bbb1 100644 --- a/drivers/infiniband/sw/rxe/rxe_resp.c +++ b/drivers/infiniband/sw/rxe/rxe_resp.c @@ -649,10 +649,6 @@ static enum resp_states process_flush(struct rxe_qp *qp, struct rxe_mr *mr = qp->resp.mr; struct resp_res *res = qp->resp.res; - /* ODP is not supported right now. WIP. */ - if (is_odp_mr(mr)) - return RESPST_ERR_UNSUPPORTED_OPCODE; - /* oA19-14, oA19-15 */ if (res && res->replay) return RESPST_ACKNOWLEDGE; @@ -753,7 +749,16 @@ static enum resp_states atomic_write_reply(struct rxe_qp *qp, value = *(u64 *)payload_addr(pkt); iova = qp->resp.va + qp->resp.offset; - err = rxe_mr_do_atomic_write(mr, iova, value); + /* See IBA oA19-28 */ + if (unlikely(mr->state != RXE_MR_STATE_VALID)) { + rxe_dbg_mr(mr, "mr not in valid state\n"); + return RESPST_ERR_RKEY_VIOLATION; + } + + if (is_odp_mr(mr)) + err = rxe_odp_do_atomic_write(mr, iova, value); + else + err = rxe_mr_do_atomic_write(mr, iova, value); if (err) return err; diff --git a/drivers/infiniband/sw/rxe/rxe_task.c b/drivers/infiniband/sw/rxe/rxe_task.c index 80332638d9e3..6f8f353e9583 100644 --- a/drivers/infiniband/sw/rxe/rxe_task.c +++ b/drivers/infiniband/sw/rxe/rxe_task.c @@ -85,17 +85,17 @@ static bool is_done(struct rxe_task *task) /* do_task is a wrapper for the three tasks (requester, * completer, responder) and calls them in a loop until - * they return a non-zero value. It is called either - * directly by rxe_run_task or indirectly if rxe_sched_task - * schedules the task. They must call __reserve_if_idle to - * move the task to busy before calling or scheduling. - * The task can also be moved to drained or invalid - * by calls to rxe_cleanup_task or rxe_disable_task. - * In that case tasks which get here are not executed but - * just flushed. The tasks are designed to look to see if - * there is work to do and then do part of it before returning - * here with a return value of zero until all the work - * has been consumed then it returns a non-zero value. + * they return a non-zero value. It is called indirectly + * when rxe_sched_task schedules the task. They must + * call __reserve_if_idle to move the task to busy before + * calling or scheduling. The task can also be moved to + * drained or invalid by calls to rxe_cleanup_task or + * rxe_disable_task. In that case tasks which get here + * are not executed but just flushed. The tasks are + * designed to look to see if there is work to do and + * then do part of it before returning here with a return + * value of zero until all the work has been consumed then + * it returns a non-zero value. * The number of times the task can be run is limited by * max iterations so one task cannot hold the cpu forever. * If the limit is hit and work remains the task is rescheduled.
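The rxe_qp_do_cleanup() hunk above relies on timer_setup() populating timer_list.function, so a NULL .function means the timer was never set up. A minimal sketch of that guard pattern, using hypothetical struct and field names and not taken from the patch itself:

	#include <linux/timer.h>

	/* Hypothetical QP structure; only the timers matter for this sketch. */
	struct example_qp {
		struct timer_list retrans_timer;
		struct timer_list rnr_nak_timer;
	};

	static void example_qp_cleanup(struct example_qp *qp)
	{
		/*
		 * timer_setup() stores the callback in .function, so a NULL
		 * .function means the timer was never initialized and must
		 * not be passed to timer_delete_sync().
		 */
		if (qp->retrans_timer.function)
			timer_delete_sync(&qp->retrans_timer);
		if (qp->rnr_nak_timer.function)
			timer_delete_sync(&qp->rnr_nak_timer);
	}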
@@ -234,24 +234,6 @@ void rxe_cleanup_task(struct rxe_task *task) spin_unlock_irqrestore(&task->lock, flags); } -/* run the task inline if it is currently idle - * cannot call do_task holding the lock - */ -void rxe_run_task(struct rxe_task *task) -{ - unsigned long flags; - bool run; - - WARN_ON(rxe_read(task->qp) <= 0); - - spin_lock_irqsave(&task->lock, flags); - run = __reserve_if_idle(task); - spin_unlock_irqrestore(&task->lock, flags); - - if (run) - do_task(task); -} - /* schedule the task to run later as a work queue entry. * the queue_work call can be called holding * the lock. diff --git a/drivers/infiniband/sw/rxe/rxe_task.h b/drivers/infiniband/sw/rxe/rxe_task.h index a63e258b3d66..a8c9a77b6027 100644 --- a/drivers/infiniband/sw/rxe/rxe_task.h +++ b/drivers/infiniband/sw/rxe/rxe_task.h @@ -47,8 +47,6 @@ int rxe_init_task(struct rxe_task *task, struct rxe_qp *qp, /* cleanup task */ void rxe_cleanup_task(struct rxe_task *task); -void rxe_run_task(struct rxe_task *task); - void rxe_sched_task(struct rxe_task *task); /* keep a task from scheduling */ diff --git a/drivers/infiniband/sw/siw/siw.h b/drivers/infiniband/sw/siw/siw.h index 385067e07faf..e49c8a76e22e 100644 --- a/drivers/infiniband/sw/siw/siw.h +++ b/drivers/infiniband/sw/siw/siw.h @@ -738,7 +738,7 @@ static inline void siw_crc_skb(struct siw_rx_stream *srx, unsigned int len) "MEM[0x%08x] %s: " fmt, mem->stag, __func__, ##__VA_ARGS__) #define siw_dbg_cep(cep, fmt, ...) \ - ibdev_dbg(&cep->sdev->base_dev, "CEP[0x%pK] %s: " fmt, \ + ibdev_dbg(&cep->sdev->base_dev, "CEP[0x%p] %s: " fmt, \ cep, __func__, ##__VA_ARGS__) void siw_cq_flush(struct siw_cq *cq); diff --git a/drivers/infiniband/sw/siw/siw_cq.c b/drivers/infiniband/sw/siw/siw_cq.c index f3c2226aff94..25b3c741b66b 100644 --- a/drivers/infiniband/sw/siw/siw_cq.c +++ b/drivers/infiniband/sw/siw/siw_cq.c @@ -72,7 +72,7 @@ int siw_reap_cqe(struct siw_cq *cq, struct ib_wc *wc) wc->opcode = map_wc_opcode[cqe->opcode]; wc->status = map_cqe_status[cqe->status].ib; siw_dbg_cq(cq, - "idx %u, type %d, flags %2x, id 0x%pK\n", + "idx %u, type %d, flags %2x, id 0x%p\n", cq->cq_get % cq->num_cqe, cqe->opcode, cqe->flags, (void *)(uintptr_t)cqe->id); } else { diff --git a/drivers/infiniband/sw/siw/siw_mem.c b/drivers/infiniband/sw/siw/siw_mem.c index dcb963607c8b..d5ddeb17bd22 100644 --- a/drivers/infiniband/sw/siw/siw_mem.c +++ b/drivers/infiniband/sw/siw/siw_mem.c @@ -18,30 +18,6 @@ #define SIW_STAG_MAX_INDEX 0x00ffffff /* - * The code avoids special Stag of zero and tries to randomize - * STag values between 1 and SIW_STAG_MAX_INDEX. - */ -int siw_mem_add(struct siw_device *sdev, struct siw_mem *m) -{ - struct xa_limit limit = XA_LIMIT(1, SIW_STAG_MAX_INDEX); - u32 id, next; - - get_random_bytes(&next, 4); - next &= SIW_STAG_MAX_INDEX; - - if (xa_alloc_cyclic(&sdev->mem_xa, &id, m, limit, &next, - GFP_KERNEL) < 0) - return -ENOMEM; - - /* Set the STag index part */ - m->stag = id << 8; - - siw_dbg_mem(m, "new MEM object\n"); - - return 0; -} - -/* * siw_mem_id2obj() * * resolves memory from stag given by id. 
might be called from: @@ -181,10 +157,10 @@ int siw_check_mem(struct ib_pd *pd, struct siw_mem *mem, u64 addr, */ if (addr < mem->va || addr + len > mem->va + mem->len) { siw_dbg_pd(pd, "MEM interval len %d\n", len); - siw_dbg_pd(pd, "[0x%pK, 0x%pK] out of bounds\n", + siw_dbg_pd(pd, "[0x%p, 0x%p] out of bounds\n", (void *)(uintptr_t)addr, (void *)(uintptr_t)(addr + len)); - siw_dbg_pd(pd, "[0x%pK, 0x%pK] STag=0x%08x\n", + siw_dbg_pd(pd, "[0x%p, 0x%p] STag=0x%08x\n", (void *)(uintptr_t)mem->va, (void *)(uintptr_t)(mem->va + mem->len), mem->stag); diff --git a/drivers/infiniband/sw/siw/siw_mem.h b/drivers/infiniband/sw/siw/siw_mem.h index e74cfcd6dbc1..8e769d30e2ac 100644 --- a/drivers/infiniband/sw/siw/siw_mem.h +++ b/drivers/infiniband/sw/siw/siw_mem.h @@ -12,7 +12,6 @@ void siw_umem_release(struct siw_umem *umem); struct siw_pbl *siw_pbl_alloc(u32 num_buf); dma_addr_t siw_pbl_get_buffer(struct siw_pbl *pbl, u64 off, int *len, int *idx); struct siw_mem *siw_mem_id2obj(struct siw_device *sdev, int stag_index); -int siw_mem_add(struct siw_device *sdev, struct siw_mem *m); int siw_invalidate_stag(struct ib_pd *pd, u32 stag); int siw_check_mem(struct ib_pd *pd, struct siw_mem *mem, u64 addr, enum ib_access_flags perms, int len); diff --git a/drivers/infiniband/sw/siw/siw_qp_rx.c b/drivers/infiniband/sw/siw/siw_qp_rx.c index 32554eba1eac..a10820e33887 100644 --- a/drivers/infiniband/sw/siw/siw_qp_rx.c +++ b/drivers/infiniband/sw/siw/siw_qp_rx.c @@ -38,7 +38,7 @@ static int siw_rx_umem(struct siw_rx_stream *srx, struct siw_umem *umem, p = siw_get_upage(umem, dest_addr); if (unlikely(!p)) { - pr_warn("siw: %s: [QP %u]: bogus addr: %pK, %pK\n", + pr_warn("siw: %s: [QP %u]: bogus addr: %p, %p\n", __func__, qp_id(rx_qp(srx)), (void *)(uintptr_t)dest_addr, (void *)(uintptr_t)umem->fp_addr); @@ -51,7 +51,7 @@ static int siw_rx_umem(struct siw_rx_stream *srx, struct siw_umem *umem, pg_off = dest_addr & ~PAGE_MASK; bytes = min(len, (int)PAGE_SIZE - pg_off); - siw_dbg_qp(rx_qp(srx), "page %pK, bytes=%u\n", p, bytes); + siw_dbg_qp(rx_qp(srx), "page %p, bytes=%u\n", p, bytes); dest = kmap_atomic(p); rv = skb_copy_bits(srx->skb, srx->skb_offset, dest + pg_off, @@ -105,11 +105,11 @@ static int siw_rx_kva(struct siw_rx_stream *srx, void *kva, int len) { int rv; - siw_dbg_qp(rx_qp(srx), "kva: 0x%pK, len: %u\n", kva, len); + siw_dbg_qp(rx_qp(srx), "kva: 0x%p, len: %u\n", kva, len); rv = skb_copy_bits(srx->skb, srx->skb_offset, kva, len); if (unlikely(rv)) { - pr_warn("siw: [QP %u]: %s, len %d, kva 0x%pK, rv %d\n", + pr_warn("siw: [QP %u]: %s, len %d, kva 0x%p, rv %d\n", qp_id(rx_qp(srx)), __func__, len, kva, rv); return rv; diff --git a/drivers/infiniband/sw/siw/siw_verbs.c b/drivers/infiniband/sw/siw/siw_verbs.c index fd7b266a221b..2b2a7b8e93b0 100644 --- a/drivers/infiniband/sw/siw/siw_verbs.c +++ b/drivers/infiniband/sw/siw/siw_verbs.c @@ -936,7 +936,7 @@ int siw_post_send(struct ib_qp *base_qp, const struct ib_send_wr *wr, rv = -EINVAL; break; } - siw_dbg_qp(qp, "opcode %d, flags 0x%x, wr_id 0x%pK\n", + siw_dbg_qp(qp, "opcode %d, flags 0x%x, wr_id 0x%p\n", sqe->opcode, sqe->flags, (void *)(uintptr_t)sqe->id); @@ -1102,7 +1102,7 @@ int siw_post_receive(struct ib_qp *base_qp, const struct ib_recv_wr *wr, siw_dbg_qp(qp, "error %d\n", rv); *bad_wr = wr; } - return rv > 0 ? 
0 : rv; + return rv; } int siw_destroy_cq(struct ib_cq *base_cq, struct ib_udata *udata) @@ -1332,7 +1332,7 @@ struct ib_mr *siw_reg_user_mr(struct ib_pd *pd, u64 start, u64 len, struct siw_device *sdev = to_siw_dev(pd->device); int rv; - siw_dbg_pd(pd, "start: 0x%pK, va: 0x%pK, len: %llu\n", + siw_dbg_pd(pd, "start: 0x%p, va: 0x%p, len: %llu\n", (void *)(uintptr_t)start, (void *)(uintptr_t)rnic_va, (unsigned long long)len); @@ -1525,7 +1525,7 @@ int siw_map_mr_sg(struct ib_mr *base_mr, struct scatterlist *sl, int num_sle, mem->len = base_mr->length; mem->va = base_mr->iova; siw_dbg_mem(mem, - "%llu bytes, start 0x%pK, %u SLE to %u entries\n", + "%llu bytes, start 0x%p, %u SLE to %u entries\n", mem->len, (void *)(uintptr_t)mem->va, num_sle, pbl->num_buf); } diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index a775e4dbe06f..98f7205ec8fb 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -27,6 +27,7 @@ #include <linux/msi.h> #include <linux/of_iommu.h> #include <linux/pci.h> +#include <linux/pci-p2pdma.h> #include <linux/scatterlist.h> #include <linux/spinlock.h> #include <linux/swiotlb.h> @@ -1137,6 +1138,54 @@ void iommu_dma_sync_sg_for_device(struct device *dev, struct scatterlist *sgl, arch_sync_dma_for_device(sg_phys(sg), sg->length, dir); } +static phys_addr_t iommu_dma_map_swiotlb(struct device *dev, phys_addr_t phys, + size_t size, enum dma_data_direction dir, unsigned long attrs) +{ + struct iommu_domain *domain = iommu_get_dma_domain(dev); + struct iova_domain *iovad = &domain->iova_cookie->iovad; + + if (!is_swiotlb_active(dev)) { + dev_warn_once(dev, "DMA bounce buffers are inactive, unable to map unaligned transaction.\n"); + return (phys_addr_t)DMA_MAPPING_ERROR; + } + + trace_swiotlb_bounced(dev, phys, size); + + phys = swiotlb_tbl_map_single(dev, phys, size, iova_mask(iovad), dir, + attrs); + + /* + * Untrusted devices should not see padding areas with random leftover + * kernel data, so zero the pre- and post-padding. + * swiotlb_tbl_map_single() has initialized the bounce buffer proper to + * the contents of the original memory buffer. + */ + if (phys != (phys_addr_t)DMA_MAPPING_ERROR && dev_is_untrusted(dev)) { + size_t start, virt = (size_t)phys_to_virt(phys); + + /* Pre-padding */ + start = iova_align_down(iovad, virt); + memset((void *)start, 0, virt - start); + + /* Post-padding */ + start = virt + size; + memset((void *)start, 0, iova_align(iovad, start) - start); + } + + return phys; +} + +/* + * Checks if a physical buffer has unaligned boundaries with respect to + * the IOMMU granule. Returns non-zero if either the start or end + * address is not aligned to the granule boundary. + */ +static inline size_t iova_unaligned(struct iova_domain *iovad, phys_addr_t phys, + size_t size) +{ + return iova_offset(iovad, phys | size); +} + dma_addr_t iommu_dma_map_page(struct device *dev, struct page *page, unsigned long offset, size_t size, enum dma_data_direction dir, unsigned long attrs) @@ -1150,42 +1199,14 @@ dma_addr_t iommu_dma_map_page(struct device *dev, struct page *page, dma_addr_t iova, dma_mask = dma_get_mask(dev); /* - * If both the physical buffer start address and size are - * page aligned, we don't need to use a bounce page. + * If both the physical buffer start address and size are page aligned, + * we don't need to use a bounce page. 
*/ if (dev_use_swiotlb(dev, size, dir) && - iova_offset(iovad, phys | size)) { - if (!is_swiotlb_active(dev)) { - dev_warn_once(dev, "DMA bounce buffers are inactive, unable to map unaligned transaction.\n"); + iova_unaligned(iovad, phys, size)) { + phys = iommu_dma_map_swiotlb(dev, phys, size, dir, attrs); + if (phys == (phys_addr_t)DMA_MAPPING_ERROR) return DMA_MAPPING_ERROR; - } - - trace_swiotlb_bounced(dev, phys, size); - - phys = swiotlb_tbl_map_single(dev, phys, size, - iova_mask(iovad), dir, attrs); - - if (phys == DMA_MAPPING_ERROR) - return DMA_MAPPING_ERROR; - - /* - * Untrusted devices should not see padding areas with random - * leftover kernel data, so zero the pre- and post-padding. - * swiotlb_tbl_map_single() has initialized the bounce buffer - * proper to the contents of the original memory buffer. - */ - if (dev_is_untrusted(dev)) { - size_t start, virt = (size_t)phys_to_virt(phys); - - /* Pre-padding */ - start = iova_align_down(iovad, virt); - memset((void *)start, 0, virt - start); - - /* Post-padding */ - start = virt + size; - memset((void *)start, 0, - iova_align(iovad, start) - start); - } } if (!coherent && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) @@ -1359,7 +1380,6 @@ int iommu_dma_map_sg(struct device *dev, struct scatterlist *sg, int nents, struct scatterlist *s, *prev = NULL; int prot = dma_info_to_prot(dir, dev_is_dma_coherent(dev), attrs); struct pci_p2pdma_map_state p2pdma_state = {}; - enum pci_p2pdma_map_type map; dma_addr_t iova; size_t iova_len = 0; unsigned long mask = dma_get_seg_boundary(dev); @@ -1389,28 +1409,30 @@ int iommu_dma_map_sg(struct device *dev, struct scatterlist *sg, int nents, size_t s_length = s->length; size_t pad_len = (mask - iova_len + 1) & mask; - if (is_pci_p2pdma_page(sg_page(s))) { - map = pci_p2pdma_map_segment(&p2pdma_state, dev, s); - switch (map) { - case PCI_P2PDMA_MAP_BUS_ADDR: - /* - * iommu_map_sg() will skip this segment as - * it is marked as a bus address, - * __finalise_sg() will copy the dma address - * into the output segment. - */ - continue; - case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE: - /* - * Mapping through host bridge should be - * mapped with regular IOVAs, thus we - * do nothing here and continue below. - */ - break; - default: - ret = -EREMOTEIO; - goto out_restore_sg; - } + switch (pci_p2pdma_state(&p2pdma_state, dev, sg_page(s))) { + case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE: + /* + * Mapping through host bridge should be mapped with + * regular IOVAs, thus we do nothing here and continue + * below. + */ + break; + case PCI_P2PDMA_MAP_NONE: + break; + case PCI_P2PDMA_MAP_BUS_ADDR: + /* + * iommu_map_sg() will skip this segment as it is marked + * as a bus address, __finalise_sg() will copy the dma + * address into the output segment. + */ + s->dma_address = pci_p2pdma_bus_addr_map(&p2pdma_state, + sg_phys(s)); + sg_dma_len(s) = sg->length; + sg_dma_mark_bus_address(s); + continue; + default: + ret = -EREMOTEIO; + goto out_restore_sg; } sg_dma_address(s) = s_iova_off; @@ -1721,6 +1743,354 @@ size_t iommu_dma_max_mapping_size(struct device *dev) return SIZE_MAX; } +/** + * dma_iova_try_alloc - Try to allocate an IOVA space + * @dev: Device to allocate the IOVA space for + * @state: IOVA state + * @phys: physical address + * @size: IOVA size + * + * Check if @dev supports the IOVA-based DMA API, and if yes allocate IOVA space + * for the given base address and size. + * + * Note: @phys is only used to calculate the IOVA alignment. Callers that always + * do PAGE_SIZE aligned transfers can safely pass 0 here. 
+ * + * Returns %true if the IOVA-based DMA API can be used and IOVA space has been + * allocated, or %false if the regular DMA API should be used. + */ +bool dma_iova_try_alloc(struct device *dev, struct dma_iova_state *state, + phys_addr_t phys, size_t size) +{ + struct iommu_dma_cookie *cookie; + struct iommu_domain *domain; + struct iova_domain *iovad; + size_t iova_off; + dma_addr_t addr; + + memset(state, 0, sizeof(*state)); + if (!use_dma_iommu(dev)) + return false; + + domain = iommu_get_dma_domain(dev); + cookie = domain->iova_cookie; + iovad = &cookie->iovad; + iova_off = iova_offset(iovad, phys); + + if (static_branch_unlikely(&iommu_deferred_attach_enabled) && + iommu_deferred_attach(dev, iommu_get_domain_for_dev(dev))) + return false; + + if (WARN_ON_ONCE(!size)) + return false; + + /* + * DMA_IOVA_USE_SWIOTLB is a flag set by the dma-iommu + * internals; make sure that the caller didn't set it and/or + * didn't use this interface to map SIZE_MAX. + */ + if (WARN_ON_ONCE((u64)size & DMA_IOVA_USE_SWIOTLB)) + return false; + + addr = iommu_dma_alloc_iova(domain, + iova_align(iovad, size + iova_off), + dma_get_mask(dev), dev); + if (!addr) + return false; + + state->addr = addr + iova_off; + state->__size = size; + return true; +} +EXPORT_SYMBOL_GPL(dma_iova_try_alloc); + +/** + * dma_iova_free - Free an IOVA space + * @dev: Device to free the IOVA space for + * @state: IOVA state + * + * Undoes a successful dma_iova_try_alloc(). + * + * Note that all dma_iova_link() calls need to be undone first. For callers + * that never call dma_iova_unlink(), dma_iova_destroy() can be used instead + * which unlinks all ranges and frees the IOVA space in a single efficient + * operation. + */ +void dma_iova_free(struct device *dev, struct dma_iova_state *state) +{ + struct iommu_domain *domain = iommu_get_dma_domain(dev); + struct iommu_dma_cookie *cookie = domain->iova_cookie; + struct iova_domain *iovad = &cookie->iovad; + size_t iova_start_pad = iova_offset(iovad, state->addr); + size_t size = dma_iova_size(state); + + iommu_dma_free_iova(domain, state->addr - iova_start_pad, + iova_align(iovad, size + iova_start_pad), NULL); +} +EXPORT_SYMBOL_GPL(dma_iova_free); + +static int __dma_iova_link(struct device *dev, dma_addr_t addr, + phys_addr_t phys, size_t size, enum dma_data_direction dir, + unsigned long attrs) +{ + bool coherent = dev_is_dma_coherent(dev); + + if (!coherent && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) + arch_sync_dma_for_device(phys, size, dir); + + return iommu_map_nosync(iommu_get_dma_domain(dev), addr, phys, size, + dma_info_to_prot(dir, coherent, attrs), GFP_ATOMIC); +} + +static int iommu_dma_iova_bounce_and_link(struct device *dev, dma_addr_t addr, + phys_addr_t phys, size_t bounce_len, + enum dma_data_direction dir, unsigned long attrs, + size_t iova_start_pad) +{ + struct iommu_domain *domain = iommu_get_dma_domain(dev); + struct iova_domain *iovad = &domain->iova_cookie->iovad; + phys_addr_t bounce_phys; + int error; + + bounce_phys = iommu_dma_map_swiotlb(dev, phys, bounce_len, dir, attrs); + if (bounce_phys == DMA_MAPPING_ERROR) + return -ENOMEM; + + error = __dma_iova_link(dev, addr - iova_start_pad, + bounce_phys - iova_start_pad, + iova_align(iovad, bounce_len), dir, attrs); + if (error) + swiotlb_tbl_unmap_single(dev, bounce_phys, bounce_len, dir, + attrs); + return error; +} + +static int iommu_dma_iova_link_swiotlb(struct device *dev, + struct dma_iova_state *state, phys_addr_t phys, size_t offset, + size_t size, enum dma_data_direction dir, unsigned long attrs)
+{ + struct iommu_domain *domain = iommu_get_dma_domain(dev); + struct iommu_dma_cookie *cookie = domain->iova_cookie; + struct iova_domain *iovad = &cookie->iovad; + size_t iova_start_pad = iova_offset(iovad, phys); + size_t iova_end_pad = iova_offset(iovad, phys + size); + dma_addr_t addr = state->addr + offset; + size_t mapped = 0; + int error; + + if (iova_start_pad) { + size_t bounce_len = min(size, iovad->granule - iova_start_pad); + + error = iommu_dma_iova_bounce_and_link(dev, addr, phys, + bounce_len, dir, attrs, iova_start_pad); + if (error) + return error; + state->__size |= DMA_IOVA_USE_SWIOTLB; + + mapped += bounce_len; + size -= bounce_len; + if (!size) + return 0; + } + + size -= iova_end_pad; + error = __dma_iova_link(dev, addr + mapped, phys + mapped, size, dir, + attrs); + if (error) + goto out_unmap; + mapped += size; + + if (iova_end_pad) { + error = iommu_dma_iova_bounce_and_link(dev, addr + mapped, + phys + mapped, iova_end_pad, dir, attrs, 0); + if (error) + goto out_unmap; + state->__size |= DMA_IOVA_USE_SWIOTLB; + } + + return 0; + +out_unmap: + dma_iova_unlink(dev, state, 0, mapped, dir, attrs); + return error; +} + +/** + * dma_iova_link - Link a range of IOVA space + * @dev: DMA device + * @state: IOVA state + * @phys: physical address to link + * @offset: offset into the IOVA state to map into + * @size: size of the buffer + * @dir: DMA direction + * @attrs: attributes of mapping properties + * + * Link a range of IOVA space for the given IOVA state without IOTLB sync. + * This function is used to link multiple physical addresses in contiguous + * IOVA space without performing costly IOTLB sync. + * + * The caller is responsible for calling dma_iova_sync() to sync the IOTLB at + * the end of linkage. + */ +int dma_iova_link(struct device *dev, struct dma_iova_state *state, + phys_addr_t phys, size_t offset, size_t size, + enum dma_data_direction dir, unsigned long attrs) +{ + struct iommu_domain *domain = iommu_get_dma_domain(dev); + struct iommu_dma_cookie *cookie = domain->iova_cookie; + struct iova_domain *iovad = &cookie->iovad; + size_t iova_start_pad = iova_offset(iovad, phys); + + if (WARN_ON_ONCE(iova_start_pad && offset > 0)) + return -EIO; + + if (dev_use_swiotlb(dev, size, dir) && + iova_unaligned(iovad, phys, size)) + return iommu_dma_iova_link_swiotlb(dev, state, phys, offset, + size, dir, attrs); + + return __dma_iova_link(dev, state->addr + offset - iova_start_pad, + phys - iova_start_pad, + iova_align(iovad, size + iova_start_pad), dir, attrs); +} +EXPORT_SYMBOL_GPL(dma_iova_link); + +/** + * dma_iova_sync - Sync IOTLB + * @dev: DMA device + * @state: IOVA state + * @offset: offset into the IOVA state to sync + * @size: size of the buffer + * + * Sync IOTLB for the given IOVA state. This function should be called on + * the IOVA-contiguous range created by one or more dma_iova_link() calls + * to sync the IOTLB.
+ */ +int dma_iova_sync(struct device *dev, struct dma_iova_state *state, + size_t offset, size_t size) +{ + struct iommu_domain *domain = iommu_get_dma_domain(dev); + struct iommu_dma_cookie *cookie = domain->iova_cookie; + struct iova_domain *iovad = &cookie->iovad; + dma_addr_t addr = state->addr + offset; + size_t iova_start_pad = iova_offset(iovad, addr); + + return iommu_sync_map(domain, addr - iova_start_pad, + iova_align(iovad, size + iova_start_pad)); +} +EXPORT_SYMBOL_GPL(dma_iova_sync); + +static void iommu_dma_iova_unlink_range_slow(struct device *dev, + dma_addr_t addr, size_t size, enum dma_data_direction dir, + unsigned long attrs) +{ + struct iommu_domain *domain = iommu_get_dma_domain(dev); + struct iommu_dma_cookie *cookie = domain->iova_cookie; + struct iova_domain *iovad = &cookie->iovad; + size_t iova_start_pad = iova_offset(iovad, addr); + dma_addr_t end = addr + size; + + do { + phys_addr_t phys; + size_t len; + + phys = iommu_iova_to_phys(domain, addr); + if (WARN_ON(!phys)) + /* Something very horrible happened here */ + return; + + len = min_t(size_t, + end - addr, iovad->granule - iova_start_pad); + + if (!dev_is_dma_coherent(dev) && + !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) + arch_sync_dma_for_cpu(phys, len, dir); + + swiotlb_tbl_unmap_single(dev, phys, len, dir, attrs); + + addr += len; + iova_start_pad = 0; + } while (addr < end); +} + +static void __iommu_dma_iova_unlink(struct device *dev, + struct dma_iova_state *state, size_t offset, size_t size, + enum dma_data_direction dir, unsigned long attrs, + bool free_iova) +{ + struct iommu_domain *domain = iommu_get_dma_domain(dev); + struct iommu_dma_cookie *cookie = domain->iova_cookie; + struct iova_domain *iovad = &cookie->iovad; + dma_addr_t addr = state->addr + offset; + size_t iova_start_pad = iova_offset(iovad, addr); + struct iommu_iotlb_gather iotlb_gather; + size_t unmapped; + + if ((state->__size & DMA_IOVA_USE_SWIOTLB) || + (!dev_is_dma_coherent(dev) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC))) + iommu_dma_iova_unlink_range_slow(dev, addr, size, dir, attrs); + + iommu_iotlb_gather_init(&iotlb_gather); + iotlb_gather.queued = free_iova && READ_ONCE(cookie->fq_domain); + + size = iova_align(iovad, size + iova_start_pad); + addr -= iova_start_pad; + unmapped = iommu_unmap_fast(domain, addr, size, &iotlb_gather); + WARN_ON(unmapped != size); + + if (!iotlb_gather.queued) + iommu_iotlb_sync(domain, &iotlb_gather); + if (free_iova) + iommu_dma_free_iova(domain, addr, size, &iotlb_gather); +} + +/** + * dma_iova_unlink - Unlink a range of IOVA space + * @dev: DMA device + * @state: IOVA state + * @offset: offset into the IOVA state to unlink + * @size: size of the buffer + * @dir: DMA direction + * @attrs: attributes of mapping properties + * + * Unlink a range of IOVA space for the given IOVA state. + */ +void dma_iova_unlink(struct device *dev, struct dma_iova_state *state, + size_t offset, size_t size, enum dma_data_direction dir, + unsigned long attrs) +{ + __iommu_dma_iova_unlink(dev, state, offset, size, dir, attrs, false); +} +EXPORT_SYMBOL_GPL(dma_iova_unlink); + +/** + * dma_iova_destroy - Finish a DMA mapping transaction + * @dev: DMA device + * @state: IOVA state + * @mapped_len: number of bytes to unmap + * @dir: DMA direction + * @attrs: attributes of mapping properties + * + * Unlink the IOVA range up to @mapped_len and free the entire IOVA space. The + * range of IOVA from dma_addr to @mapped_len must all be linked, and be the + * only linked IOVA in state.
+ */ +void dma_iova_destroy(struct device *dev, struct dma_iova_state *state, + size_t mapped_len, enum dma_data_direction dir, + unsigned long attrs) +{ + if (mapped_len) + __iommu_dma_iova_unlink(dev, state, 0, mapped_len, dir, attrs, + true); + else + /* + * We can be here if first call to dma_iova_link() failed and + * there is nothing to unlink, so let's be more clear. + */ + dma_iova_free(dev, state); +} +EXPORT_SYMBOL_GPL(dma_iova_destroy); + void iommu_setup_dma_ops(struct device *dev) { struct iommu_domain *domain = iommu_get_domain_for_dev(dev); diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 9d728800a862..6c02f93422ce 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -2443,8 +2443,8 @@ out_set_count: return pgsize; } -static int __iommu_map(struct iommu_domain *domain, unsigned long iova, - phys_addr_t paddr, size_t size, int prot, gfp_t gfp) +int iommu_map_nosync(struct iommu_domain *domain, unsigned long iova, + phys_addr_t paddr, size_t size, int prot, gfp_t gfp) { const struct iommu_domain_ops *ops = domain->ops; unsigned long orig_iova = iova; @@ -2453,12 +2453,19 @@ static int __iommu_map(struct iommu_domain *domain, unsigned long iova, phys_addr_t orig_paddr = paddr; int ret = 0; + might_sleep_if(gfpflags_allow_blocking(gfp)); + if (unlikely(!(domain->type & __IOMMU_DOMAIN_PAGING))) return -EINVAL; if (WARN_ON(!ops->map_pages || domain->pgsize_bitmap == 0UL)) return -ENODEV; + /* Discourage passing strange GFP flags */ + if (WARN_ON_ONCE(gfp & (__GFP_COMP | __GFP_DMA | __GFP_DMA32 | + __GFP_HIGHMEM))) + return -EINVAL; + /* find out the minimum page size supported */ min_pagesz = 1 << __ffs(domain->pgsize_bitmap); @@ -2506,31 +2513,27 @@ static int __iommu_map(struct iommu_domain *domain, unsigned long iova, return ret; } -int iommu_map(struct iommu_domain *domain, unsigned long iova, - phys_addr_t paddr, size_t size, int prot, gfp_t gfp) +int iommu_sync_map(struct iommu_domain *domain, unsigned long iova, size_t size) { const struct iommu_domain_ops *ops = domain->ops; - int ret; - might_sleep_if(gfpflags_allow_blocking(gfp)); - - /* Discourage passing strange GFP flags */ - if (WARN_ON_ONCE(gfp & (__GFP_COMP | __GFP_DMA | __GFP_DMA32 | - __GFP_HIGHMEM))) - return -EINVAL; + if (!ops->iotlb_sync_map) + return 0; + return ops->iotlb_sync_map(domain, iova, size); +} - ret = __iommu_map(domain, iova, paddr, size, prot, gfp); - if (ret == 0 && ops->iotlb_sync_map) { - ret = ops->iotlb_sync_map(domain, iova, size); - if (ret) - goto out_err; - } +int iommu_map(struct iommu_domain *domain, unsigned long iova, + phys_addr_t paddr, size_t size, int prot, gfp_t gfp) +{ + int ret; - return ret; + ret = iommu_map_nosync(domain, iova, paddr, size, prot, gfp); + if (ret) + return ret; -out_err: - /* undo mappings already done */ - iommu_unmap(domain, iova, size); + ret = iommu_sync_map(domain, iova, size); + if (ret) + iommu_unmap(domain, iova, size); return ret; } @@ -2618,6 +2621,25 @@ size_t iommu_unmap(struct iommu_domain *domain, } EXPORT_SYMBOL_GPL(iommu_unmap); +/** + * iommu_unmap_fast() - Remove mappings from a range of IOVA without IOTLB sync + * @domain: Domain to manipulate + * @iova: IO virtual address to start + * @size: Length of the range starting from @iova + * @iotlb_gather: range information for a pending IOTLB flush + * + * iommu_unmap_fast() will remove a translation created by iommu_map(). 
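The dma_iova_* helpers implemented above follow the allocate/link/sync/destroy flow described in the dma-api.rst hunk. A minimal one-shot driver-side sketch, assuming hypothetical dev/phys/len variables and trimming most error handling; this is an illustration, not part of the patch:

	#include <linux/dma-mapping.h>

	static int example_one_shot_map(struct device *dev, phys_addr_t phys,
					size_t len, enum dma_data_direction dir)
	{
		struct dma_iova_state state;
		int ret;

		/* Fall back to the regular streaming API if IOVA mode is unavailable */
		if (!dma_iova_try_alloc(dev, &state, phys, len))
			return -EOPNOTSUPP;

		ret = dma_iova_link(dev, &state, phys, 0, len, dir, 0);
		if (ret) {
			dma_iova_free(dev, &state);	/* nothing linked yet */
			return ret;
		}

		ret = dma_iova_sync(dev, &state, 0, len);
		if (ret) {
			dma_iova_destroy(dev, &state, len, dir, 0);
			return ret;
		}

		/* ... program state.addr into the device and wait for completion ... */

		dma_iova_destroy(dev, &state, len, dir, 0);
		return 0;
	}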
+ * It can't subdivide a mapping created by iommu_map(), so it should be + * called with IOVA ranges that match what was passed to iommu_map(). The + * range can aggregate contiguous iommu_map() calls so long as no individual + * range is split. + * + * Basically iommu_unmap_fast() is the same as iommu_unmap() but for callers + * which manage the IOTLB flushing externally to perform a batched sync. + * + * Returns: Number of bytes of IOVA unmapped. iova + res will be the point + * unmapping stopped. + */ size_t iommu_unmap_fast(struct iommu_domain *domain, unsigned long iova, size_t size, struct iommu_iotlb_gather *iotlb_gather) @@ -2630,26 +2652,17 @@ ssize_t iommu_map_sg(struct iommu_domain *domain, unsigned long iova, struct scatterlist *sg, unsigned int nents, int prot, gfp_t gfp) { - const struct iommu_domain_ops *ops = domain->ops; size_t len = 0, mapped = 0; phys_addr_t start; unsigned int i = 0; int ret; - might_sleep_if(gfpflags_allow_blocking(gfp)); - - /* Discourage passing strange GFP flags */ - if (WARN_ON_ONCE(gfp & (__GFP_COMP | __GFP_DMA | __GFP_DMA32 | - __GFP_HIGHMEM))) - return -EINVAL; - while (i <= nents) { phys_addr_t s_phys = sg_phys(sg); if (len && s_phys != start + len) { - ret = __iommu_map(domain, iova + mapped, start, + ret = iommu_map_nosync(domain, iova + mapped, start, len, prot, gfp); - if (ret) goto out_err; @@ -2672,11 +2685,10 @@ next: sg = sg_next(sg); } - if (ops->iotlb_sync_map) { - ret = ops->iotlb_sync_map(domain, iova, mapped); - if (ret) - goto out_err; - } + ret = iommu_sync_map(domain, iova, mapped); + if (ret) + goto out_err; + return mapped; out_err: diff --git a/drivers/net/ethernet/intel/ice/devlink/devlink.c b/drivers/net/ethernet/intel/ice/devlink/devlink.c index fcb199efbea5..4af60e2f37df 100644 --- a/drivers/net/ethernet/intel/ice/devlink/devlink.c +++ b/drivers/net/ethernet/intel/ice/devlink/devlink.c @@ -1339,8 +1339,13 @@ ice_devlink_enable_roce_get(struct devlink *devlink, u32 id, struct devlink_param_gset_ctx *ctx) { struct ice_pf *pf = devlink_priv(devlink); + struct iidc_rdma_core_dev_info *cdev; - ctx->val.vbool = pf->rdma_mode & IIDC_RDMA_PROTOCOL_ROCEV2 ? true : false; + cdev = pf->cdev_info; + if (!cdev) + return -ENODEV; + + ctx->val.vbool = !!(cdev->rdma_protocol & IIDC_RDMA_PROTOCOL_ROCEV2); return 0; } @@ -1350,19 +1355,24 @@ static int ice_devlink_enable_roce_set(struct devlink *devlink, u32 id, struct netlink_ext_ack *extack) { struct ice_pf *pf = devlink_priv(devlink); + struct iidc_rdma_core_dev_info *cdev; bool roce_ena = ctx->val.vbool; int ret; + cdev = pf->cdev_info; + if (!cdev) + return -ENODEV; + if (!roce_ena) { ice_unplug_aux_dev(pf); - pf->rdma_mode &= ~IIDC_RDMA_PROTOCOL_ROCEV2; + cdev->rdma_protocol &= ~IIDC_RDMA_PROTOCOL_ROCEV2; return 0; } - pf->rdma_mode |= IIDC_RDMA_PROTOCOL_ROCEV2; + cdev->rdma_protocol |= IIDC_RDMA_PROTOCOL_ROCEV2; ret = ice_plug_aux_dev(pf); if (ret) - pf->rdma_mode &= ~IIDC_RDMA_PROTOCOL_ROCEV2; + cdev->rdma_protocol &= ~IIDC_RDMA_PROTOCOL_ROCEV2; return ret; } @@ -1373,11 +1383,16 @@ ice_devlink_enable_roce_validate(struct devlink *devlink, u32 id, struct netlink_ext_ack *extack) { struct ice_pf *pf = devlink_priv(devlink); + struct iidc_rdma_core_dev_info *cdev; + + cdev = pf->cdev_info; + if (!cdev) + return -ENODEV; if (!test_bit(ICE_FLAG_RDMA_ENA, pf->flags)) return -EOPNOTSUPP; - if (pf->rdma_mode & IIDC_RDMA_PROTOCOL_IWARP) { + if (cdev->rdma_protocol & IIDC_RDMA_PROTOCOL_IWARP) { NL_SET_ERR_MSG_MOD(extack, "iWARP is currently enabled. 
This device cannot enable iWARP and RoCEv2 simultaneously"); return -EOPNOTSUPP; } @@ -1390,8 +1405,13 @@ ice_devlink_enable_iw_get(struct devlink *devlink, u32 id, struct devlink_param_gset_ctx *ctx) { struct ice_pf *pf = devlink_priv(devlink); + struct iidc_rdma_core_dev_info *cdev; - ctx->val.vbool = pf->rdma_mode & IIDC_RDMA_PROTOCOL_IWARP; + cdev = pf->cdev_info; + if (!cdev) + return -ENODEV; + + ctx->val.vbool = !!(cdev->rdma_protocol & IIDC_RDMA_PROTOCOL_IWARP); return 0; } @@ -1401,19 +1421,24 @@ static int ice_devlink_enable_iw_set(struct devlink *devlink, u32 id, struct netlink_ext_ack *extack) { struct ice_pf *pf = devlink_priv(devlink); + struct iidc_rdma_core_dev_info *cdev; bool iw_ena = ctx->val.vbool; int ret; + cdev = pf->cdev_info; + if (!cdev) + return -ENODEV; + if (!iw_ena) { ice_unplug_aux_dev(pf); - pf->rdma_mode &= ~IIDC_RDMA_PROTOCOL_IWARP; + cdev->rdma_protocol &= ~IIDC_RDMA_PROTOCOL_IWARP; return 0; } - pf->rdma_mode |= IIDC_RDMA_PROTOCOL_IWARP; + cdev->rdma_protocol |= IIDC_RDMA_PROTOCOL_IWARP; ret = ice_plug_aux_dev(pf); if (ret) - pf->rdma_mode &= ~IIDC_RDMA_PROTOCOL_IWARP; + cdev->rdma_protocol &= ~IIDC_RDMA_PROTOCOL_IWARP; return ret; } @@ -1428,7 +1453,7 @@ ice_devlink_enable_iw_validate(struct devlink *devlink, u32 id, if (!test_bit(ICE_FLAG_RDMA_ENA, pf->flags)) return -EOPNOTSUPP; - if (pf->rdma_mode & IIDC_RDMA_PROTOCOL_ROCEV2) { + if (pf->cdev_info->rdma_protocol & IIDC_RDMA_PROTOCOL_ROCEV2) { NL_SET_ERR_MSG_MOD(extack, "RoCEv2 is currently enabled. This device cannot enable iWARP and RoCEv2 simultaneously"); return -EOPNOTSUPP; } diff --git a/drivers/net/ethernet/intel/ice/ice.h b/drivers/net/ethernet/intel/ice/ice.h index fd083647c14a..e27d9044bcb3 100644 --- a/drivers/net/ethernet/intel/ice/ice.h +++ b/drivers/net/ethernet/intel/ice/ice.h @@ -401,7 +401,6 @@ struct ice_vsi { u16 req_rxq; /* User requested Rx queues */ u16 num_rx_desc; u16 num_tx_desc; - u16 qset_handle[ICE_MAX_TRAFFIC_CLASS]; struct ice_tc_cfg tc_cfg; struct bpf_prog *xdp_prog; struct ice_tx_ring **xdp_rings; /* XDP ring array */ @@ -557,7 +556,6 @@ struct ice_pf { struct devlink_port devlink_port; /* OS reserved IRQ details */ - struct msix_entry *msix_entries; struct ice_irq_tracker irq_tracker; struct ice_virt_irq_tracker virt_irq_tracker; @@ -592,7 +590,6 @@ struct ice_pf { struct gnss_serial *gnss_serial; struct gnss_device *gnss_dev; u16 num_rdma_msix; /* Total MSIX vectors for RDMA driver */ - u16 rdma_base_vector; /* spinlock to protect the AdminQ wait list */ spinlock_t aq_wait_lock; @@ -625,14 +622,12 @@ struct ice_pf { struct ice_hw_port_stats stats_prev; struct ice_hw hw; u8 stat_prev_loaded:1; /* has previous stats been loaded */ - u8 rdma_mode; u16 dcbx_cap; u32 tx_timeout_count; unsigned long tx_timeout_last_recovery; u32 tx_timeout_recovery_level; char int_name[ICE_INT_NAME_STR_LEN]; char int_name_ll_ts[ICE_INT_NAME_STR_LEN]; - struct auxiliary_device *adev; int aux_idx; u32 sw_int_count; /* count of tc_flower filters specific to channel (aka where filter @@ -664,6 +659,7 @@ struct ice_pf { struct ice_dplls dplls; struct device *hwmon_dev; struct ice_health health_reporters; + struct iidc_rdma_core_dev_info *cdev_info; u8 num_quanta_prof_used; }; diff --git a/drivers/net/ethernet/intel/ice/ice_dcb.c b/drivers/net/ethernet/intel/ice/ice_dcb.c index 74418c445cc4..64737fc62306 100644 --- a/drivers/net/ethernet/intel/ice/ice_dcb.c +++ b/drivers/net/ethernet/intel/ice/ice_dcb.c @@ -1288,7 +1288,7 @@ ice_add_dscp_up_tlv(struct ice_lldp_org_tlv *tlv, struct ice_dcbx_cfg 
*dcbcfg) tlv->ouisubtype = htonl(ouisubtype); /* bytes 0 - 63 - IPv4 DSCP2UP LUT */ - for (i = 0; i < ICE_DSCP_NUM_VAL; i++) { + for (i = 0; i < DSCP_MAX; i++) { /* IPv4 mapping */ buf[i] = dcbcfg->dscp_map[i]; /* IPv6 mapping */ diff --git a/drivers/net/ethernet/intel/ice/ice_dcb_lib.c b/drivers/net/ethernet/intel/ice/ice_dcb_lib.c index a7c510832824..c5ef33c100d6 100644 --- a/drivers/net/ethernet/intel/ice/ice_dcb_lib.c +++ b/drivers/net/ethernet/intel/ice/ice_dcb_lib.c @@ -352,8 +352,8 @@ int ice_pf_dcb_cfg(struct ice_pf *pf, struct ice_dcbx_cfg *new_cfg, bool locked) struct ice_aqc_port_ets_elem buf = { 0 }; struct ice_dcbx_cfg *old_cfg, *curr_cfg; struct device *dev = ice_pf_to_dev(pf); + struct iidc_rdma_event *event; int ret = ICE_DCB_NO_HW_CHG; - struct iidc_event *event; struct ice_vsi *pf_vsi; curr_cfg = &pf->hw.port_info->qos_cfg.local_dcbx_cfg; @@ -405,7 +405,7 @@ int ice_pf_dcb_cfg(struct ice_pf *pf, struct ice_dcbx_cfg *new_cfg, bool locked) goto free_cfg; } - set_bit(IIDC_EVENT_BEFORE_TC_CHANGE, event->type); + set_bit(IIDC_RDMA_EVENT_BEFORE_TC_CHANGE, event->type); ice_send_event_to_aux(pf, event); kfree(event); @@ -740,7 +740,9 @@ static int ice_dcb_noncontig_cfg(struct ice_pf *pf) void ice_pf_dcb_recfg(struct ice_pf *pf, bool locked) { struct ice_dcbx_cfg *dcbcfg = &pf->hw.port_info->qos_cfg.local_dcbx_cfg; - struct iidc_event *event; + struct iidc_rdma_priv_dev_info *privd; + struct iidc_rdma_core_dev_info *cdev; + struct iidc_rdma_event *event; u8 tc_map = 0; int v, ret; @@ -783,13 +785,17 @@ void ice_pf_dcb_recfg(struct ice_pf *pf, bool locked) if (vsi->type == ICE_VSI_PF) ice_dcbnl_set_all(vsi); } - if (!locked) { + + cdev = pf->cdev_info; + if (cdev && !locked) { + privd = cdev->iidc_priv; + ice_setup_dcb_qos_info(pf, &privd->qos_info); /* Notify the AUX drivers that TC change is finished */ event = kzalloc(sizeof(*event), GFP_KERNEL); if (!event) return; - set_bit(IIDC_EVENT_AFTER_TC_CHANGE, event->type); + set_bit(IIDC_RDMA_EVENT_AFTER_TC_CHANGE, event->type); ice_send_event_to_aux(pf, event); kfree(event); } @@ -945,6 +951,37 @@ ice_tx_prepare_vlan_flags_dcb(struct ice_tx_ring *tx_ring, } /** + * ice_setup_dcb_qos_info - Setup DCB QoS information + * @pf: ptr to ice_pf + * @qos_info: QoS param instance + */ +void ice_setup_dcb_qos_info(struct ice_pf *pf, struct iidc_rdma_qos_params *qos_info) +{ + struct ice_dcbx_cfg *dcbx_cfg; + unsigned int i; + u32 up2tc; + + if (!pf || !qos_info) + return; + + dcbx_cfg = &pf->hw.port_info->qos_cfg.local_dcbx_cfg; + up2tc = rd32(&pf->hw, PRTDCB_TUP2TC); + + qos_info->num_tc = ice_dcb_get_num_tc(dcbx_cfg); + + for (i = 0; i < IIDC_MAX_USER_PRIORITY; i++) + qos_info->up2tc[i] = (up2tc >> (i * 3)) & 0x7; + + for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++) + qos_info->tc_info[i].rel_bw = dcbx_cfg->etscfg.tcbwtable[i]; + + qos_info->pfc_mode = dcbx_cfg->pfc_mode; + if (qos_info->pfc_mode == IIDC_DSCP_PFC_MODE) + for (i = 0; i < DSCP_MAX; i++) + qos_info->dscp_map[i] = dcbx_cfg->dscp_map[i]; +} + +/** * ice_dcb_is_mib_change_pending - Check if MIB change is pending * @state: MIB change state */ diff --git a/drivers/net/ethernet/intel/ice/ice_dcb_lib.h b/drivers/net/ethernet/intel/ice/ice_dcb_lib.h index 800879a88c5e..da9ba814b4e8 100644 --- a/drivers/net/ethernet/intel/ice/ice_dcb_lib.h +++ b/drivers/net/ethernet/intel/ice/ice_dcb_lib.h @@ -31,6 +31,9 @@ void ice_tx_prepare_vlan_flags_dcb(struct ice_tx_ring *tx_ring, struct ice_tx_buf *first); void +ice_setup_dcb_qos_info(struct ice_pf *pf, + struct iidc_rdma_qos_params *qos_info); +void 
ice_dcb_process_lldp_set_mib_change(struct ice_pf *pf, struct ice_rq_event_info *event); /** @@ -134,5 +137,11 @@ static inline void ice_update_dcb_stats(struct ice_pf *pf) { } static inline void ice_dcb_process_lldp_set_mib_change(struct ice_pf *pf, struct ice_rq_event_info *event) { } static inline void ice_set_cgd_num(struct ice_tlan_ctx *tlan_ctx, u8 dcb_tc) { } +static inline void +ice_setup_dcb_qos_info(struct ice_pf *pf, struct iidc_rdma_qos_params *qos_info) +{ + qos_info->num_tc = 1; + qos_info->tc_info[0].rel_bw = 100; +} #endif /* CONFIG_DCB */ #endif /* _ICE_DCB_LIB_H_ */ diff --git a/drivers/net/ethernet/intel/ice/ice_dcb_nl.c b/drivers/net/ethernet/intel/ice/ice_dcb_nl.c index 6d50b90a7359..a10c1c8d8697 100644 --- a/drivers/net/ethernet/intel/ice/ice_dcb_nl.c +++ b/drivers/net/ethernet/intel/ice/ice_dcb_nl.c @@ -754,7 +754,7 @@ static int ice_dcbnl_setapp(struct net_device *netdev, struct dcb_app *app) if (!ice_is_feature_supported(pf, ICE_F_DSCP)) return -EOPNOTSUPP; - if (app->protocol >= ICE_DSCP_NUM_VAL) { + if (app->protocol >= DSCP_MAX) { netdev_err(netdev, "DSCP value 0x%04X out of range\n", app->protocol); return -EINVAL; @@ -931,7 +931,7 @@ static int ice_dcbnl_delapp(struct net_device *netdev, struct dcb_app *app) /* if the last DSCP mapping just got deleted, need to switch * to L2 VLAN QoS mode */ - if (bitmap_empty(new_cfg->dscp_mapped, ICE_DSCP_NUM_VAL) && + if (bitmap_empty(new_cfg->dscp_mapped, DSCP_MAX) && new_cfg->pfc_mode == ICE_QOS_MODE_DSCP) { ret = ice_aq_set_pfc_mode(&pf->hw, ICE_AQC_PFC_VLAN_BASED_PFC, diff --git a/drivers/net/ethernet/intel/ice/ice_ethtool.c b/drivers/net/ethernet/intel/ice/ice_ethtool.c index 7c2dc347e4e5..46fbcd391a80 100644 --- a/drivers/net/ethernet/intel/ice/ice_ethtool.c +++ b/drivers/net/ethernet/intel/ice/ice_ethtool.c @@ -3964,11 +3964,11 @@ static int ice_set_channels(struct net_device *dev, struct ethtool_channels *ch) return -EINVAL; } - if (pf->adev) { + if (pf->cdev_info && pf->cdev_info->adev) { mutex_lock(&pf->adev_mutex); - device_lock(&pf->adev->dev); + device_lock(&pf->cdev_info->adev->dev); locked = true; - if (pf->adev->dev.driver) { + if (pf->cdev_info->adev->dev.driver) { netdev_err(dev, "Cannot change channels when RDMA is active\n"); ret = -EBUSY; goto adev_unlock; @@ -3987,7 +3987,7 @@ static int ice_set_channels(struct net_device *dev, struct ethtool_channels *ch) adev_unlock: if (locked) { - device_unlock(&pf->adev->dev); + device_unlock(&pf->cdev_info->adev->dev); mutex_unlock(&pf->adev_mutex); } return ret; diff --git a/drivers/net/ethernet/intel/ice/ice_idc.c b/drivers/net/ethernet/intel/ice/ice_idc.c index bab3e81cad5d..6ab53e430f91 100644 --- a/drivers/net/ethernet/intel/ice/ice_idc.c +++ b/drivers/net/ethernet/intel/ice/ice_idc.c @@ -9,22 +9,25 @@ static DEFINE_XARRAY_ALLOC1(ice_aux_id); /** - * ice_get_auxiliary_drv - retrieve iidc_auxiliary_drv struct - * @pf: pointer to PF struct + * ice_get_auxiliary_drv - retrieve iidc_rdma_core_auxiliary_drv struct + * @cdev: pointer to iidc_rdma_core_dev_info struct * * This function has to be called with a device_lock on the - * pf->adev.dev to avoid race conditions. + * cdev->adev.dev to avoid race conditions. 
+ * + * Return: pointer to the matched auxiliary driver struct */ -static struct iidc_auxiliary_drv *ice_get_auxiliary_drv(struct ice_pf *pf) +static struct iidc_rdma_core_auxiliary_drv * +ice_get_auxiliary_drv(struct iidc_rdma_core_dev_info *cdev) { struct auxiliary_device *adev; - adev = pf->adev; + adev = cdev->adev; if (!adev || !adev->dev.driver) return NULL; - return container_of(adev->dev.driver, struct iidc_auxiliary_drv, - adrv.driver); + return container_of(adev->dev.driver, + struct iidc_rdma_core_auxiliary_drv, adrv.driver); } /** @@ -32,44 +35,54 @@ static struct iidc_auxiliary_drv *ice_get_auxiliary_drv(struct ice_pf *pf) * @pf: pointer to PF struct * @event: event struct */ -void ice_send_event_to_aux(struct ice_pf *pf, struct iidc_event *event) +void ice_send_event_to_aux(struct ice_pf *pf, struct iidc_rdma_event *event) { - struct iidc_auxiliary_drv *iadrv; + struct iidc_rdma_core_auxiliary_drv *iadrv; + struct iidc_rdma_core_dev_info *cdev; if (WARN_ON_ONCE(!in_task())) return; + cdev = pf->cdev_info; + if (!cdev) + return; + mutex_lock(&pf->adev_mutex); - if (!pf->adev) + if (!cdev->adev) goto finish; - device_lock(&pf->adev->dev); - iadrv = ice_get_auxiliary_drv(pf); + device_lock(&cdev->adev->dev); + iadrv = ice_get_auxiliary_drv(cdev); if (iadrv && iadrv->event_handler) - iadrv->event_handler(pf, event); - device_unlock(&pf->adev->dev); + iadrv->event_handler(cdev, event); + device_unlock(&cdev->adev->dev); finish: mutex_unlock(&pf->adev_mutex); } /** * ice_add_rdma_qset - Add Leaf Node for RDMA Qset - * @pf: PF struct + * @cdev: pointer to iidc_rdma_core_dev_info struct * @qset: Resource to be allocated + * + * Return: Zero on success or error code encountered */ -int ice_add_rdma_qset(struct ice_pf *pf, struct iidc_rdma_qset_params *qset) +int ice_add_rdma_qset(struct iidc_rdma_core_dev_info *cdev, + struct iidc_rdma_qset_params *qset) { u16 max_rdmaqs[ICE_MAX_TRAFFIC_CLASS]; struct ice_vsi *vsi; struct device *dev; + struct ice_pf *pf; u32 qset_teid; u16 qs_handle; int status; int i; - if (WARN_ON(!pf || !qset)) + if (WARN_ON(!cdev || !qset)) return -EINVAL; + pf = pci_get_drvdata(cdev->pdev); dev = ice_pf_to_dev(pf); if (!ice_is_rdma_ena(pf)) @@ -100,7 +113,6 @@ int ice_add_rdma_qset(struct ice_pf *pf, struct iidc_rdma_qset_params *qset) dev_err(dev, "Failed VSI RDMA Qset enable\n"); return status; } - vsi->qset_handle[qset->tc] = qset->qs_handle; qset->teid = qset_teid; return 0; @@ -109,18 +121,23 @@ EXPORT_SYMBOL_GPL(ice_add_rdma_qset); /** * ice_del_rdma_qset - Delete leaf node for RDMA Qset - * @pf: PF struct + * @cdev: pointer to iidc_rdma_core_dev_info struct * @qset: Resource to be freed + * + * Return: Zero on success, error code on failure */ -int ice_del_rdma_qset(struct ice_pf *pf, struct iidc_rdma_qset_params *qset) +int ice_del_rdma_qset(struct iidc_rdma_core_dev_info *cdev, + struct iidc_rdma_qset_params *qset) { struct ice_vsi *vsi; + struct ice_pf *pf; u32 teid; u16 q_id; - if (WARN_ON(!pf || !qset)) + if (WARN_ON(!cdev || !qset)) return -EINVAL; + pf = pci_get_drvdata(cdev->pdev); vsi = ice_find_vsi(pf, qset->vport_id); if (!vsi) { dev_err(ice_pf_to_dev(pf), "RDMA Invalid VSI\n"); @@ -130,36 +147,36 @@ int ice_del_rdma_qset(struct ice_pf *pf, struct iidc_rdma_qset_params *qset) q_id = qset->qs_handle; teid = qset->teid; - vsi->qset_handle[qset->tc] = 0; - return ice_dis_vsi_rdma_qset(vsi->port_info, 1, &teid, &q_id); } EXPORT_SYMBOL_GPL(ice_del_rdma_qset); /** * ice_rdma_request_reset - accept request from RDMA to perform a reset - * @pf: struct 
for PF + * @cdev: pointer to iidc_rdma_core_dev_info struct * @reset_type: type of reset + * + * Return: Zero on success, error code on failure */ -int ice_rdma_request_reset(struct ice_pf *pf, enum iidc_reset_type reset_type) +int ice_rdma_request_reset(struct iidc_rdma_core_dev_info *cdev, + enum iidc_rdma_reset_type reset_type) { enum ice_reset_req reset; + struct ice_pf *pf; - if (WARN_ON(!pf)) + if (WARN_ON(!cdev)) return -EINVAL; + pf = pci_get_drvdata(cdev->pdev); + switch (reset_type) { - case IIDC_PFR: + case IIDC_FUNC_RESET: reset = ICE_RESET_PFR; break; - case IIDC_CORER: + case IIDC_DEV_RESET: reset = ICE_RESET_CORER; break; - case IIDC_GLOBR: - reset = ICE_RESET_GLOBR; - break; default: - dev_err(ice_pf_to_dev(pf), "incorrect reset request\n"); return -EINVAL; } @@ -169,18 +186,23 @@ EXPORT_SYMBOL_GPL(ice_rdma_request_reset); /** * ice_rdma_update_vsi_filter - update main VSI filters for RDMA - * @pf: pointer to struct for PF + * @cdev: pointer to iidc_rdma_core_dev_info struct * @vsi_id: VSI HW idx to update filter on * @enable: bool whether to enable or disable filters + * + * Return: Zero on success, error code on failure */ -int ice_rdma_update_vsi_filter(struct ice_pf *pf, u16 vsi_id, bool enable) +int ice_rdma_update_vsi_filter(struct iidc_rdma_core_dev_info *cdev, + u16 vsi_id, bool enable) { struct ice_vsi *vsi; + struct ice_pf *pf; int status; - if (WARN_ON(!pf)) + if (WARN_ON(!cdev)) return -EINVAL; + pf = pci_get_drvdata(cdev->pdev); vsi = ice_find_vsi(pf, vsi_id); if (!vsi) return -EINVAL; @@ -201,37 +223,23 @@ int ice_rdma_update_vsi_filter(struct ice_pf *pf, u16 vsi_id, bool enable) EXPORT_SYMBOL_GPL(ice_rdma_update_vsi_filter); /** - * ice_get_qos_params - parse QoS params for RDMA consumption - * @pf: pointer to PF struct - * @qos: set of QoS values + * ice_alloc_rdma_qvector - alloc vector resources reserved for RDMA driver + * @cdev: pointer to iidc_rdma_core_dev_info struct + * @entry: MSI-X entry to be removed + * + * Return: Zero on success, error code on failure */ -void ice_get_qos_params(struct ice_pf *pf, struct iidc_qos_params *qos) +int ice_alloc_rdma_qvector(struct iidc_rdma_core_dev_info *cdev, + struct msix_entry *entry) { - struct ice_dcbx_cfg *dcbx_cfg; - unsigned int i; - u32 up2tc; - - dcbx_cfg = &pf->hw.port_info->qos_cfg.local_dcbx_cfg; - up2tc = rd32(&pf->hw, PRTDCB_TUP2TC); - - qos->num_tc = ice_dcb_get_num_tc(dcbx_cfg); - for (i = 0; i < IIDC_MAX_USER_PRIORITY; i++) - qos->up2tc[i] = (up2tc >> (i * 3)) & 0x7; - - for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++) - qos->tc_info[i].rel_bw = dcbx_cfg->etscfg.tcbwtable[i]; - - qos->pfc_mode = dcbx_cfg->pfc_mode; - if (qos->pfc_mode == IIDC_DSCP_PFC_MODE) - for (i = 0; i < IIDC_MAX_DSCP_MAPPING; i++) - qos->dscp_map[i] = dcbx_cfg->dscp_map[i]; -} -EXPORT_SYMBOL_GPL(ice_get_qos_params); + struct msi_map map; + struct ice_pf *pf; -int ice_alloc_rdma_qvector(struct ice_pf *pf, struct msix_entry *entry) -{ - struct msi_map map = ice_alloc_irq(pf, true); + if (WARN_ON(!cdev)) + return -EINVAL; + pf = pci_get_drvdata(cdev->pdev); + map = ice_alloc_irq(pf, true); if (map.index < 0) return -ENOMEM; @@ -244,12 +252,19 @@ EXPORT_SYMBOL_GPL(ice_alloc_rdma_qvector); /** * ice_free_rdma_qvector - free vector resources reserved for RDMA driver - * @pf: board private structure to initialize + * @cdev: pointer to iidc_rdma_core_dev_info struct * @entry: MSI-X entry to be removed */ -void ice_free_rdma_qvector(struct ice_pf *pf, struct msix_entry *entry) +void ice_free_rdma_qvector(struct iidc_rdma_core_dev_info 
*cdev, + struct msix_entry *entry) { struct msi_map map; + struct ice_pf *pf; + + if (WARN_ON(!cdev || !entry)) + return; + + pf = pci_get_drvdata(cdev->pdev); map.index = entry->entry; map.virq = entry->vector; @@ -263,19 +278,23 @@ EXPORT_SYMBOL_GPL(ice_free_rdma_qvector); */ static void ice_adev_release(struct device *dev) { - struct iidc_auxiliary_dev *iadev; + struct iidc_rdma_core_auxiliary_dev *iadev; - iadev = container_of(dev, struct iidc_auxiliary_dev, adev.dev); + iadev = container_of(dev, struct iidc_rdma_core_auxiliary_dev, + adev.dev); kfree(iadev); } /** * ice_plug_aux_dev - allocate and register AUX device * @pf: pointer to pf struct + * + * Return: Zero on success, error code on failure */ int ice_plug_aux_dev(struct ice_pf *pf) { - struct iidc_auxiliary_dev *iadev; + struct iidc_rdma_core_auxiliary_dev *iadev; + struct iidc_rdma_core_dev_info *cdev; struct auxiliary_device *adev; int ret; @@ -285,17 +304,22 @@ int ice_plug_aux_dev(struct ice_pf *pf) if (!ice_is_rdma_ena(pf)) return 0; + cdev = pf->cdev_info; + if (!cdev) + return -ENODEV; + iadev = kzalloc(sizeof(*iadev), GFP_KERNEL); if (!iadev) return -ENOMEM; adev = &iadev->adev; - iadev->pf = pf; + iadev->cdev_info = cdev; adev->id = pf->aux_idx; adev->dev.release = ice_adev_release; adev->dev.parent = &pf->pdev->dev; - adev->name = pf->rdma_mode & IIDC_RDMA_PROTOCOL_ROCEV2 ? "roce" : "iwarp"; + adev->name = cdev->rdma_protocol & IIDC_RDMA_PROTOCOL_ROCEV2 ? + "roce" : "iwarp"; ret = auxiliary_device_init(adev); if (ret) { @@ -310,7 +334,7 @@ int ice_plug_aux_dev(struct ice_pf *pf) } mutex_lock(&pf->adev_mutex); - pf->adev = adev; + cdev->adev = adev; mutex_unlock(&pf->adev_mutex); return 0; @@ -324,8 +348,8 @@ void ice_unplug_aux_dev(struct ice_pf *pf) struct auxiliary_device *adev; mutex_lock(&pf->adev_mutex); - adev = pf->adev; - pf->adev = NULL; + adev = pf->cdev_info->adev; + pf->cdev_info->adev = NULL; mutex_unlock(&pf->adev_mutex); if (adev) { @@ -340,7 +364,9 @@ void ice_unplug_aux_dev(struct ice_pf *pf) */ int ice_init_rdma(struct ice_pf *pf) { + struct iidc_rdma_priv_dev_info *privd; struct device *dev = &pf->pdev->dev; + struct iidc_rdma_core_dev_info *cdev; int ret; if (!ice_is_rdma_ena(pf)) { @@ -348,22 +374,50 @@ int ice_init_rdma(struct ice_pf *pf) return 0; } + cdev = kzalloc(sizeof(*cdev), GFP_KERNEL); + if (!cdev) + return -ENOMEM; + + pf->cdev_info = cdev; + + privd = kzalloc(sizeof(*privd), GFP_KERNEL); + if (!privd) { + ret = -ENOMEM; + goto err_privd_alloc; + } + + privd->pf_id = pf->hw.pf_id; ret = xa_alloc(&ice_aux_id, &pf->aux_idx, NULL, XA_LIMIT(1, INT_MAX), GFP_KERNEL); if (ret) { dev_err(dev, "Failed to allocate device ID for AUX driver\n"); - return -ENOMEM; + ret = -ENOMEM; + goto err_alloc_xa; } - pf->rdma_mode |= IIDC_RDMA_PROTOCOL_ROCEV2; + cdev->iidc_priv = privd; + privd->netdev = pf->vsi[0]->netdev; + + privd->hw_addr = (u8 __iomem *)pf->hw.hw_addr; + cdev->pdev = pf->pdev; + privd->vport_id = pf->vsi[0]->vsi_num; + + pf->cdev_info->rdma_protocol |= IIDC_RDMA_PROTOCOL_ROCEV2; + ice_setup_dcb_qos_info(pf, &privd->qos_info); ret = ice_plug_aux_dev(pf); if (ret) goto err_plug_aux_dev; return 0; err_plug_aux_dev: - pf->adev = NULL; + pf->cdev_info->adev = NULL; xa_erase(&ice_aux_id, pf->aux_idx); +err_alloc_xa: + kfree(privd); +err_privd_alloc: + kfree(cdev); + pf->cdev_info = NULL; + return ret; } @@ -378,4 +432,7 @@ void ice_deinit_rdma(struct ice_pf *pf) ice_unplug_aux_dev(pf); xa_erase(&ice_aux_id, pf->aux_idx); + kfree(pf->cdev_info->iidc_priv); + kfree(pf->cdev_info); + 
pf->cdev_info = NULL; } diff --git a/drivers/net/ethernet/intel/ice/ice_idc_int.h b/drivers/net/ethernet/intel/ice/ice_idc_int.h index 4b0c86757df9..17dbfcfb6a2a 100644 --- a/drivers/net/ethernet/intel/ice/ice_idc_int.h +++ b/drivers/net/ethernet/intel/ice/ice_idc_int.h @@ -4,10 +4,11 @@ #ifndef _ICE_IDC_INT_H_ #define _ICE_IDC_INT_H_ -#include <linux/net/intel/iidc.h> +#include <linux/net/intel/iidc_rdma.h> +#include <linux/net/intel/iidc_rdma_ice.h> struct ice_pf; -void ice_send_event_to_aux(struct ice_pf *pf, struct iidc_event *event); +void ice_send_event_to_aux(struct ice_pf *pf, struct iidc_rdma_event *event); #endif /* !_ICE_IDC_INT_H_ */ diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c index d390157b59fe..eace0e3f15e8 100644 --- a/drivers/net/ethernet/intel/ice/ice_main.c +++ b/drivers/net/ethernet/intel/ice/ice_main.c @@ -2401,11 +2401,11 @@ static void ice_service_task(struct work_struct *work) } if (test_and_clear_bit(ICE_AUX_ERR_PENDING, pf->state)) { - struct iidc_event *event; + struct iidc_rdma_event *event; event = kzalloc(sizeof(*event), GFP_KERNEL); if (event) { - set_bit(IIDC_EVENT_CRIT_ERR, event->type); + set_bit(IIDC_RDMA_EVENT_CRIT_ERR, event->type); /* report the entire OICR value to AUX driver */ swap(event->reg, pf->oicr_err_reg); ice_send_event_to_aux(pf, event); @@ -2424,11 +2424,11 @@ static void ice_service_task(struct work_struct *work) ice_plug_aux_dev(pf); if (test_and_clear_bit(ICE_FLAG_MTU_CHANGED, pf->flags)) { - struct iidc_event *event; + struct iidc_rdma_event *event; event = kzalloc(sizeof(*event), GFP_KERNEL); if (event) { - set_bit(IIDC_EVENT_AFTER_MTU_CHANGE, event->type); + set_bit(IIDC_RDMA_EVENT_AFTER_MTU_CHANGE, event->type); ice_send_event_to_aux(pf, event); kfree(event); } @@ -9310,6 +9310,7 @@ ice_setup_tc(struct net_device *netdev, enum tc_setup_type type, void *type_data) { struct ice_netdev_priv *np = netdev_priv(netdev); + struct iidc_rdma_core_dev_info *cdev; struct ice_pf *pf = np->vsi->back; bool locked = false; int err; @@ -9326,11 +9327,12 @@ ice_setup_tc(struct net_device *netdev, enum tc_setup_type type, return -EOPNOTSUPP; } - if (pf->adev) { + cdev = pf->cdev_info; + if (cdev && cdev->adev) { mutex_lock(&pf->adev_mutex); - device_lock(&pf->adev->dev); + device_lock(&cdev->adev->dev); locked = true; - if (pf->adev->dev.driver) { + if (cdev->adev->dev.driver) { netdev_err(netdev, "Cannot change qdisc when RDMA is active\n"); err = -EBUSY; goto adev_unlock; @@ -9344,7 +9346,7 @@ ice_setup_tc(struct net_device *netdev, enum tc_setup_type type, adev_unlock: if (locked) { - device_unlock(&pf->adev->dev); + device_unlock(&cdev->adev->dev); mutex_unlock(&pf->adev_mutex); } return err; diff --git a/drivers/net/ethernet/intel/ice/ice_type.h b/drivers/net/ethernet/intel/ice/ice_type.h index 0aab21113cc4..529f978ea45a 100644 --- a/drivers/net/ethernet/intel/ice/ice_type.h +++ b/drivers/net/ethernet/intel/ice/ice_type.h @@ -19,6 +19,7 @@ #include "ice_vlan_mode.h" #include "ice_fwlog.h" #include <linux/wait.h> +#include <net/dscp.h> static inline bool ice_is_tc_ena(unsigned long bitmap, u8 tc) { @@ -695,7 +696,6 @@ struct ice_dcb_app_priority_table { #define ICE_MAX_USER_PRIORITY 8 #define ICE_DCBX_MAX_APPS 64 -#define ICE_DSCP_NUM_VAL 64 #define ICE_LLDPDU_SIZE 1500 #define ICE_TLV_STATUS_OPER 0x1 #define ICE_TLV_STATUS_SYNC 0x2 @@ -718,9 +718,9 @@ struct ice_dcbx_cfg { u8 pfc_mode; struct ice_dcb_app_priority_table app[ICE_DCBX_MAX_APPS]; /* when DSCP mapping defined by user set its bit to 
1 */ - DECLARE_BITMAP(dscp_mapped, ICE_DSCP_NUM_VAL); + DECLARE_BITMAP(dscp_mapped, DSCP_MAX); /* array holding DSCP -> UP/TC values for DSCP L3 QoS mode */ - u8 dscp_map[ICE_DSCP_NUM_VAL]; + u8 dscp_map[DSCP_MAX]; u8 dcbx_mode; #define ICE_DCBX_MODE_CEE 0x1 #define ICE_DCBX_MODE_IEEE 0x2 diff --git a/drivers/net/ethernet/microsoft/mana/gdma_main.c b/drivers/net/ethernet/microsoft/mana/gdma_main.c index 4ffaf7588885..3504507477c6 100644 --- a/drivers/net/ethernet/microsoft/mana/gdma_main.c +++ b/drivers/net/ethernet/microsoft/mana/gdma_main.c @@ -391,6 +391,7 @@ static void mana_gd_process_eqe(struct gdma_queue *eq) case GDMA_EQE_HWC_INIT_EQ_ID_DB: case GDMA_EQE_HWC_INIT_DATA: case GDMA_EQE_HWC_INIT_DONE: + case GDMA_EQE_HWC_SOC_SERVICE: case GDMA_EQE_RNIC_QP_FATAL: if (!eq->eq.callback) break; @@ -964,6 +965,7 @@ int mana_gd_verify_vf_version(struct pci_dev *pdev) err, resp.hdr.status); return err ? err : -EPROTO; } + gc->pf_cap_flags1 = resp.pf_cap_flags1; if (resp.pf_cap_flags1 & GDMA_DRV_CAP_FLAG_1_HWC_TIMEOUT_RECONFIG) { err = mana_gd_query_hwc_timeout(pdev, &hwc->hwc_timeout); if (err) { @@ -1004,7 +1006,6 @@ int mana_gd_register_device(struct gdma_dev *gd) return 0; } -EXPORT_SYMBOL_NS(mana_gd_register_device, "NET_MANA"); int mana_gd_deregister_device(struct gdma_dev *gd) { @@ -1035,7 +1036,6 @@ int mana_gd_deregister_device(struct gdma_dev *gd) return err; } -EXPORT_SYMBOL_NS(mana_gd_deregister_device, "NET_MANA"); u32 mana_gd_wq_avail_space(struct gdma_queue *wq) { @@ -1469,10 +1469,14 @@ static int mana_gd_setup(struct pci_dev *pdev) mana_gd_init_registers(pdev); mana_smc_init(&gc->shm_channel, gc->dev, gc->shm_base); + gc->service_wq = alloc_ordered_workqueue("gdma_service_wq", 0); + if (!gc->service_wq) + return -ENOMEM; + err = mana_gd_setup_irqs(pdev); if (err) { dev_err(gc->dev, "Failed to setup IRQs: %d\n", err); - return err; + goto free_workqueue; } err = mana_hwc_create_channel(gc); @@ -1498,6 +1502,8 @@ destroy_hwc: mana_hwc_destroy_channel(gc); remove_irq: mana_gd_remove_irqs(pdev); +free_workqueue: + destroy_workqueue(gc->service_wq); dev_err(&pdev->dev, "%s failed (error %d)\n", __func__, err); return err; } @@ -1509,6 +1515,8 @@ static void mana_gd_cleanup(struct pci_dev *pdev) mana_hwc_destroy_channel(gc); mana_gd_remove_irqs(pdev); + + destroy_workqueue(gc->service_wq); dev_dbg(&pdev->dev, "mana gdma cleanup successful\n"); } @@ -1578,8 +1586,14 @@ static int mana_gd_probe(struct pci_dev *pdev, const struct pci_device_id *ent) if (err) goto cleanup_gd; + err = mana_rdma_probe(&gc->mana_ib); + if (err) + goto cleanup_mana; + return 0; +cleanup_mana: + mana_remove(&gc->mana, false); cleanup_gd: mana_gd_cleanup(pdev); unmap_bar: @@ -1607,6 +1621,7 @@ static void mana_gd_remove(struct pci_dev *pdev) { struct gdma_context *gc = pci_get_drvdata(pdev); + mana_rdma_remove(&gc->mana_ib); mana_remove(&gc->mana, false); mana_gd_cleanup(pdev); @@ -1630,6 +1645,7 @@ static int mana_gd_suspend(struct pci_dev *pdev, pm_message_t state) { struct gdma_context *gc = pci_get_drvdata(pdev); + mana_rdma_remove(&gc->mana_ib); mana_remove(&gc->mana, true); mana_gd_cleanup(pdev); @@ -1654,6 +1670,10 @@ static int mana_gd_resume(struct pci_dev *pdev) if (err) return err; + err = mana_rdma_probe(&gc->mana_ib); + if (err) + return err; + return 0; } @@ -1664,6 +1684,7 @@ static void mana_gd_shutdown(struct pci_dev *pdev) dev_info(&pdev->dev, "Shutdown was called\n"); + mana_rdma_remove(&gc->mana_ib); mana_remove(&gc->mana, true); mana_gd_cleanup(pdev); diff --git 
a/drivers/net/ethernet/microsoft/mana/hw_channel.c b/drivers/net/ethernet/microsoft/mana/hw_channel.c index 1ba49602089b..a8c4d8db75a5 100644 --- a/drivers/net/ethernet/microsoft/mana/hw_channel.c +++ b/drivers/net/ethernet/microsoft/mana/hw_channel.c @@ -112,11 +112,13 @@ out: static void mana_hwc_init_event_handler(void *ctx, struct gdma_queue *q_self, struct gdma_event *event) { + union hwc_init_soc_service_type service_data; struct hw_channel_context *hwc = ctx; struct gdma_dev *gd = hwc->gdma_dev; union hwc_init_type_data type_data; union hwc_init_eq_id_db eq_db; u32 type, val; + int ret; switch (event->type) { case GDMA_EQE_HWC_INIT_EQ_ID_DB: @@ -199,7 +201,24 @@ static void mana_hwc_init_event_handler(void *ctx, struct gdma_queue *q_self, } break; + case GDMA_EQE_HWC_SOC_SERVICE: + service_data.as_uint32 = event->details[0]; + type = service_data.type; + switch (type) { + case GDMA_SERVICE_TYPE_RDMA_SUSPEND: + case GDMA_SERVICE_TYPE_RDMA_RESUME: + ret = mana_rdma_service_event(gd->gdma_context, type); + if (ret) + dev_err(hwc->dev, "Failed to schedule adev service event: %d\n", + ret); + break; + default: + dev_warn(hwc->dev, "Received unknown SOC service type %u\n", type); + break; + } + + break; default: dev_warn(hwc->dev, "Received unknown gdma event %u\n", event->type); /* Ignore unknown events, which should never happen. */ diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c index 2bac6be8f6a0..1758281b2a51 100644 --- a/drivers/net/ethernet/microsoft/mana/mana_en.c +++ b/drivers/net/ethernet/microsoft/mana/mana_en.c @@ -2945,7 +2945,7 @@ static void remove_adev(struct gdma_dev *gd) gd->adev = NULL; } -static int add_adev(struct gdma_dev *gd) +static int add_adev(struct gdma_dev *gd, const char *name) { struct auxiliary_device *adev; struct mana_adev *madev; @@ -2961,7 +2961,7 @@ static int add_adev(struct gdma_dev *gd) goto idx_fail; adev->id = ret; - adev->name = "rdma"; + adev->name = name; adev->dev.parent = gd->gdma_context->dev; adev->dev.release = adev_release; madev->mdev = gd; @@ -2993,6 +2993,70 @@ idx_fail: return ret; } +static void mana_rdma_service_handle(struct work_struct *work) +{ + struct mana_service_work *serv_work = + container_of(work, struct mana_service_work, work); + struct gdma_dev *gd = serv_work->gdma_dev; + struct device *dev = gd->gdma_context->dev; + int ret; + + if (READ_ONCE(gd->rdma_teardown)) + goto out; + + switch (serv_work->event) { + case GDMA_SERVICE_TYPE_RDMA_SUSPEND: + if (!gd->adev || gd->is_suspended) + break; + + remove_adev(gd); + gd->is_suspended = true; + break; + + case GDMA_SERVICE_TYPE_RDMA_RESUME: + if (!gd->is_suspended) + break; + + ret = add_adev(gd, "rdma"); + if (ret) + dev_err(dev, "Failed to add adev on resume: %d\n", ret); + else + gd->is_suspended = false; + break; + + default: + dev_warn(dev, "unknown adev service event %u\n", + serv_work->event); + break; + } + +out: + kfree(serv_work); +} + +int mana_rdma_service_event(struct gdma_context *gc, enum gdma_service_type event) +{ + struct gdma_dev *gd = &gc->mana_ib; + struct mana_service_work *serv_work; + + if (gd->dev_id.type != GDMA_DEVICE_MANA_IB) { + /* RDMA device is not detected on pci */ + return 0; + } + + serv_work = kzalloc(sizeof(*serv_work), GFP_ATOMIC); + if (!serv_work) + return -ENOMEM; + + serv_work->event = event; + serv_work->gdma_dev = gd; + + INIT_WORK(&serv_work->work, mana_rdma_service_handle); + queue_work(gc->service_wq, &serv_work->work); + + return 0; +} + int mana_probe(struct gdma_dev 
*gd, bool resuming) { struct gdma_context *gc = gd->gdma_context; @@ -3077,7 +3141,7 @@ int mana_probe(struct gdma_dev *gd, bool resuming) } } - err = add_adev(gd); + err = add_adev(gd, "eth"); out: if (err) { mana_remove(gd, false); @@ -3151,6 +3215,44 @@ out: dev_dbg(dev, "%s succeeded\n", __func__); } +int mana_rdma_probe(struct gdma_dev *gd) +{ + int err = 0; + + if (gd->dev_id.type != GDMA_DEVICE_MANA_IB) { + /* RDMA device is not detected on pci */ + return err; + } + + err = mana_gd_register_device(gd); + if (err) + return err; + + err = add_adev(gd, "rdma"); + if (err) + mana_gd_deregister_device(gd); + + return err; +} + +void mana_rdma_remove(struct gdma_dev *gd) +{ + struct gdma_context *gc = gd->gdma_context; + + if (gd->dev_id.type != GDMA_DEVICE_MANA_IB) { + /* RDMA device is not detected on pci */ + return; + } + + WRITE_ONCE(gd->rdma_teardown, true); + flush_workqueue(gc->service_wq); + + if (gd->adev) + remove_adev(gd); + + mana_gd_deregister_device(gd); +} + struct net_device *mana_get_primary_netdev(struct mana_context *ac, u32 port_index, netdevice_tracker *tracker) diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c index 19214ec81fbb..8d955c25aed3 100644 --- a/drivers/pci/p2pdma.c +++ b/drivers/pci/p2pdma.c @@ -1004,40 +1004,12 @@ static enum pci_p2pdma_map_type pci_p2pdma_map_type(struct dev_pagemap *pgmap, return type; } -/** - * pci_p2pdma_map_segment - map an sg segment determining the mapping type - * @state: State structure that should be declared outside of the for_each_sg() - * loop and initialized to zero. - * @dev: DMA device that's doing the mapping operation - * @sg: scatterlist segment to map - * - * This is a helper to be used by non-IOMMU dma_map_sg() implementations where - * the sg segment is the same for the page_link and the dma_address. - * - * Attempt to map a single segment in an SGL with the PCI bus address. - * The segment must point to a PCI P2PDMA page and thus must be - * wrapped in a is_pci_p2pdma_page(sg_page(sg)) check. - * - * Returns the type of mapping used and maps the page if the type is - * PCI_P2PDMA_MAP_BUS_ADDR. - */ -enum pci_p2pdma_map_type -pci_p2pdma_map_segment(struct pci_p2pdma_map_state *state, struct device *dev, - struct scatterlist *sg) +void __pci_p2pdma_update_state(struct pci_p2pdma_map_state *state, + struct device *dev, struct page *page) { - if (state->pgmap != page_pgmap(sg_page(sg))) { - state->pgmap = page_pgmap(sg_page(sg)); - state->map = pci_p2pdma_map_type(state->pgmap, dev); - state->bus_off = to_p2p_pgmap(state->pgmap)->bus_offset; - } - - if (state->map == PCI_P2PDMA_MAP_BUS_ADDR) { - sg->dma_address = sg_phys(sg) + state->bus_off; - sg_dma_len(sg) = sg->length; - sg_dma_mark_bus_address(sg); - } - - return state->map; + state->pgmap = page_pgmap(page); + state->map = pci_p2pdma_map_type(state->pgmap, dev); + state->bus_off = to_p2p_pgmap(state->pgmap)->bus_offset; } /** diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h index e172522cd936..f48e5fb88bd5 100644 --- a/include/linux/dma-map-ops.h +++ b/include/linux/dma-map-ops.h @@ -434,58 +434,4 @@ static inline void debug_dma_dump_mappings(struct device *dev) #endif /* CONFIG_DMA_API_DEBUG */ extern const struct dma_map_ops dma_dummy_ops; - -enum pci_p2pdma_map_type { - /* - * PCI_P2PDMA_MAP_UNKNOWN: Used internally for indicating the mapping - * type hasn't been calculated yet. Functions that return this enum - * never return this value. 
- */ - PCI_P2PDMA_MAP_UNKNOWN = 0, - - /* - * PCI_P2PDMA_MAP_NOT_SUPPORTED: Indicates the transaction will - * traverse the host bridge and the host bridge is not in the - * allowlist. DMA Mapping routines should return an error when - * this is returned. - */ - PCI_P2PDMA_MAP_NOT_SUPPORTED, - - /* - * PCI_P2PDMA_BUS_ADDR: Indicates that two devices can talk to - * each other directly through a PCI switch and the transaction will - * not traverse the host bridge. Such a mapping should program - * the DMA engine with PCI bus addresses. - */ - PCI_P2PDMA_MAP_BUS_ADDR, - - /* - * PCI_P2PDMA_MAP_THRU_HOST_BRIDGE: Indicates two devices can talk - * to each other, but the transaction traverses a host bridge on the - * allowlist. In this case, a normal mapping either with CPU physical - * addresses (in the case of dma-direct) or IOVA addresses (in the - * case of IOMMUs) should be used to program the DMA engine. - */ - PCI_P2PDMA_MAP_THRU_HOST_BRIDGE, -}; - -struct pci_p2pdma_map_state { - struct dev_pagemap *pgmap; - int map; - u64 bus_off; -}; - -#ifdef CONFIG_PCI_P2PDMA -enum pci_p2pdma_map_type -pci_p2pdma_map_segment(struct pci_p2pdma_map_state *state, struct device *dev, - struct scatterlist *sg); -#else /* CONFIG_PCI_P2PDMA */ -static inline enum pci_p2pdma_map_type -pci_p2pdma_map_segment(struct pci_p2pdma_map_state *state, struct device *dev, - struct scatterlist *sg) -{ - return PCI_P2PDMA_MAP_NOT_SUPPORTED; -} -#endif /* CONFIG_PCI_P2PDMA */ - #endif /* _LINUX_DMA_MAP_OPS_H */ diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index 85ab710ec0e7..55c03e5fe8cb 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -72,6 +72,22 @@ #define DMA_BIT_MASK(n) (((n) == 64) ? ~0ULL : ((1ULL<<(n))-1)) +struct dma_iova_state { + dma_addr_t addr; + u64 __size; +}; + +/* + * Use the high bit to mark if we used swiotlb for one or more ranges. + */ +#define DMA_IOVA_USE_SWIOTLB (1ULL << 63) + +static inline size_t dma_iova_size(struct dma_iova_state *state) +{ + /* Casting is needed for 32-bit systems */ + return (size_t)(state->__size & ~DMA_IOVA_USE_SWIOTLB); +} + #ifdef CONFIG_DMA_API_DEBUG void debug_dma_mapping_error(struct device *dev, dma_addr_t dma_addr); void debug_dma_map_single(struct device *dev, const void *addr, @@ -277,6 +293,70 @@ static inline int dma_mmap_noncontiguous(struct device *dev, } #endif /* CONFIG_HAS_DMA */ +#ifdef CONFIG_IOMMU_DMA +/** + * dma_use_iova - check if the IOVA API is used for this state + * @state: IOVA state + * + * Return %true if the DMA transfer uses the dma_iova_*() calls or %false if + * they can't be used.
+ */ +static inline bool dma_use_iova(struct dma_iova_state *state) +{ + return state->__size != 0; +} + +bool dma_iova_try_alloc(struct device *dev, struct dma_iova_state *state, + phys_addr_t phys, size_t size); +void dma_iova_free(struct device *dev, struct dma_iova_state *state); +void dma_iova_destroy(struct device *dev, struct dma_iova_state *state, + size_t mapped_len, enum dma_data_direction dir, + unsigned long attrs); +int dma_iova_sync(struct device *dev, struct dma_iova_state *state, + size_t offset, size_t size); +int dma_iova_link(struct device *dev, struct dma_iova_state *state, + phys_addr_t phys, size_t offset, size_t size, + enum dma_data_direction dir, unsigned long attrs); +void dma_iova_unlink(struct device *dev, struct dma_iova_state *state, + size_t offset, size_t size, enum dma_data_direction dir, + unsigned long attrs); +#else /* CONFIG_IOMMU_DMA */ +static inline bool dma_use_iova(struct dma_iova_state *state) +{ + return false; +} +static inline bool dma_iova_try_alloc(struct device *dev, + struct dma_iova_state *state, phys_addr_t phys, size_t size) +{ + return false; +} +static inline void dma_iova_free(struct device *dev, + struct dma_iova_state *state) +{ +} +static inline void dma_iova_destroy(struct device *dev, + struct dma_iova_state *state, size_t mapped_len, + enum dma_data_direction dir, unsigned long attrs) +{ +} +static inline int dma_iova_sync(struct device *dev, + struct dma_iova_state *state, size_t offset, size_t size) +{ + return -EOPNOTSUPP; +} +static inline int dma_iova_link(struct device *dev, + struct dma_iova_state *state, phys_addr_t phys, size_t offset, + size_t size, enum dma_data_direction dir, unsigned long attrs) +{ + return -EOPNOTSUPP; +} +static inline void dma_iova_unlink(struct device *dev, + struct dma_iova_state *state, size_t offset, size_t size, + enum dma_data_direction dir, unsigned long attrs) +{ +} +#endif /* CONFIG_IOMMU_DMA */ + #if defined(CONFIG_HAS_DMA) && defined(CONFIG_DMA_NEED_SYNC) void __dma_sync_single_for_cpu(struct device *dev, dma_addr_t addr, size_t size, enum dma_data_direction dir); @@ -326,6 +406,7 @@ static inline bool dma_need_sync(struct device *dev, dma_addr_t dma_addr) { return dma_dev_need_sync(dev) ? 
__dma_need_sync(dev, dma_addr) : false; } +bool dma_need_unmap(struct device *dev); #else /* !CONFIG_HAS_DMA || !CONFIG_DMA_NEED_SYNC */ static inline bool dma_dev_need_sync(const struct device *dev) { @@ -351,6 +432,10 @@ static inline bool dma_need_sync(struct device *dev, dma_addr_t dma_addr) { return false; } +static inline bool dma_need_unmap(struct device *dev) +{ + return false; +} #endif /* !CONFIG_HAS_DMA || !CONFIG_DMA_NEED_SYNC */ struct page *dma_alloc_pages(struct device *dev, size_t size, diff --git a/include/linux/hmm-dma.h b/include/linux/hmm-dma.h new file mode 100644 index 000000000000..f58b9fc71999 --- /dev/null +++ b/include/linux/hmm-dma.h @@ -0,0 +1,33 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* Copyright (c) 2024 NVIDIA Corporation & Affiliates */ +#ifndef LINUX_HMM_DMA_H +#define LINUX_HMM_DMA_H + +#include <linux/dma-mapping.h> + +struct dma_iova_state; +struct pci_p2pdma_map_state; + +/* + * struct hmm_dma_map - array of PFNs and DMA addresses + * + * @state: DMA IOVA state + * @pfns: array of PFNs + * @dma_list: array of DMA addresses + * @dma_entry_size: size of each DMA entry in the array + */ +struct hmm_dma_map { + struct dma_iova_state state; + unsigned long *pfn_list; + dma_addr_t *dma_list; + size_t dma_entry_size; +}; + +int hmm_dma_map_alloc(struct device *dev, struct hmm_dma_map *map, + size_t nr_entries, size_t dma_entry_size); +void hmm_dma_map_free(struct device *dev, struct hmm_dma_map *map); +dma_addr_t hmm_dma_map_pfn(struct device *dev, struct hmm_dma_map *map, + size_t idx, + struct pci_p2pdma_map_state *p2pdma_state); +bool hmm_dma_unmap_pfn(struct device *dev, struct hmm_dma_map *map, size_t idx); +#endif /* LINUX_HMM_DMA_H */ diff --git a/include/linux/hmm.h b/include/linux/hmm.h index 126a36571667..db75ffc949a7 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -23,6 +23,10 @@ struct mmu_interval_notifier; * HMM_PFN_WRITE - if the page memory can be written to (requires HMM_PFN_VALID) * HMM_PFN_ERROR - accessing the pfn is impossible and the device should * fail. ie poisoned memory, special pages, no vma, etc + * HMM_PFN_P2PDMA - P2P page + * HMM_PFN_P2PDMA_BUS - Bus mapped P2P transfer + * HMM_PFN_DMA_MAPPED - Flag preserved on input-to-output transformation + * to mark that page is already DMA mapped * * On input: * 0 - Return the current state of the page, do not fault it. 
@@ -36,13 +40,21 @@ enum hmm_pfn_flags { HMM_PFN_VALID = 1UL << (BITS_PER_LONG - 1), HMM_PFN_WRITE = 1UL << (BITS_PER_LONG - 2), HMM_PFN_ERROR = 1UL << (BITS_PER_LONG - 3), - HMM_PFN_ORDER_SHIFT = (BITS_PER_LONG - 8), + /* + * Sticky flags, carried from input to output, + * don't forget to update HMM_PFN_INOUT_FLAGS + */ + HMM_PFN_DMA_MAPPED = 1UL << (BITS_PER_LONG - 4), + HMM_PFN_P2PDMA = 1UL << (BITS_PER_LONG - 5), + HMM_PFN_P2PDMA_BUS = 1UL << (BITS_PER_LONG - 6), + + HMM_PFN_ORDER_SHIFT = (BITS_PER_LONG - 11), /* Input flags */ HMM_PFN_REQ_FAULT = HMM_PFN_VALID, HMM_PFN_REQ_WRITE = HMM_PFN_WRITE, - HMM_PFN_FLAGS = 0xFFUL << HMM_PFN_ORDER_SHIFT, + HMM_PFN_FLAGS = ~((1UL << HMM_PFN_ORDER_SHIFT) - 1), }; /* @@ -58,6 +70,14 @@ static inline struct page *hmm_pfn_to_page(unsigned long hmm_pfn) } /* + * hmm_pfn_to_phys() - return physical address pointed to by a device entry + */ +static inline phys_addr_t hmm_pfn_to_phys(unsigned long hmm_pfn) +{ + return __pfn_to_phys(hmm_pfn & ~HMM_PFN_FLAGS); +} + +/* * hmm_pfn_to_map_order() - return the CPU mapping size order * * This is optionally useful to optimize processing of the pfn result diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 3a8d35d41fda..15cdadace993 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -872,6 +872,10 @@ extern struct iommu_domain *iommu_get_domain_for_dev(struct device *dev); extern struct iommu_domain *iommu_get_dma_domain(struct device *dev); extern int iommu_map(struct iommu_domain *domain, unsigned long iova, phys_addr_t paddr, size_t size, int prot, gfp_t gfp); +int iommu_map_nosync(struct iommu_domain *domain, unsigned long iova, + phys_addr_t paddr, size_t size, int prot, gfp_t gfp); +int iommu_sync_map(struct iommu_domain *domain, unsigned long iova, + size_t size); extern size_t iommu_unmap(struct iommu_domain *domain, unsigned long iova, size_t size); extern size_t iommu_unmap_fast(struct iommu_domain *domain, diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index d1dfbad9a447..e6ba8f4f4bd1 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -398,6 +398,7 @@ struct mlx5_core_rsc_common { enum mlx5_res_type res; refcount_t refcount; struct completion free; + bool invalid; }; struct mlx5_uars_page { diff --git a/include/linux/net/intel/iidc.h b/include/linux/net/intel/iidc.h deleted file mode 100644 index 13274c3def66..000000000000 --- a/include/linux/net/intel/iidc.h +++ /dev/null @@ -1,109 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2021, Intel Corporation. 
*/ - -#ifndef _IIDC_H_ -#define _IIDC_H_ - -#include <linux/auxiliary_bus.h> -#include <linux/dcbnl.h> -#include <linux/device.h> -#include <linux/if_ether.h> -#include <linux/kernel.h> -#include <linux/netdevice.h> - -enum iidc_event_type { - IIDC_EVENT_BEFORE_MTU_CHANGE, - IIDC_EVENT_AFTER_MTU_CHANGE, - IIDC_EVENT_BEFORE_TC_CHANGE, - IIDC_EVENT_AFTER_TC_CHANGE, - IIDC_EVENT_CRIT_ERR, - IIDC_EVENT_NBITS /* must be last */ -}; - -enum iidc_reset_type { - IIDC_PFR, - IIDC_CORER, - IIDC_GLOBR, -}; - -enum iidc_rdma_protocol { - IIDC_RDMA_PROTOCOL_IWARP = BIT(0), - IIDC_RDMA_PROTOCOL_ROCEV2 = BIT(1), -}; - -#define IIDC_MAX_USER_PRIORITY 8 -#define IIDC_MAX_DSCP_MAPPING 64 -#define IIDC_DSCP_PFC_MODE 0x1 - -/* Struct to hold per RDMA Qset info */ -struct iidc_rdma_qset_params { - /* Qset TEID returned to the RDMA driver in - * ice_add_rdma_qset and used by RDMA driver - * for calls to ice_del_rdma_qset - */ - u32 teid; /* Qset TEID */ - u16 qs_handle; /* RDMA driver provides this */ - u16 vport_id; /* VSI index */ - u8 tc; /* TC branch the Qset should belong to */ -}; - -struct iidc_qos_info { - u64 tc_ctx; - u8 rel_bw; - u8 prio_type; - u8 egress_virt_up; - u8 ingress_virt_up; -}; - -/* Struct to pass QoS info */ -struct iidc_qos_params { - struct iidc_qos_info tc_info[IEEE_8021QAZ_MAX_TCS]; - u8 up2tc[IIDC_MAX_USER_PRIORITY]; - u8 vport_relative_bw; - u8 vport_priority_type; - u8 num_tc; - u8 pfc_mode; - u8 dscp_map[IIDC_MAX_DSCP_MAPPING]; -}; - -struct iidc_event { - DECLARE_BITMAP(type, IIDC_EVENT_NBITS); - u32 reg; -}; - -struct ice_pf; - -int ice_add_rdma_qset(struct ice_pf *pf, struct iidc_rdma_qset_params *qset); -int ice_del_rdma_qset(struct ice_pf *pf, struct iidc_rdma_qset_params *qset); -int ice_rdma_request_reset(struct ice_pf *pf, enum iidc_reset_type reset_type); -int ice_rdma_update_vsi_filter(struct ice_pf *pf, u16 vsi_id, bool enable); -void ice_get_qos_params(struct ice_pf *pf, struct iidc_qos_params *qos); -int ice_alloc_rdma_qvector(struct ice_pf *pf, struct msix_entry *entry); -void ice_free_rdma_qvector(struct ice_pf *pf, struct msix_entry *entry); - -/* Structure representing auxiliary driver tailored information about the core - * PCI dev, each auxiliary driver using the IIDC interface will have an - * instance of this struct dedicated to it. - */ - -struct iidc_auxiliary_dev { - struct auxiliary_device adev; - struct ice_pf *pf; -}; - -/* structure representing the auxiliary driver. This struct is to be - * allocated and populated by the auxiliary driver's owner. The core PCI - * driver will access these ops by performing a container_of on the - * auxiliary_device->dev.driver. - */ -struct iidc_auxiliary_drv { - struct auxiliary_driver adrv; - /* This event_handler is meant to be a blocking call. For instance, - * when a BEFORE_MTU_CHANGE event comes in, the event_handler will not - * return until the auxiliary driver is ready for the MTU change to - * happen. - */ - void (*event_handler)(struct ice_pf *pf, struct iidc_event *event); -}; - -#endif /* _IIDC_H_*/ diff --git a/include/linux/net/intel/iidc_rdma.h b/include/linux/net/intel/iidc_rdma.h new file mode 100644 index 000000000000..8baad1082042 --- /dev/null +++ b/include/linux/net/intel/iidc_rdma.h @@ -0,0 +1,68 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2021-2025, Intel Corporation. 
*/ + +#ifndef _IIDC_RDMA_H_ +#define _IIDC_RDMA_H_ + +#include <linux/auxiliary_bus.h> +#include <linux/device.h> +#include <linux/if_ether.h> +#include <linux/kernel.h> +#include <linux/netdevice.h> +#include <net/dscp.h> + +enum iidc_rdma_event_type { + IIDC_RDMA_EVENT_BEFORE_MTU_CHANGE, + IIDC_RDMA_EVENT_AFTER_MTU_CHANGE, + IIDC_RDMA_EVENT_BEFORE_TC_CHANGE, + IIDC_RDMA_EVENT_AFTER_TC_CHANGE, + IIDC_RDMA_EVENT_WARN_RESET, + IIDC_RDMA_EVENT_CRIT_ERR, + IIDC_RDMA_EVENT_NBITS /* must be last */ +}; + +struct iidc_rdma_event { + DECLARE_BITMAP(type, IIDC_RDMA_EVENT_NBITS); + u32 reg; +}; + +enum iidc_rdma_reset_type { + IIDC_FUNC_RESET, + IIDC_DEV_RESET, +}; + +enum iidc_rdma_protocol { + IIDC_RDMA_PROTOCOL_IWARP = BIT(0), + IIDC_RDMA_PROTOCOL_ROCEV2 = BIT(1), +}; + +/* Structure to be populated by core LAN PCI driver */ +struct iidc_rdma_core_dev_info { + struct pci_dev *pdev; /* PCI device corresponding to the main function */ + struct auxiliary_device *adev; + /* Current active RDMA protocol */ + enum iidc_rdma_protocol rdma_protocol; + void *iidc_priv; /* elements unique to each driver */ +}; + +/* Structure representing auxiliary driver tailored information about the core + * PCI dev; each auxiliary driver using the IIDC interface will have an + * instance of this struct dedicated to it. + */ +struct iidc_rdma_core_auxiliary_dev { + struct auxiliary_device adev; + struct iidc_rdma_core_dev_info *cdev_info; +}; + +/* Structure representing the auxiliary driver. This struct is to be + * allocated and populated by the auxiliary driver's owner. The core PCI + * driver will access these ops by performing a container_of on the + * auxiliary_device->dev.driver. + */ +struct iidc_rdma_core_auxiliary_drv { + struct auxiliary_driver adrv; + void (*event_handler)(struct iidc_rdma_core_dev_info *cdev, + struct iidc_rdma_event *event); +}; + +#endif /* _IIDC_RDMA_H_*/ diff --git a/include/linux/net/intel/iidc_rdma_ice.h b/include/linux/net/intel/iidc_rdma_ice.h new file mode 100644 index 000000000000..b40eed0e13fe --- /dev/null +++ b/include/linux/net/intel/iidc_rdma_ice.h @@ -0,0 +1,70 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2021-2025, Intel Corporation.
*/ + +#ifndef _IIDC_RDMA_ICE_H_ +#define _IIDC_RDMA_ICE_H_ + +#include <linux/dcbnl.h> + +#define IIDC_MAX_USER_PRIORITY 8 +#define IIDC_DSCP_PFC_MODE 0x1 + +/** + * struct iidc_rdma_qset_params - Struct to hold per RDMA Qset info + * @teid: TEID of the Qset node + * @qs_handle: SW index of the Qset, RDMA provides this + * @vport_id: VSI index + * @tc: Traffic Class branch the QSet should belong to + */ +struct iidc_rdma_qset_params { + /* Qset TEID returned to the RDMA driver in + * ice_add_rdma_qset and used by RDMA driver + * for calls to ice_del_rdma_qset + */ + u32 teid; + u16 qs_handle; + u16 vport_id; + u8 tc; +}; + +struct iidc_rdma_qos_info { + u64 tc_ctx; + u8 rel_bw; + u8 prio_type; + u8 egress_virt_up; + u8 ingress_virt_up; +}; + +/* Struct to pass QoS info */ +struct iidc_rdma_qos_params { + struct iidc_rdma_qos_info tc_info[IEEE_8021QAZ_MAX_TCS]; + u8 up2tc[IIDC_MAX_USER_PRIORITY]; + u8 vport_relative_bw; + u8 vport_priority_type; + u8 num_tc; + u8 pfc_mode; + u8 dscp_map[DSCP_MAX]; +}; + +struct iidc_rdma_priv_dev_info { + u8 pf_id; + u16 vport_id; + struct net_device *netdev; + struct iidc_rdma_qos_params qos_info; + u8 __iomem *hw_addr; +}; + +int ice_add_rdma_qset(struct iidc_rdma_core_dev_info *cdev, + struct iidc_rdma_qset_params *qset); +int ice_del_rdma_qset(struct iidc_rdma_core_dev_info *cdev, + struct iidc_rdma_qset_params *qset); +int ice_rdma_request_reset(struct iidc_rdma_core_dev_info *cdev, + enum iidc_rdma_reset_type reset_type); +int ice_rdma_update_vsi_filter(struct iidc_rdma_core_dev_info *cdev, u16 vsi_id, + bool enable); +int ice_alloc_rdma_qvector(struct iidc_rdma_core_dev_info *cdev, + struct msix_entry *entry); +void ice_free_rdma_qvector(struct iidc_rdma_core_dev_info *cdev, + struct msix_entry *entry); + +#endif /* _IIDC_RDMA_ICE_H_*/ diff --git a/include/linux/pci-p2pdma.h b/include/linux/pci-p2pdma.h index 2c07aa6b7665..075c20b161d9 100644 --- a/include/linux/pci-p2pdma.h +++ b/include/linux/pci-p2pdma.h @@ -104,4 +104,89 @@ static inline struct pci_dev *pci_p2pmem_find(struct device *client) return pci_p2pmem_find_many(&client, 1); } +enum pci_p2pdma_map_type { + /* + * PCI_P2PDMA_MAP_UNKNOWN: Used internally as an initial state before + * the mapping type has been calculated. Exported routines for the API + * will never return this value. + */ + PCI_P2PDMA_MAP_UNKNOWN = 0, + + /* + * Not a PCI P2PDMA transfer. + */ + PCI_P2PDMA_MAP_NONE, + + /* + * PCI_P2PDMA_MAP_NOT_SUPPORTED: Indicates the transaction will + * traverse the host bridge and the host bridge is not in the + * allowlist. DMA Mapping routines should return an error when + * this is returned. + */ + PCI_P2PDMA_MAP_NOT_SUPPORTED, + + /* + * PCI_P2PDMA_MAP_BUS_ADDR: Indicates that two devices can talk to + * each other directly through a PCI switch and the transaction will + * not traverse the host bridge. Such a mapping should program + * the DMA engine with PCI bus addresses. + */ + PCI_P2PDMA_MAP_BUS_ADDR, + + /* + * PCI_P2PDMA_MAP_THRU_HOST_BRIDGE: Indicates two devices can talk + * to each other, but the transaction traverses a host bridge on the + * allowlist. In this case, a normal mapping either with CPU physical + * addresses (in the case of dma-direct) or IOVA addresses (in the + * case of IOMMUs) should be used to program the DMA engine. 
+ */ + PCI_P2PDMA_MAP_THRU_HOST_BRIDGE, +}; + +struct pci_p2pdma_map_state { + struct dev_pagemap *pgmap; + enum pci_p2pdma_map_type map; + u64 bus_off; +}; + +/* helper for pci_p2pdma_state(), do not use directly */ +void __pci_p2pdma_update_state(struct pci_p2pdma_map_state *state, + struct device *dev, struct page *page); + +/** + * pci_p2pdma_state - check the P2P transfer state of a page + * @state: P2P state structure + * @dev: device to transfer to/from + * @page: page to map + * + * Check if @page is a PCI P2PDMA page, and if yes of what kind. Returns the + * map type, and updates @state with all information needed for a P2P transfer. + */ +static inline enum pci_p2pdma_map_type +pci_p2pdma_state(struct pci_p2pdma_map_state *state, struct device *dev, + struct page *page) +{ + if (IS_ENABLED(CONFIG_PCI_P2PDMA) && is_pci_p2pdma_page(page)) { + if (state->pgmap != page_pgmap(page)) + __pci_p2pdma_update_state(state, dev, page); + return state->map; + } + return PCI_P2PDMA_MAP_NONE; +} + +/** + * pci_p2pdma_bus_addr_map - Translate a physical address to a bus address + * for a PCI_P2PDMA_MAP_BUS_ADDR transfer. + * @state: P2P state structure + * @paddr: physical address to map + * + * Map a physically contiguous PCI_P2PDMA_MAP_BUS_ADDR transfer. + */ +static inline dma_addr_t +pci_p2pdma_bus_addr_map(struct pci_p2pdma_map_state *state, phys_addr_t paddr) +{ + WARN_ON_ONCE(state->map != PCI_P2PDMA_MAP_BUS_ADDR); + return paddr + state->bus_off; +} + #endif /* _LINUX_PCI_P2P_H */ diff --git a/include/net/mana/gdma.h b/include/net/mana/gdma.h index 228603bf03f2..3ce56a816425 100644 --- a/include/net/mana/gdma.h +++ b/include/net/mana/gdma.h @@ -60,6 +60,7 @@ enum gdma_eqe_type { GDMA_EQE_HWC_INIT_DONE = 131, GDMA_EQE_HWC_SOC_RECONFIG = 132, GDMA_EQE_HWC_SOC_RECONFIG_DATA = 133, + GDMA_EQE_HWC_SOC_SERVICE = 134, GDMA_EQE_RNIC_QP_FATAL = 176, }; @@ -70,6 +71,18 @@ enum { GDMA_DEVICE_MANA_IB = 3, }; +enum gdma_service_type { + GDMA_SERVICE_TYPE_NONE = 0, + GDMA_SERVICE_TYPE_RDMA_SUSPEND = 1, + GDMA_SERVICE_TYPE_RDMA_RESUME = 2, +}; + +struct mana_service_work { + struct work_struct work; + struct gdma_dev *gdma_dev; + enum gdma_service_type event; +}; + struct gdma_resource { /* Protect the bitmap */ spinlock_t lock; @@ -224,6 +237,8 @@ struct gdma_dev { void *driver_data; struct auxiliary_device *adev; + bool is_suspended; + bool rdma_teardown; }; /* MANA_PAGE_SIZE is the DMA unit */ @@ -407,6 +422,10 @@ struct gdma_context { /* Azure RDMA adapter */ struct gdma_dev mana_ib; + + u64 pf_cap_flags1; + + struct workqueue_struct *service_wq; }; static inline bool mana_gd_is_mana(struct gdma_dev *gd) @@ -553,6 +572,7 @@ enum { */ #define GDMA_DRV_CAP_FLAG_1_NAPI_WKDONE_FIX BIT(2) #define GDMA_DRV_CAP_FLAG_1_HWC_TIMEOUT_RECONFIG BIT(3) +#define GDMA_DRV_CAP_FLAG_1_GDMA_PAGES_4MB_1GB_2GB BIT(4) #define GDMA_DRV_CAP_FLAG_1_VARIABLE_INDIRECTION_TABLE_SUPPORT BIT(5) /* Driver can handle holes (zeros) in the device list */ @@ -707,20 +727,6 @@ struct gdma_query_hwc_timeout_resp { u32 reserved; }; -enum atb_page_size { - ATB_PAGE_SIZE_4K, - ATB_PAGE_SIZE_8K, - ATB_PAGE_SIZE_16K, - ATB_PAGE_SIZE_32K, - ATB_PAGE_SIZE_64K, - ATB_PAGE_SIZE_128K, - ATB_PAGE_SIZE_256K, - ATB_PAGE_SIZE_512K, - ATB_PAGE_SIZE_1M, - ATB_PAGE_SIZE_2M, - ATB_PAGE_SIZE_MAX, -}; - enum gdma_mr_access_flags { GDMA_ACCESS_FLAG_LOCAL_READ = BIT_ULL(0), GDMA_ACCESS_FLAG_LOCAL_WRITE = BIT_ULL(1), @@ -815,6 +821,8 @@ enum gdma_mr_type { * address that is set up in the MST */ GDMA_MR_TYPE_GVA = 2, + /* Guest zero-based address MRs */ + 
GDMA_MR_TYPE_ZBVA = 4, }; struct gdma_create_mr_params { @@ -826,6 +834,10 @@ struct gdma_create_mr_params { u64 virtual_address; enum gdma_mr_access_flags access_flags; } gva; + struct { + u64 dma_region_handle; + enum gdma_mr_access_flags access_flags; + } zbva; }; }; @@ -841,7 +853,10 @@ struct gdma_create_mr_request { u64 virtual_address; enum gdma_mr_access_flags access_flags; } gva; - + struct { + u64 dma_region_handle; + enum gdma_mr_access_flags access_flags; + } zbva; }; u32 reserved_2; };/* HW DATA */ @@ -893,4 +908,6 @@ int mana_gd_destroy_dma_region(struct gdma_context *gc, u64 dma_region_handle); void mana_register_debugfs(void); void mana_unregister_debugfs(void); +int mana_rdma_service_event(struct gdma_context *gc, enum gdma_service_type event); + #endif /* _GDMA_H */ diff --git a/include/net/mana/hw_channel.h b/include/net/mana/hw_channel.h index 158b125692c2..83cf93338eb3 100644 --- a/include/net/mana/hw_channel.h +++ b/include/net/mana/hw_channel.h @@ -49,6 +49,15 @@ union hwc_init_type_data { }; }; /* HW DATA */ +union hwc_init_soc_service_type { + u32 as_uint32; + + struct { + u32 value : 28; + u32 type : 4; + }; +}; /* HW DATA */ + struct hwc_rx_oob { u32 type : 6; u32 eom : 1; diff --git a/include/net/mana/mana.h b/include/net/mana/mana.h index 0f78065de8fe..5857efc885a6 100644 --- a/include/net/mana/mana.h +++ b/include/net/mana/mana.h @@ -488,6 +488,9 @@ int mana_detach(struct net_device *ndev, bool from_close); int mana_probe(struct gdma_dev *gd, bool resuming); void mana_remove(struct gdma_dev *gd, bool suspending); +int mana_rdma_probe(struct gdma_dev *gd); +void mana_rdma_remove(struct gdma_dev *gd); + void mana_xdp_tx(struct sk_buff *skb, struct net_device *ndev); int mana_xdp_xmit(struct net_device *ndev, int n, struct xdp_frame **frames, u32 flags); diff --git a/include/rdma/ib_cm.h b/include/rdma/ib_cm.h index a2ac62b4a6cf..1fa3786f82f4 100644 --- a/include/rdma/ib_cm.h +++ b/include/rdma/ib_cm.h @@ -480,23 +480,12 @@ int ib_send_cm_rej(struct ib_cm_id *cm_id, const void *private_data, u8 private_data_len); -#define IB_CM_MRA_FLAG_DELAY 0x80 /* Send MRA only after a duplicate msg */ - /** - * ib_send_cm_mra - Sends a message receipt acknowledgement to a connection - * message. + * ib_prepare_cm_mra - Prepares to send a message receipt acknowledgment to a + connection message in case duplicates are received. * @cm_id: Connection identifier associated with the connection message. - * @service_timeout: The lower 5-bits specify the maximum time required for - * the sender to reply to the connection message. The upper 3-bits - * specify additional control flags. - * @private_data: Optional user-defined private data sent with the - * message receipt acknowledgement. - * @private_data_len: Size of the private data buffer, in bytes. */ -int ib_send_cm_mra(struct ib_cm_id *cm_id, - u8 service_timeout, - const void *private_data, - u8 private_data_len); +int ib_prepare_cm_mra(struct ib_cm_id *cm_id); /** * ib_cm_init_qp_attr - Initializes the QP attributes for use in transitioning diff --git a/include/rdma/ib_umem_odp.h b/include/rdma/ib_umem_odp.h index 0844c1d05ac6..2a24bf791c10 100644 --- a/include/rdma/ib_umem_odp.h +++ b/include/rdma/ib_umem_odp.h @@ -8,23 +8,17 @@ #include <rdma/ib_umem.h> #include <rdma/ib_verbs.h> +#include <linux/hmm-dma.h> struct ib_umem_odp { struct ib_umem umem; struct mmu_interval_notifier notifier; struct pid *tgid; - /* An array of the pfns included in the on-demand paging umem. 
*/ - unsigned long *pfn_list; + struct hmm_dma_map map; /* - * An array with DMA addresses mapped for pfns in pfn_list. - * The lower two bits designate access permissions. - * See ODP_READ_ALLOWED_BIT and ODP_WRITE_ALLOWED_BIT. - */ - dma_addr_t *dma_list; - /* - * The umem_mutex protects the page_list and dma_list fields of an ODP + * The umem_mutex protects the page_list field of an ODP * umem, allowing only a single thread to map/unmap pages. The mutex * also protects access to the mmu notifier counters. */ @@ -67,19 +61,6 @@ static inline size_t ib_umem_odp_num_pages(struct ib_umem_odp *umem_odp) umem_odp->page_shift; } -/* - * The lower 2 bits of the DMA address signal the R/W permissions for - * the entry. To upgrade the permissions, provide the appropriate - * bitmask to the map_dma_pages function. - * - * Be aware that upgrading a mapped address might result in change of - * the DMA address for the page. - */ -#define ODP_READ_ALLOWED_BIT (1<<0ULL) -#define ODP_WRITE_ALLOWED_BIT (1<<1ULL) - -#define ODP_DMA_ADDR_MASK (~(ODP_READ_ALLOWED_BIT | ODP_WRITE_ALLOWED_BIT)) - #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING struct ib_umem_odp * diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 901353796fbb..af43a8d2a74a 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -314,17 +314,19 @@ enum ib_atomic_cap { }; enum ib_odp_general_cap_bits { - IB_ODP_SUPPORT = 1 << 0, - IB_ODP_SUPPORT_IMPLICIT = 1 << 1, + IB_ODP_SUPPORT = IB_UVERBS_ODP_SUPPORT, + IB_ODP_SUPPORT_IMPLICIT = IB_UVERBS_ODP_SUPPORT_IMPLICIT, }; enum ib_odp_transport_cap_bits { - IB_ODP_SUPPORT_SEND = 1 << 0, - IB_ODP_SUPPORT_RECV = 1 << 1, - IB_ODP_SUPPORT_WRITE = 1 << 2, - IB_ODP_SUPPORT_READ = 1 << 3, - IB_ODP_SUPPORT_ATOMIC = 1 << 4, - IB_ODP_SUPPORT_SRQ_RECV = 1 << 5, + IB_ODP_SUPPORT_SEND = IB_UVERBS_ODP_SUPPORT_SEND, + IB_ODP_SUPPORT_RECV = IB_UVERBS_ODP_SUPPORT_RECV, + IB_ODP_SUPPORT_WRITE = IB_UVERBS_ODP_SUPPORT_WRITE, + IB_ODP_SUPPORT_READ = IB_UVERBS_ODP_SUPPORT_READ, + IB_ODP_SUPPORT_ATOMIC = IB_UVERBS_ODP_SUPPORT_ATOMIC, + IB_ODP_SUPPORT_SRQ_RECV = IB_UVERBS_ODP_SUPPORT_SRQ_RECV, + IB_ODP_SUPPORT_FLUSH = IB_UVERBS_ODP_SUPPORT_FLUSH, + IB_ODP_SUPPORT_ATOMIC_WRITE = IB_UVERBS_ODP_SUPPORT_ATOMIC_WRITE, }; struct ib_odp_caps { diff --git a/include/rdma/rdma_cm.h b/include/rdma/rdma_cm.h index 8a8ab2f793ab..d1593ad47e28 100644 --- a/include/rdma/rdma_cm.h +++ b/include/rdma/rdma_cm.h @@ -388,6 +388,5 @@ void rdma_read_gids(struct rdma_cm_id *cm_id, union ib_gid *sgid, union ib_gid *dgid); struct iw_cm_id *rdma_iw_cm_id(struct rdma_cm_id *cm_id); -struct rdma_cm_id *rdma_res_to_id(struct rdma_restrack_entry *res); #endif /* RDMA_CM_H */ diff --git a/include/uapi/rdma/ib_user_verbs.h b/include/uapi/rdma/ib_user_verbs.h index e16650f0c85d..3b7bd99813e9 100644 --- a/include/uapi/rdma/ib_user_verbs.h +++ b/include/uapi/rdma/ib_user_verbs.h @@ -233,6 +233,22 @@ struct ib_uverbs_ex_query_device { __u32 reserved; }; +enum ib_uverbs_odp_general_cap_bits { + IB_UVERBS_ODP_SUPPORT = 1 << 0, + IB_UVERBS_ODP_SUPPORT_IMPLICIT = 1 << 1, +}; + +enum ib_uverbs_odp_transport_cap_bits { + IB_UVERBS_ODP_SUPPORT_SEND = 1 << 0, + IB_UVERBS_ODP_SUPPORT_RECV = 1 << 1, + IB_UVERBS_ODP_SUPPORT_WRITE = 1 << 2, + IB_UVERBS_ODP_SUPPORT_READ = 1 << 3, + IB_UVERBS_ODP_SUPPORT_ATOMIC = 1 << 4, + IB_UVERBS_ODP_SUPPORT_SRQ_RECV = 1 << 5, + IB_UVERBS_ODP_SUPPORT_FLUSH = 1 << 6, + IB_UVERBS_ODP_SUPPORT_ATOMIC_WRITE = 1 << 7, +}; + struct ib_uverbs_odp_caps { __aligned_u64 general_caps; struct { diff --git 
a/kernel/dma/direct.c b/kernel/dma/direct.c index b8fe0b3d0ffb..24c359d9c879 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -13,6 +13,7 @@ #include <linux/vmalloc.h> #include <linux/set_memory.h> #include <linux/slab.h> +#include <linux/pci-p2pdma.h> #include "direct.h" /* @@ -462,34 +463,33 @@ int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents, enum dma_data_direction dir, unsigned long attrs) { struct pci_p2pdma_map_state p2pdma_state = {}; - enum pci_p2pdma_map_type map; struct scatterlist *sg; int i, ret; for_each_sg(sgl, sg, nents, i) { - if (is_pci_p2pdma_page(sg_page(sg))) { - map = pci_p2pdma_map_segment(&p2pdma_state, dev, sg); - switch (map) { - case PCI_P2PDMA_MAP_BUS_ADDR: - continue; - case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE: - /* - * Any P2P mapping that traverses the PCI - * host bridge must be mapped with CPU physical - * address and not PCI bus addresses. This is - * done with dma_direct_map_page() below. - */ - break; - default: - ret = -EREMOTEIO; + switch (pci_p2pdma_state(&p2pdma_state, dev, sg_page(sg))) { + case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE: + /* + * Any P2P mapping that traverses the PCI host bridge + * must be mapped with CPU physical address and not PCI + * bus addresses. + */ + break; + case PCI_P2PDMA_MAP_NONE: + sg->dma_address = dma_direct_map_page(dev, sg_page(sg), + sg->offset, sg->length, dir, attrs); + if (sg->dma_address == DMA_MAPPING_ERROR) { + ret = -EIO; goto out_unmap; } - } - - sg->dma_address = dma_direct_map_page(dev, sg_page(sg), - sg->offset, sg->length, dir, attrs); - if (sg->dma_address == DMA_MAPPING_ERROR) { - ret = -EIO; + break; + case PCI_P2PDMA_MAP_BUS_ADDR: + sg->dma_address = pci_p2pdma_bus_addr_map(&p2pdma_state, + sg_phys(sg)); + sg_dma_mark_bus_address(sg); + continue; + default: + ret = -EREMOTEIO; goto out_unmap; } sg_dma_len(sg) = sg->length; diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index 051a32988040..107e4a4d251d 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -443,6 +443,24 @@ bool __dma_need_sync(struct device *dev, dma_addr_t dma_addr) } EXPORT_SYMBOL_GPL(__dma_need_sync); +/** + * dma_need_unmap - does this device need dma_unmap_* operations + * @dev: device to check + * + * If this function returns %false, drivers can skip calling dma_unmap_* after + * finishing an I/O. This function must be called after all mappings that might + * need to be unmapped have been performed. 
+ */ +bool dma_need_unmap(struct device *dev) +{ + if (!dma_map_direct(dev, get_dma_ops(dev))) + return true; + if (!dev->dma_skip_sync) + return true; + return IS_ENABLED(CONFIG_DMA_API_DEBUG); +} +EXPORT_SYMBOL_GPL(dma_need_unmap); + static void dma_setup_need_sync(struct device *dev) { const struct dma_map_ops *ops = get_dma_ops(dev); @@ -10,6 +10,7 @@ */ #include <linux/pagewalk.h> #include <linux/hmm.h> +#include <linux/hmm-dma.h> #include <linux/init.h> #include <linux/rmap.h> #include <linux/swap.h> @@ -23,6 +24,7 @@ #include <linux/sched/mm.h> #include <linux/jump_label.h> #include <linux/dma-mapping.h> +#include <linux/pci-p2pdma.h> #include <linux/mmu_notifier.h> #include <linux/memory_hotplug.h> @@ -39,13 +41,21 @@ enum { HMM_NEED_ALL_BITS = HMM_NEED_FAULT | HMM_NEED_WRITE_FAULT, }; +enum { + /* These flags are carried from input-to-output */ + HMM_PFN_INOUT_FLAGS = HMM_PFN_DMA_MAPPED | HMM_PFN_P2PDMA | + HMM_PFN_P2PDMA_BUS, +}; + static int hmm_pfns_fill(unsigned long addr, unsigned long end, struct hmm_range *range, unsigned long cpu_flags) { unsigned long i = (addr - range->start) >> PAGE_SHIFT; - for (; addr < end; addr += PAGE_SIZE, i++) - range->hmm_pfns[i] = cpu_flags; + for (; addr < end; addr += PAGE_SIZE, i++) { + range->hmm_pfns[i] &= HMM_PFN_INOUT_FLAGS; + range->hmm_pfns[i] |= cpu_flags; + } return 0; } @@ -202,8 +212,10 @@ static int hmm_vma_handle_pmd(struct mm_walk *walk, unsigned long addr, return hmm_vma_fault(addr, end, required_fault, walk); pfn = pmd_pfn(pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT); - for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++) - hmm_pfns[i] = pfn | cpu_flags; + for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++) { + hmm_pfns[i] &= HMM_PFN_INOUT_FLAGS; + hmm_pfns[i] |= pfn | cpu_flags; + } return 0; } #else /* CONFIG_TRANSPARENT_HUGEPAGE */ @@ -230,14 +242,14 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, unsigned long cpu_flags; pte_t pte = ptep_get(ptep); uint64_t pfn_req_flags = *hmm_pfn; + uint64_t new_pfn_flags = 0; if (pte_none_mostly(pte)) { required_fault = hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, 0); if (required_fault) goto fault; - *hmm_pfn = 0; - return 0; + goto out; } if (!pte_present(pte)) { @@ -253,16 +265,14 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, cpu_flags = HMM_PFN_VALID; if (is_writable_device_private_entry(entry)) cpu_flags |= HMM_PFN_WRITE; - *hmm_pfn = swp_offset_pfn(entry) | cpu_flags; - return 0; + new_pfn_flags = swp_offset_pfn(entry) | cpu_flags; + goto out; } required_fault = hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, 0); - if (!required_fault) { - *hmm_pfn = 0; - return 0; - } + if (!required_fault) + goto out; if (!non_swap_entry(entry)) goto fault; @@ -304,11 +314,13 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, pte_unmap(ptep); return -EFAULT; } - *hmm_pfn = HMM_PFN_ERROR; - return 0; + new_pfn_flags = HMM_PFN_ERROR; + goto out; } - *hmm_pfn = pte_pfn(pte) | cpu_flags; + new_pfn_flags = pte_pfn(pte) | cpu_flags; +out: + *hmm_pfn = (*hmm_pfn & HMM_PFN_INOUT_FLAGS) | new_pfn_flags; return 0; fault: @@ -448,8 +460,10 @@ static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end, } pfn = pud_pfn(pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT); - for (i = 0; i < npages; ++i, ++pfn) - hmm_pfns[i] = pfn | cpu_flags; + for (i = 0; i < npages; ++i, ++pfn) { + hmm_pfns[i] &= HMM_PFN_INOUT_FLAGS; + hmm_pfns[i] |= pfn | cpu_flags; + } goto out_unlock; } @@ -507,8 +521,10 @@ static int 
hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask, } pfn = pte_pfn(entry) + ((start & ~hmask) >> PAGE_SHIFT); - for (; addr < end; addr += PAGE_SIZE, i++, pfn++) - range->hmm_pfns[i] = pfn | cpu_flags; + for (; addr < end; addr += PAGE_SIZE, i++, pfn++) { + range->hmm_pfns[i] &= HMM_PFN_INOUT_FLAGS; + range->hmm_pfns[i] |= pfn | cpu_flags; + } spin_unlock(ptl); return 0; @@ -607,3 +623,211 @@ int hmm_range_fault(struct hmm_range *range) return ret; } EXPORT_SYMBOL(hmm_range_fault); + +/** + * hmm_dma_map_alloc - Allocate HMM map structure + * @dev: device to allocate structure for + * @map: HMM map to allocate + * @nr_entries: number of entries in the map + * @dma_entry_size: size of the DMA entry in the map + * + * Allocate the HMM map structure and all the lists it contains. + * Return 0 on success, -ENOMEM on failure. + */ +int hmm_dma_map_alloc(struct device *dev, struct hmm_dma_map *map, + size_t nr_entries, size_t dma_entry_size) +{ + bool dma_need_sync = false; + bool use_iova; + + WARN_ON_ONCE(!(nr_entries * PAGE_SIZE / dma_entry_size)); + + /* + * The HMM API violates our normal DMA buffer ownership rules and can't + * transfer buffer ownership. The dma_addressing_limited() check is a + * best approximation to ensure no swiotlb buffering happens. + */ +#ifdef CONFIG_DMA_NEED_SYNC + dma_need_sync = !dev->dma_skip_sync; +#endif /* CONFIG_DMA_NEED_SYNC */ + if (dma_need_sync || dma_addressing_limited(dev)) + return -EOPNOTSUPP; + + map->dma_entry_size = dma_entry_size; + map->pfn_list = kvcalloc(nr_entries, sizeof(*map->pfn_list), + GFP_KERNEL | __GFP_NOWARN); + if (!map->pfn_list) + return -ENOMEM; + + use_iova = dma_iova_try_alloc(dev, &map->state, 0, + nr_entries * PAGE_SIZE); + if (!use_iova && dma_need_unmap(dev)) { + map->dma_list = kvcalloc(nr_entries, sizeof(*map->dma_list), + GFP_KERNEL | __GFP_NOWARN); + if (!map->dma_list) + goto err_dma; + } + return 0; + +err_dma: + kvfree(map->pfn_list); + return -ENOMEM; +} +EXPORT_SYMBOL_GPL(hmm_dma_map_alloc); + +/** + * hmm_dma_map_free - Free HMM map structure + * @dev: device to free structure from + * @map: HMM map containing the various lists and state + * + * Free the HMM map structure and all the lists it contains. + */ +void hmm_dma_map_free(struct device *dev, struct hmm_dma_map *map) +{ + if (dma_use_iova(&map->state)) + dma_iova_free(dev, &map->state); + kvfree(map->pfn_list); + kvfree(map->dma_list); +} +EXPORT_SYMBOL_GPL(hmm_dma_map_free); + +/** + * hmm_dma_map_pfn - Map a physical HMM page to DMA address + * @dev: Device to map the page for + * @map: HMM map + * @idx: Index into the PFN and dma address arrays + * @p2pdma_state: PCI P2P state. + * + * hmm_dma_map_alloc() reserves IOVA space for the whole map up front (via + * dma_iova_try_alloc()). Call this function after hmm_range_fault() to link + * the page at index @idx and obtain its DMA address. The offset into the + * IOVA space is @idx * @map->dma_entry_size, so consecutive indices map + * consecutive ranges of the allocation. Returns the DMA address on success + * or DMA_MAPPING_ERROR on failure.
+ */ +dma_addr_t hmm_dma_map_pfn(struct device *dev, struct hmm_dma_map *map, + size_t idx, + struct pci_p2pdma_map_state *p2pdma_state) +{ + struct dma_iova_state *state = &map->state; + dma_addr_t *dma_addrs = map->dma_list; + unsigned long *pfns = map->pfn_list; + struct page *page = hmm_pfn_to_page(pfns[idx]); + phys_addr_t paddr = hmm_pfn_to_phys(pfns[idx]); + size_t offset = idx * map->dma_entry_size; + unsigned long attrs = 0; + dma_addr_t dma_addr; + int ret; + + if ((pfns[idx] & HMM_PFN_DMA_MAPPED) && + !(pfns[idx] & HMM_PFN_P2PDMA_BUS)) { + /* + * We are in this flow when there is a need to resync flags, + * for example when the page was already linked in a prefetch call + * with the READ flag and now we need to add the WRITE flag. + * + * This page was already programmed to HW and we don't want/need + * to unlink and link it again just to resync flags. + */ + if (dma_use_iova(state)) + return state->addr + offset; + + /* + * Without dma_need_unmap, the dma_addrs array is NULL, thus we + * need to regenerate the address below even if there already + * was a mapping. But !dma_need_unmap implies that the + * mapping is stateless, so this is fine. + */ + if (dma_need_unmap(dev)) + return dma_addrs[idx]; + + /* Continue to remapping */ + } + + switch (pci_p2pdma_state(p2pdma_state, dev, page)) { + case PCI_P2PDMA_MAP_NONE: + break; + case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE: + attrs |= DMA_ATTR_SKIP_CPU_SYNC; + pfns[idx] |= HMM_PFN_P2PDMA; + break; + case PCI_P2PDMA_MAP_BUS_ADDR: + pfns[idx] |= HMM_PFN_P2PDMA_BUS | HMM_PFN_DMA_MAPPED; + return pci_p2pdma_bus_addr_map(p2pdma_state, paddr); + default: + return DMA_MAPPING_ERROR; + } + + if (dma_use_iova(state)) { + ret = dma_iova_link(dev, state, paddr, offset, + map->dma_entry_size, DMA_BIDIRECTIONAL, + attrs); + if (ret) + goto error; + + ret = dma_iova_sync(dev, state, offset, map->dma_entry_size); + if (ret) { + dma_iova_unlink(dev, state, offset, map->dma_entry_size, + DMA_BIDIRECTIONAL, attrs); + goto error; + } + + dma_addr = state->addr + offset; + } else { + if (WARN_ON_ONCE(dma_need_unmap(dev) && !dma_addrs)) + goto error; + + dma_addr = dma_map_page(dev, page, 0, map->dma_entry_size, + DMA_BIDIRECTIONAL); + if (dma_mapping_error(dev, dma_addr)) + goto error; + + if (dma_need_unmap(dev)) + dma_addrs[idx] = dma_addr; + } + pfns[idx] |= HMM_PFN_DMA_MAPPED; + return dma_addr; +error: + pfns[idx] &= ~HMM_PFN_P2PDMA; + return DMA_MAPPING_ERROR; + +} +EXPORT_SYMBOL_GPL(hmm_dma_map_pfn); + +/** + * hmm_dma_unmap_pfn - Unmap a physical HMM page from DMA address + * @dev: Device to unmap the page from + * @map: HMM map + * @idx: Index of the PFN to unmap + * + * Returns true if the PFN was mapped and has been unmapped, false otherwise.
+ */ +bool hmm_dma_unmap_pfn(struct device *dev, struct hmm_dma_map *map, size_t idx) +{ + const unsigned long valid_dma = HMM_PFN_VALID | HMM_PFN_DMA_MAPPED; + struct dma_iova_state *state = &map->state; + dma_addr_t *dma_addrs = map->dma_list; + unsigned long *pfns = map->pfn_list; + unsigned long attrs = 0; + + if ((pfns[idx] & valid_dma) != valid_dma) + return false; + + if (pfns[idx] & HMM_PFN_P2PDMA_BUS) + ; /* no need to unmap bus address P2P mappings */ + else if (dma_use_iova(state)) { + if (pfns[idx] & HMM_PFN_P2PDMA) + attrs |= DMA_ATTR_SKIP_CPU_SYNC; + dma_iova_unlink(dev, state, idx * map->dma_entry_size, + map->dma_entry_size, DMA_BIDIRECTIONAL, attrs); + } else if (dma_need_unmap(dev)) + dma_unmap_page(dev, dma_addrs[idx], map->dma_entry_size, + DMA_BIDIRECTIONAL); + + pfns[idx] &= + ~(HMM_PFN_DMA_MAPPED | HMM_PFN_P2PDMA | HMM_PFN_P2PDMA_BUS); + return true; +} +EXPORT_SYMBOL_GPL(hmm_dma_unmap_pfn);
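Note (not part of the patch): the hmm_dma_* helpers added to mm/hmm.c above are meant to be driven index-by-index around hmm_range_fault(). The following is a minimal, hypothetical driver-side sketch of that flow. The function name my_map_user_range(), its parameter list, and the -EIO error code are invented for illustration; the mmu_interval_notifier registration and the usual retry loop on -EBUSY around hmm_range_fault() are assumed to exist in the caller and are omitted here.

    #include <linux/dma-mapping.h>
    #include <linux/hmm.h>
    #include <linux/hmm-dma.h>
    #include <linux/mm.h>
    #include <linux/mmu_notifier.h>
    #include <linux/pci-p2pdma.h>

    static int my_map_user_range(struct device *dev, struct mm_struct *mm,
                                 struct mmu_interval_notifier *notifier,
                                 unsigned long start, size_t nr_pages,
                                 struct hmm_dma_map *map)
    {
            struct pci_p2pdma_map_state p2pdma_state = {};
            struct hmm_range range = {
                    .notifier = notifier,
                    .start = start,
                    .end = start + nr_pages * PAGE_SIZE,
                    .default_flags = HMM_PFN_REQ_FAULT,
            };
            size_t i;
            int ret;

            /* Allocates pfn_list (and dma_list or IOVA space as needed). */
            ret = hmm_dma_map_alloc(dev, map, nr_pages, PAGE_SIZE);
            if (ret)
                    return ret;
            range.hmm_pfns = map->pfn_list;

            /* Single-shot fault; a real driver retries under the notifier sequence. */
            range.notifier_seq = mmu_interval_read_begin(notifier);
            mmap_read_lock(mm);
            ret = hmm_range_fault(&range);
            mmap_read_unlock(mm);
            if (ret)
                    goto out_free;

            for (i = 0; i < nr_pages; i++) {
                    dma_addr_t dma = hmm_dma_map_pfn(dev, map, i, &p2pdma_state);

                    if (dma == DMA_MAPPING_ERROR) {
                            ret = -EIO;
                            goto out_unmap;
                    }
                    /* hand @dma for entry @i to the device here */
            }
            return 0;

    out_unmap:
            while (i--)
                    hmm_dma_unmap_pfn(dev, map, i);
    out_free:
            hmm_dma_map_free(dev, map);
            return ret;
    }

Teardown on the regular path mirrors the error path above: hmm_dma_unmap_pfn() per index, then hmm_dma_map_free(), which releases the IOVA space or the dma_list depending on which mode hmm_dma_map_alloc() selected.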