From a9bb79128aa659f97b774b97c9bb1bdc74444595 Mon Sep 17 00:00:00 2001 From: "Hefty, Sean" Date: Mon, 9 May 2011 22:06:10 -0700 Subject: RDMA/cma: Add an ID_REUSEADDR option Lustre requires that clients bind to a privileged port number before connecting to a remote server. On larger clusters (typically more than about 1000 nodes), the number of privileged ports is exhausted, resulting in lustre being unusable. To handle this, we add support for reusable addresses to the rdma_cm. This mimics the behavior of the socket option SO_REUSEADDR. A user may set an rdma_cm_id to reuse an address before calling rdma_bind_addr() (explicitly or implicitly). If set, other rdma_cm_id's may be bound to the same address, provided that they all have reuse enabled, and there are no active listens. If rdma_listen() is called on an rdma_cm_id that has reuse enabled, it will only succeed if there are no other id's bound to that same address. The reuse option is exported to user space. The behavior of the kernel reuse implementation was verified against that given by sockets. This patch is derived from a patch by Ira Weiny Signed-off-by: Sean Hefty Signed-off-by: Roland Dreier --- drivers/infiniband/core/ucma.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'drivers/infiniband/core/ucma.c') diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c index ec1e9da1488b..b3fa798525b2 100644 --- a/drivers/infiniband/core/ucma.c +++ b/drivers/infiniband/core/ucma.c @@ -883,6 +883,13 @@ static int ucma_set_option_id(struct ucma_context *ctx, int optname, } rdma_set_service_type(ctx->cm_id, *((u8 *) optval)); break; + case RDMA_OPTION_ID_REUSEADDR: + if (optlen != sizeof(int)) { + ret = -EINVAL; + break; + } + ret = rdma_set_reuseaddr(ctx->cm_id, *((int *) optval) ? 
1 : 0); + break; default: ret = -ENOSYS; } -- cgit v1.2.3 From 04ea2f81973f55db715bfdac7dd258f8a8485a6d Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Mon, 23 May 2011 10:48:43 -0700 Subject: RDMA/ucma: Add .nodename/.mode to tell userspace where to create device node We want udev to create a device node under /dev/infiniband with permission 0666 for rdma_cm, so add that info to our struct miscdevice. Signed-off-by: Roland Dreier Acked-by: Sean Hefty --- drivers/infiniband/core/ucma.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'drivers/infiniband/core/ucma.c') diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c index b3fa798525b2..3170899dab01 100644 --- a/drivers/infiniband/core/ucma.c +++ b/drivers/infiniband/core/ucma.c @@ -1338,9 +1338,11 @@ static const struct file_operations ucma_fops = { }; static struct miscdevice ucma_misc = { - .minor = MISC_DYNAMIC_MINOR, - .name = "rdma_cm", - .fops = &ucma_fops, + .minor = MISC_DYNAMIC_MINOR, + .name = "rdma_cm", + .nodename = "infiniband/rdma_cm", + .mode = 0666, + .fops = &ucma_fops, }; static ssize_t show_abi_version(struct device *dev, -- cgit v1.2.3 From b26f9b9949013fec31b23c426fc463164ae08891 Mon Sep 17 00:00:00 2001 From: Sean Hefty Date: Thu, 1 Apr 2010 17:08:41 +0000 Subject: RDMA/cma: Pass QP type into rdma_create_id() The RDMA CM currently infers the QP type from the port space selected by the user. In the future (eg with RDMA_PS_IB or XRC), there may not be a 1-1 correspondence between port space and QP type. For netlink export of RDMA CM state, we want to export the QP type to userspace, so it is cleaner to explicitly associate a QP type to an ID. Modify rdma_create_id() to allow the user to specify the QP type, and use it to make our selections of datagram versus connected mode. 
Signed-off-by: Sean Hefty Signed-off-by: Roland Dreier --- drivers/infiniband/core/cma.c | 35 +++++++++++++++----------------- drivers/infiniband/core/ucma.c | 27 ++++++++++++++++++++---- drivers/infiniband/ulp/iser/iser_verbs.c | 2 +- include/rdma/rdma_cm.h | 5 ++++- net/9p/trans_rdma.c | 3 ++- net/rds/ib.c | 2 +- net/rds/ib_cm.c | 2 +- net/rds/iw.c | 2 +- net/rds/iw_cm.c | 2 +- net/rds/rdma_transport.c | 3 ++- net/sunrpc/xprtrdma/svc_rdma_transport.c | 3 ++- net/sunrpc/xprtrdma/verbs.c | 2 +- 12 files changed, 55 insertions(+), 33 deletions(-) (limited to 'drivers/infiniband/core/ucma.c') diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 451d39e19cb4..44be1c9ed05b 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -265,11 +265,6 @@ static inline void sdp_set_ip_ver(struct sdp_hh *hh, u8 ip_ver) hh->ip_version = (ip_ver << 4) | (hh->ip_version & 0xF); } -static inline int cma_is_ud_ps(enum rdma_port_space ps) -{ - return (ps == RDMA_PS_UDP || ps == RDMA_PS_IPOIB); -} - static void cma_attach_to_dev(struct rdma_id_private *id_priv, struct cma_device *cma_dev) { @@ -415,7 +410,8 @@ static int cma_has_cm_dev(struct rdma_id_private *id_priv) } struct rdma_cm_id *rdma_create_id(rdma_cm_event_handler event_handler, - void *context, enum rdma_port_space ps) + void *context, enum rdma_port_space ps, + enum ib_qp_type qp_type) { struct rdma_id_private *id_priv; @@ -427,6 +423,7 @@ struct rdma_cm_id *rdma_create_id(rdma_cm_event_handler event_handler, id_priv->id.context = context; id_priv->id.event_handler = event_handler; id_priv->id.ps = ps; + id_priv->id.qp_type = qp_type; spin_lock_init(&id_priv->lock); mutex_init(&id_priv->qp_mutex); init_completion(&id_priv->comp); @@ -494,7 +491,7 @@ int rdma_create_qp(struct rdma_cm_id *id, struct ib_pd *pd, if (IS_ERR(qp)) return PTR_ERR(qp); - if (cma_is_ud_ps(id_priv->id.ps)) + if (id->qp_type == IB_QPT_UD) ret = cma_init_ud_qp(id_priv, qp); else ret = 
cma_init_conn_qp(id_priv, qp); @@ -622,7 +619,7 @@ static int cma_ib_init_qp_attr(struct rdma_id_private *id_priv, qp_attr->port_num = id_priv->id.port_num; *qp_attr_mask = IB_QP_STATE | IB_QP_PKEY_INDEX | IB_QP_PORT; - if (cma_is_ud_ps(id_priv->id.ps)) { + if (id_priv->id.qp_type == IB_QPT_UD) { ret = cma_set_qkey(id_priv); if (ret) return ret; @@ -645,7 +642,7 @@ int rdma_init_qp_attr(struct rdma_cm_id *id, struct ib_qp_attr *qp_attr, id_priv = container_of(id, struct rdma_id_private, id); switch (rdma_node_get_transport(id_priv->id.device->node_type)) { case RDMA_TRANSPORT_IB: - if (!id_priv->cm_id.ib || cma_is_ud_ps(id_priv->id.ps)) + if (!id_priv->cm_id.ib || (id_priv->id.qp_type == IB_QPT_UD)) ret = cma_ib_init_qp_attr(id_priv, qp_attr, qp_attr_mask); else ret = ib_cm_init_qp_attr(id_priv->cm_id.ib, qp_attr, @@ -1088,7 +1085,7 @@ static struct rdma_id_private *cma_new_conn_id(struct rdma_cm_id *listen_id, goto err; id = rdma_create_id(listen_id->event_handler, listen_id->context, - listen_id->ps); + listen_id->ps, ib_event->param.req_rcvd.qp_type); if (IS_ERR(id)) goto err; @@ -1139,7 +1136,7 @@ static struct rdma_id_private *cma_new_udp_id(struct rdma_cm_id *listen_id, int ret; id = rdma_create_id(listen_id->event_handler, listen_id->context, - listen_id->ps); + listen_id->ps, IB_QPT_UD); if (IS_ERR(id)) return NULL; @@ -1194,7 +1191,7 @@ static int cma_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event) memset(&event, 0, sizeof event); offset = cma_user_data_offset(listen_id->id.ps); event.event = RDMA_CM_EVENT_CONNECT_REQUEST; - if (cma_is_ud_ps(listen_id->id.ps)) { + if (listen_id->id.qp_type == IB_QPT_UD) { conn_id = cma_new_udp_id(&listen_id->id, ib_event); event.param.ud.private_data = ib_event->private_data + offset; event.param.ud.private_data_len = @@ -1230,8 +1227,7 @@ static int cma_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event) * while we're accessing the cm_id. 
*/ mutex_lock(&lock); - if (cma_comp(conn_id, RDMA_CM_CONNECT) && - !cma_is_ud_ps(conn_id->id.ps)) + if (cma_comp(conn_id, RDMA_CM_CONNECT) && (conn_id->id.qp_type != IB_QPT_UD)) ib_send_cm_mra(cm_id, CMA_CM_MRA_SETTING, NULL, 0); mutex_unlock(&lock); mutex_unlock(&conn_id->handler_mutex); @@ -1386,7 +1382,7 @@ static int iw_conn_req_handler(struct iw_cm_id *cm_id, /* Create a new RDMA id for the new IW CM ID */ new_cm_id = rdma_create_id(listen_id->id.event_handler, listen_id->id.context, - RDMA_PS_TCP); + RDMA_PS_TCP, IB_QPT_RC); if (IS_ERR(new_cm_id)) { ret = -ENOMEM; goto out; @@ -1535,7 +1531,8 @@ static void cma_listen_on_dev(struct rdma_id_private *id_priv, struct rdma_cm_id *id; int ret; - id = rdma_create_id(cma_listen_handler, id_priv, id_priv->id.ps); + id = rdma_create_id(cma_listen_handler, id_priv, id_priv->id.ps, + id_priv->id.qp_type); if (IS_ERR(id)) return; @@ -2645,7 +2642,7 @@ int rdma_connect(struct rdma_cm_id *id, struct rdma_conn_param *conn_param) switch (rdma_node_get_transport(id->device->node_type)) { case RDMA_TRANSPORT_IB: - if (cma_is_ud_ps(id->ps)) + if (id->qp_type == IB_QPT_UD) ret = cma_resolve_ib_udp(id_priv, conn_param); else ret = cma_connect_ib(id_priv, conn_param); @@ -2758,7 +2755,7 @@ int rdma_accept(struct rdma_cm_id *id, struct rdma_conn_param *conn_param) switch (rdma_node_get_transport(id->device->node_type)) { case RDMA_TRANSPORT_IB: - if (cma_is_ud_ps(id->ps)) + if (id->qp_type == IB_QPT_UD) ret = cma_send_sidr_rep(id_priv, IB_SIDR_SUCCESS, conn_param->private_data, conn_param->private_data_len); @@ -2819,7 +2816,7 @@ int rdma_reject(struct rdma_cm_id *id, const void *private_data, switch (rdma_node_get_transport(id->device->node_type)) { case RDMA_TRANSPORT_IB: - if (cma_is_ud_ps(id->ps)) + if (id->qp_type == IB_QPT_UD) ret = cma_send_sidr_rep(id_priv, IB_SIDR_REJECT, private_data, private_data_len); else diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c index b3fa798525b2..7109d5d23ba5 
100644 --- a/drivers/infiniband/core/ucma.c +++ b/drivers/infiniband/core/ucma.c @@ -367,13 +367,28 @@ done: return ret; } -static ssize_t ucma_create_id(struct ucma_file *file, - const char __user *inbuf, - int in_len, int out_len) +static int ucma_get_qp_type(struct rdma_ucm_create_id *cmd, enum ib_qp_type *qp_type) +{ + switch (cmd->ps) { + case RDMA_PS_TCP: + *qp_type = IB_QPT_RC; + return 0; + case RDMA_PS_UDP: + case RDMA_PS_IPOIB: + *qp_type = IB_QPT_UD; + return 0; + default: + return -EINVAL; + } +} + +static ssize_t ucma_create_id(struct ucma_file *file, const char __user *inbuf, + int in_len, int out_len) { struct rdma_ucm_create_id cmd; struct rdma_ucm_create_id_resp resp; struct ucma_context *ctx; + enum ib_qp_type qp_type; int ret; if (out_len < sizeof(resp)) @@ -382,6 +397,10 @@ static ssize_t ucma_create_id(struct ucma_file *file, if (copy_from_user(&cmd, inbuf, sizeof(cmd))) return -EFAULT; + ret = ucma_get_qp_type(&cmd, &qp_type); + if (ret) + return ret; + mutex_lock(&file->mut); ctx = ucma_alloc_ctx(file); mutex_unlock(&file->mut); @@ -389,7 +408,7 @@ static ssize_t ucma_create_id(struct ucma_file *file, return -ENOMEM; ctx->uid = cmd.uid; - ctx->cm_id = rdma_create_id(ucma_event_handler, ctx, cmd.ps); + ctx->cm_id = rdma_create_id(ucma_event_handler, ctx, cmd.ps, qp_type); if (IS_ERR(ctx->cm_id)) { ret = PTR_ERR(ctx->cm_id); goto err1; diff --git a/drivers/infiniband/ulp/iser/iser_verbs.c b/drivers/infiniband/ulp/iser/iser_verbs.c index 9876865732f7..ede1475bee09 100644 --- a/drivers/infiniband/ulp/iser/iser_verbs.c +++ b/drivers/infiniband/ulp/iser/iser_verbs.c @@ -548,7 +548,7 @@ int iser_connect(struct iser_conn *ib_conn, iser_conn_get(ib_conn); /* ref ib conn's cma id */ ib_conn->cma_id = rdma_create_id(iser_cma_handler, (void *)ib_conn, - RDMA_PS_TCP); + RDMA_PS_TCP, IB_QPT_RC); if (IS_ERR(ib_conn->cma_id)) { err = PTR_ERR(ib_conn->cma_id); iser_err("rdma_create_id failed: %d\n", err); diff --git a/include/rdma/rdma_cm.h 
b/include/rdma/rdma_cm.h index d5b2265b7ce8..26977c149c41 100644 --- a/include/rdma/rdma_cm.h +++ b/include/rdma/rdma_cm.h @@ -144,6 +144,7 @@ struct rdma_cm_id { rdma_cm_event_handler event_handler; struct rdma_route route; enum rdma_port_space ps; + enum ib_qp_type qp_type; u8 port_num; }; @@ -154,9 +155,11 @@ struct rdma_cm_id { * returned rdma_id. * @context: User specified context associated with the id. * @ps: RDMA port space. + * @qp_type: type of queue pair associated with the id. */ struct rdma_cm_id *rdma_create_id(rdma_cm_event_handler event_handler, - void *context, enum rdma_port_space ps); + void *context, enum rdma_port_space ps, + enum ib_qp_type qp_type); /** * rdma_destroy_id - Destroys an RDMA identifier. diff --git a/net/9p/trans_rdma.c b/net/9p/trans_rdma.c index 150e0c4bbf40..3640e83eef8f 100644 --- a/net/9p/trans_rdma.c +++ b/net/9p/trans_rdma.c @@ -591,7 +591,8 @@ rdma_create_trans(struct p9_client *client, const char *addr, char *args) return -ENOMEM; /* Create the RDMA CM ID */ - rdma->cm_id = rdma_create_id(p9_cm_event_handler, client, RDMA_PS_TCP); + rdma->cm_id = rdma_create_id(p9_cm_event_handler, client, RDMA_PS_TCP, + IB_QPT_RC); if (IS_ERR(rdma->cm_id)) goto error; diff --git a/net/rds/ib.c b/net/rds/ib.c index cce19f95c624..3b83086bcc30 100644 --- a/net/rds/ib.c +++ b/net/rds/ib.c @@ -325,7 +325,7 @@ static int rds_ib_laddr_check(__be32 addr) /* Create a CMA ID and try to bind it. This catches both * IB and iWARP capable NICs. 
*/ - cm_id = rdma_create_id(NULL, NULL, RDMA_PS_TCP); + cm_id = rdma_create_id(NULL, NULL, RDMA_PS_TCP, IB_QPT_RC); if (IS_ERR(cm_id)) return PTR_ERR(cm_id); diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c index ee369d201a65..fd453dd5124b 100644 --- a/net/rds/ib_cm.c +++ b/net/rds/ib_cm.c @@ -587,7 +587,7 @@ int rds_ib_conn_connect(struct rds_connection *conn) /* XXX I wonder what affect the port space has */ /* delegate cm event handler to rdma_transport */ ic->i_cm_id = rdma_create_id(rds_rdma_cm_event_handler, conn, - RDMA_PS_TCP); + RDMA_PS_TCP, IB_QPT_RC); if (IS_ERR(ic->i_cm_id)) { ret = PTR_ERR(ic->i_cm_id); ic->i_cm_id = NULL; diff --git a/net/rds/iw.c b/net/rds/iw.c index 5a9676fe594f..f7474844f096 100644 --- a/net/rds/iw.c +++ b/net/rds/iw.c @@ -226,7 +226,7 @@ static int rds_iw_laddr_check(__be32 addr) /* Create a CMA ID and try to bind it. This catches both * IB and iWARP capable NICs. */ - cm_id = rdma_create_id(NULL, NULL, RDMA_PS_TCP); + cm_id = rdma_create_id(NULL, NULL, RDMA_PS_TCP, IB_QPT_RC); if (IS_ERR(cm_id)) return PTR_ERR(cm_id); diff --git a/net/rds/iw_cm.c b/net/rds/iw_cm.c index 3a60a15d1b4a..c12db66f24c7 100644 --- a/net/rds/iw_cm.c +++ b/net/rds/iw_cm.c @@ -522,7 +522,7 @@ int rds_iw_conn_connect(struct rds_connection *conn) /* XXX I wonder what affect the port space has */ /* delegate cm event handler to rdma_transport */ ic->i_cm_id = rdma_create_id(rds_rdma_cm_event_handler, conn, - RDMA_PS_TCP); + RDMA_PS_TCP, IB_QPT_RC); if (IS_ERR(ic->i_cm_id)) { ret = PTR_ERR(ic->i_cm_id); ic->i_cm_id = NULL; diff --git a/net/rds/rdma_transport.c b/net/rds/rdma_transport.c index 4195a0539829..f8760e1b6688 100644 --- a/net/rds/rdma_transport.c +++ b/net/rds/rdma_transport.c @@ -158,7 +158,8 @@ static int rds_rdma_listen_init(void) struct rdma_cm_id *cm_id; int ret; - cm_id = rdma_create_id(rds_rdma_cm_event_handler, NULL, RDMA_PS_TCP); + cm_id = rdma_create_id(rds_rdma_cm_event_handler, NULL, RDMA_PS_TCP, + IB_QPT_RC); if (IS_ERR(cm_id)) { ret = 
PTR_ERR(cm_id); printk(KERN_ERR "RDS/RDMA: failed to setup listener, " diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index 1a10dcd999ea..afff475642ca 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -695,7 +695,8 @@ static struct svc_xprt *svc_rdma_create(struct svc_serv *serv, return ERR_PTR(-ENOMEM); xprt = &cma_xprt->sc_xprt; - listen_id = rdma_create_id(rdma_listen_handler, cma_xprt, RDMA_PS_TCP); + listen_id = rdma_create_id(rdma_listen_handler, cma_xprt, RDMA_PS_TCP, + IB_QPT_RC); if (IS_ERR(listen_id)) { ret = PTR_ERR(listen_id); dprintk("svcrdma: rdma_create_id failed = %d\n", ret); diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index d4297dc43dc4..80f8da344df5 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -387,7 +387,7 @@ rpcrdma_create_id(struct rpcrdma_xprt *xprt, init_completion(&ia->ri_done); - id = rdma_create_id(rpcrdma_conn_upcall, xprt, RDMA_PS_TCP); + id = rdma_create_id(rpcrdma_conn_upcall, xprt, RDMA_PS_TCP, IB_QPT_RC); if (IS_ERR(id)) { rc = PTR_ERR(id); dprintk("RPC: %s: rdma_create_id() failed %i\n", -- cgit v1.2.3