summaryrefslogtreecommitdiff
path: root/net
diff options
context:
space:
mode:
Diffstat (limited to 'net')
-rw-r--r--net/9p/client.c25
-rw-r--r--net/9p/trans_fd.c110
-rw-r--r--net/9p/trans_rdma.c26
-rw-r--r--net/9p/trans_virtio.c3
-rw-r--r--net/Kconfig2
-rw-r--r--net/atm/clip.c2
-rw-r--r--net/atm/lec.c10
-rw-r--r--net/atm/mpc.c6
-rw-r--r--net/atm/raw.c2
-rw-r--r--net/atm/signaling.c2
-rw-r--r--net/ax25/ax25_in.c2
-rw-r--r--net/bluetooth/l2cap_sock.c6
-rw-r--r--net/bluetooth/rfcomm/core.c4
-rw-r--r--net/bluetooth/rfcomm/sock.c4
-rw-r--r--net/bluetooth/sco.c2
-rw-r--r--net/bridge/br_input.c2
-rw-r--r--net/bridge/br_vlan.c7
-rw-r--r--net/caif/caif_socket.c4
-rw-r--r--net/ceph/crush/mapper.c85
-rw-r--r--net/ceph/debugfs.c55
-rw-r--r--net/ceph/messenger.c8
-rw-r--r--net/ceph/osd_client.c41
-rw-r--r--net/ceph/osdmap.c993
-rw-r--r--net/core/dev.c2
-rw-r--r--net/core/filter.c9
-rw-r--r--net/core/flow.c8
-rw-r--r--net/core/netclassid_cgroup.c15
-rw-r--r--net/core/netprio_cgroup.c41
-rw-r--r--net/core/pktgen.c8
-rw-r--r--net/core/skbuff.c16
-rw-r--r--net/core/sock.c4
-rw-r--r--net/dccp/input.c2
-rw-r--r--net/dccp/minisocks.c2
-rw-r--r--net/decnet/dn_nsp_in.c4
-rw-r--r--net/ipv4/ip_gre.c2
-rw-r--r--net/ipv4/ip_vti.c2
-rw-r--r--net/ipv4/ping.c15
-rw-r--r--net/ipv4/route.c4
-rw-r--r--net/ipv4/tcp_input.c10
-rw-r--r--net/ipv4/tcp_ipv4.c2
-rw-r--r--net/ipv4/tcp_memcontrol.c4
-rw-r--r--net/ipv4/tcp_minisocks.c2
-rw-r--r--net/ipv6/route.c5
-rw-r--r--net/ipv6/tcp_ipv6.c2
-rw-r--r--net/iucv/af_iucv.c4
-rw-r--r--net/iucv/iucv.c127
-rw-r--r--net/key/af_key.c2
-rw-r--r--net/l2tp/l2tp_ppp.c4
-rw-r--r--net/netlink/af_netlink.c4
-rw-r--r--net/netrom/af_netrom.c2
-rw-r--r--net/nfc/llcp_core.c2
-rw-r--r--net/packet/af_packet.c6
-rw-r--r--net/phonet/pep-gprs.c4
-rw-r--r--net/phonet/pep.c8
-rw-r--r--net/rds/tcp.h4
-rw-r--r--net/rds/tcp_listen.c6
-rw-r--r--net/rds/tcp_recv.c8
-rw-r--r--net/rose/af_rose.c2
-rw-r--r--net/rxrpc/ar-input.c6
-rw-r--r--net/rxrpc/ar-internal.h2
-rw-r--r--net/sctp/associola.c82
-rw-r--r--net/sctp/sm_statefuns.c2
-rw-r--r--net/sctp/socket.c50
-rw-r--r--net/sctp/ulpevent.c8
-rw-r--r--net/sctp/ulpqueue.c4
-rw-r--r--net/sunrpc/Kconfig39
-rw-r--r--net/sunrpc/Makefile3
-rw-r--r--net/sunrpc/backchannel_rqst.c93
-rw-r--r--net/sunrpc/clnt.c81
-rw-r--r--net/sunrpc/sched.c3
-rw-r--r--net/sunrpc/svcsock.c28
-rw-r--r--net/sunrpc/xdr.c22
-rw-r--r--net/sunrpc/xprt.c12
-rw-r--r--net/sunrpc/xprtrdma/Makefile4
-rw-r--r--net/sunrpc/xprtrdma/rpc_rdma.c4
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_recvfrom.c12
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_sendto.c2
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_transport.c3
-rw-r--r--net/sunrpc/xprtrdma/transport.c10
-rw-r--r--net/sunrpc/xprtsock.c70
-rw-r--r--net/tipc/server.c4
-rw-r--r--net/tipc/socket.c6
-rw-r--r--net/unix/af_unix.c6
-rw-r--r--net/vmw_vsock/vmci_transport_notify.c2
-rw-r--r--net/vmw_vsock/vmci_transport_notify_qstate.c4
-rw-r--r--net/x25/af_x25.c2
-rw-r--r--net/x25/x25_in.c2
87 files changed, 1498 insertions, 811 deletions
diff --git a/net/9p/client.c b/net/9p/client.c
index 9186550d77a6..0004cbaac4a4 100644
--- a/net/9p/client.c
+++ b/net/9p/client.c
@@ -415,9 +415,17 @@ static void p9_free_req(struct p9_client *c, struct p9_req_t *r)
* req: request received
*
*/
-void p9_client_cb(struct p9_client *c, struct p9_req_t *req)
+void p9_client_cb(struct p9_client *c, struct p9_req_t *req, int status)
{
p9_debug(P9_DEBUG_MUX, " tag %d\n", req->tc->tag);
+
+ /*
+ * This barrier is needed to make sure any change made to req before
+ * the other thread wakes up will indeed be seen by the waiting side.
+ */
+ smp_wmb();
+ req->status = status;
+
wake_up(req->wq);
p9_debug(P9_DEBUG_MUX, "wakeup: %d\n", req->tc->tag);
}
@@ -655,16 +663,13 @@ static int p9_client_flush(struct p9_client *c, struct p9_req_t *oldreq)
if (IS_ERR(req))
return PTR_ERR(req);
-
/*
* if we haven't received a response for oldreq,
* remove it from the list
*/
- if (oldreq->status == REQ_STATUS_FLSH) {
- spin_lock(&c->lock);
- list_del(&oldreq->req_list);
- spin_unlock(&c->lock);
- }
+ if (oldreq->status == REQ_STATUS_SENT)
+ if (c->trans_mod->cancelled)
+ c->trans_mod->cancelled(c, oldreq);
p9_free_req(c, req);
return 0;
@@ -751,6 +756,12 @@ again:
err = wait_event_interruptible(*req->wq,
req->status >= REQ_STATUS_RCVD);
+ /*
+ * Make sure our req is coherent with regard to updates in other
+ * threads - echoes to wmb() in the callback
+ */
+ smp_rmb();
+
if ((err == -ERESTARTSYS) && (c->status == Connected)
&& (type == P9_TFLUSH)) {
sigpending = 1;
diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c
index b7bd7f2961bf..80d08f6664cb 100644
--- a/net/9p/trans_fd.c
+++ b/net/9p/trans_fd.c
@@ -66,20 +66,6 @@ struct p9_fd_opts {
int privport;
};
-/**
- * struct p9_trans_fd - transport state
- * @rd: reference to file to read from
- * @wr: reference of file to write to
- * @conn: connection state reference
- *
- */
-
-struct p9_trans_fd {
- struct file *rd;
- struct file *wr;
- struct p9_conn *conn;
-};
-
/*
* Option Parsing (code inspired by NFS code)
* - a little lazy - parse all fd-transport options
@@ -159,6 +145,20 @@ struct p9_conn {
unsigned long wsched;
};
+/**
+ * struct p9_trans_fd - transport state
+ * @rd: reference to file to read from
+ * @wr: reference of file to write to
+ * @conn: connection state reference
+ *
+ */
+
+struct p9_trans_fd {
+ struct file *rd;
+ struct file *wr;
+ struct p9_conn conn;
+};
+
static void p9_poll_workfn(struct work_struct *work);
static DEFINE_SPINLOCK(p9_poll_lock);
@@ -212,15 +212,9 @@ static void p9_conn_cancel(struct p9_conn *m, int err)
m->err = err;
list_for_each_entry_safe(req, rtmp, &m->req_list, req_list) {
- req->status = REQ_STATUS_ERROR;
- if (!req->t_err)
- req->t_err = err;
list_move(&req->req_list, &cancel_list);
}
list_for_each_entry_safe(req, rtmp, &m->unsent_req_list, req_list) {
- req->status = REQ_STATUS_ERROR;
- if (!req->t_err)
- req->t_err = err;
list_move(&req->req_list, &cancel_list);
}
spin_unlock_irqrestore(&m->client->lock, flags);
@@ -228,7 +222,9 @@ static void p9_conn_cancel(struct p9_conn *m, int err)
list_for_each_entry_safe(req, rtmp, &cancel_list, req_list) {
p9_debug(P9_DEBUG_ERROR, "call back req %p\n", req);
list_del(&req->req_list);
- p9_client_cb(m->client, req);
+ if (!req->t_err)
+ req->t_err = err;
+ p9_client_cb(m->client, req, REQ_STATUS_ERROR);
}
}
@@ -302,6 +298,7 @@ static void p9_read_work(struct work_struct *work)
{
int n, err;
struct p9_conn *m;
+ int status = REQ_STATUS_ERROR;
m = container_of(work, struct p9_conn, rq);
@@ -348,8 +345,7 @@ static void p9_read_work(struct work_struct *work)
"mux %p pkt: size: %d bytes tag: %d\n", m, n, tag);
m->req = p9_tag_lookup(m->client, tag);
- if (!m->req || (m->req->status != REQ_STATUS_SENT &&
- m->req->status != REQ_STATUS_FLSH)) {
+ if (!m->req || (m->req->status != REQ_STATUS_SENT)) {
p9_debug(P9_DEBUG_ERROR, "Unexpected packet tag %d\n",
tag);
err = -EIO;
@@ -375,10 +371,10 @@ static void p9_read_work(struct work_struct *work)
p9_debug(P9_DEBUG_TRANS, "got new packet\n");
spin_lock(&m->client->lock);
if (m->req->status != REQ_STATUS_ERROR)
- m->req->status = REQ_STATUS_RCVD;
+ status = REQ_STATUS_RCVD;
list_del(&m->req->req_list);
spin_unlock(&m->client->lock);
- p9_client_cb(m->client, m->req);
+ p9_client_cb(m->client, m->req, status);
m->rbuf = NULL;
m->rpos = 0;
m->rsize = 0;
@@ -573,21 +569,19 @@ p9_pollwait(struct file *filp, wait_queue_head_t *wait_address, poll_table *p)
}
/**
- * p9_conn_create - allocate and initialize the per-session mux data
+ * p9_conn_create - initialize the per-session mux data
* @client: client instance
*
* Note: Creates the polling task if this is the first session.
*/
-static struct p9_conn *p9_conn_create(struct p9_client *client)
+static void p9_conn_create(struct p9_client *client)
{
int n;
- struct p9_conn *m;
+ struct p9_trans_fd *ts = client->trans;
+ struct p9_conn *m = &ts->conn;
p9_debug(P9_DEBUG_TRANS, "client %p msize %d\n", client, client->msize);
- m = kzalloc(sizeof(struct p9_conn), GFP_KERNEL);
- if (!m)
- return ERR_PTR(-ENOMEM);
INIT_LIST_HEAD(&m->mux_list);
m->client = client;
@@ -609,8 +603,6 @@ static struct p9_conn *p9_conn_create(struct p9_client *client)
p9_debug(P9_DEBUG_TRANS, "mux %p can write\n", m);
set_bit(Wpending, &m->wsched);
}
-
- return m;
}
/**
@@ -669,7 +661,7 @@ static int p9_fd_request(struct p9_client *client, struct p9_req_t *req)
{
int n;
struct p9_trans_fd *ts = client->trans;
- struct p9_conn *m = ts->conn;
+ struct p9_conn *m = &ts->conn;
p9_debug(P9_DEBUG_TRANS, "mux %p task %p tcall %p id %d\n",
m, current, req->tc, req->tc->id);
@@ -704,14 +696,26 @@ static int p9_fd_cancel(struct p9_client *client, struct p9_req_t *req)
list_del(&req->req_list);
req->status = REQ_STATUS_FLSHD;
ret = 0;
- } else if (req->status == REQ_STATUS_SENT)
- req->status = REQ_STATUS_FLSH;
-
+ }
spin_unlock(&client->lock);
return ret;
}
+static int p9_fd_cancelled(struct p9_client *client, struct p9_req_t *req)
+{
+ p9_debug(P9_DEBUG_TRANS, "client %p req %p\n", client, req);
+
+ /* we haven't received a response for oldreq,
+ * remove it from the list.
+ */
+ spin_lock(&client->lock);
+ list_del(&req->req_list);
+ spin_unlock(&client->lock);
+
+ return 0;
+}
+
/**
* parse_opts - parse mount options into p9_fd_opts structure
* @params: options string passed from mount
@@ -780,7 +784,7 @@ static int parse_opts(char *params, struct p9_fd_opts *opts)
static int p9_fd_open(struct p9_client *client, int rfd, int wfd)
{
- struct p9_trans_fd *ts = kmalloc(sizeof(struct p9_trans_fd),
+ struct p9_trans_fd *ts = kzalloc(sizeof(struct p9_trans_fd),
GFP_KERNEL);
if (!ts)
return -ENOMEM;
@@ -806,9 +810,8 @@ static int p9_socket_open(struct p9_client *client, struct socket *csocket)
{
struct p9_trans_fd *p;
struct file *file;
- int ret;
- p = kmalloc(sizeof(struct p9_trans_fd), GFP_KERNEL);
+ p = kzalloc(sizeof(struct p9_trans_fd), GFP_KERNEL);
if (!p)
return -ENOMEM;
@@ -829,20 +832,12 @@ static int p9_socket_open(struct p9_client *client, struct socket *csocket)
p->rd->f_flags |= O_NONBLOCK;
- p->conn = p9_conn_create(client);
- if (IS_ERR(p->conn)) {
- ret = PTR_ERR(p->conn);
- p->conn = NULL;
- kfree(p);
- sockfd_put(csocket);
- sockfd_put(csocket);
- return ret;
- }
+ p9_conn_create(client);
return 0;
}
/**
- * p9_mux_destroy - cancels all pending requests and frees mux resources
+ * p9_mux_destroy - cancels all pending requests of mux
* @m: mux to destroy
*
*/
@@ -859,7 +854,6 @@ static void p9_conn_destroy(struct p9_conn *m)
p9_conn_cancel(m, -ECONNRESET);
m->client = NULL;
- kfree(m);
}
/**
@@ -881,7 +875,7 @@ static void p9_fd_close(struct p9_client *client)
client->status = Disconnected;
- p9_conn_destroy(ts->conn);
+ p9_conn_destroy(&ts->conn);
if (ts->rd)
fput(ts->rd);
@@ -1033,14 +1027,7 @@ p9_fd_create(struct p9_client *client, const char *addr, char *args)
return err;
p = (struct p9_trans_fd *) client->trans;
- p->conn = p9_conn_create(client);
- if (IS_ERR(p->conn)) {
- err = PTR_ERR(p->conn);
- p->conn = NULL;
- fput(p->rd);
- fput(p->wr);
- return err;
- }
+ p9_conn_create(client);
return 0;
}
@@ -1053,6 +1040,7 @@ static struct p9_trans_module p9_tcp_trans = {
.close = p9_fd_close,
.request = p9_fd_request,
.cancel = p9_fd_cancel,
+ .cancelled = p9_fd_cancelled,
.owner = THIS_MODULE,
};
@@ -1064,6 +1052,7 @@ static struct p9_trans_module p9_unix_trans = {
.close = p9_fd_close,
.request = p9_fd_request,
.cancel = p9_fd_cancel,
+ .cancelled = p9_fd_cancelled,
.owner = THIS_MODULE,
};
@@ -1075,6 +1064,7 @@ static struct p9_trans_module p9_fd_trans = {
.close = p9_fd_close,
.request = p9_fd_request,
.cancel = p9_fd_cancel,
+ .cancelled = p9_fd_cancelled,
.owner = THIS_MODULE,
};
diff --git a/net/9p/trans_rdma.c b/net/9p/trans_rdma.c
index 8f68df5d2973..14ad43b5cf89 100644
--- a/net/9p/trans_rdma.c
+++ b/net/9p/trans_rdma.c
@@ -193,6 +193,8 @@ static int parse_opts(char *params, struct p9_rdma_opts *opts)
if (!*p)
continue;
token = match_token(p, tokens, args);
+ if (token == Opt_err)
+ continue;
r = match_int(&args[0], &option);
if (r < 0) {
p9_debug(P9_DEBUG_ERROR,
@@ -305,8 +307,7 @@ handle_recv(struct p9_client *client, struct p9_trans_rdma *rdma,
}
req->rc = c->rc;
- req->status = REQ_STATUS_RCVD;
- p9_client_cb(client, req);
+ p9_client_cb(client, req, REQ_STATUS_RCVD);
return;
@@ -511,6 +512,11 @@ dont_need_post_recv:
goto send_error;
}
+ /* Mark request as `sent' *before* we actually send it,
+ * because doing if after could erase the REQ_STATUS_RCVD
+ * status in case of a very fast reply.
+ */
+ req->status = REQ_STATUS_SENT;
err = ib_post_send(rdma->qp, &wr, &bad_wr);
if (err)
goto send_error;
@@ -520,6 +526,7 @@ dont_need_post_recv:
/* Handle errors that happened during or while preparing the send: */
send_error:
+ req->status = REQ_STATUS_ERROR;
kfree(c);
p9_debug(P9_DEBUG_ERROR, "Error %d in rdma_request()\n", err);
@@ -582,12 +589,24 @@ static struct p9_trans_rdma *alloc_rdma(struct p9_rdma_opts *opts)
return rdma;
}
-/* its not clear to me we can do anything after send has been posted */
static int rdma_cancel(struct p9_client *client, struct p9_req_t *req)
{
+ /* Nothing to do here.
+ * We will take care of it (if we have to) in rdma_cancelled()
+ */
return 1;
}
+/* A request has been fully flushed without a reply.
+ * That means we have posted one buffer in excess.
+ */
+static int rdma_cancelled(struct p9_client *client, struct p9_req_t *req)
+{
+ struct p9_trans_rdma *rdma = client->trans;
+ atomic_inc(&rdma->excess_rc);
+ return 0;
+}
+
/**
* trans_create_rdma - Transport method for creating atransport instance
* @client: client instance
@@ -721,6 +740,7 @@ static struct p9_trans_module p9_rdma_trans = {
.close = rdma_close,
.request = rdma_request,
.cancel = rdma_cancel,
+ .cancelled = rdma_cancelled,
};
/**
diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c
index ac2666c1d011..6940d8fe8971 100644
--- a/net/9p/trans_virtio.c
+++ b/net/9p/trans_virtio.c
@@ -164,8 +164,7 @@ static void req_done(struct virtqueue *vq)
p9_debug(P9_DEBUG_TRANS, ": rc %p\n", rc);
p9_debug(P9_DEBUG_TRANS, ": lookup tag %d\n", rc->tag);
req = p9_tag_lookup(chan->client, rc->tag);
- req->status = REQ_STATUS_RCVD;
- p9_client_cb(chan->client, req);
+ p9_client_cb(chan->client, req, REQ_STATUS_RCVD);
}
}
diff --git a/net/Kconfig b/net/Kconfig
index d1f6f968fc09..d92afe4204d9 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -243,7 +243,7 @@ config XPS
default y
config CGROUP_NET_PRIO
- tristate "Network priority cgroup"
+ bool "Network priority cgroup"
depends on CGROUPS
---help---
Cgroup subsystem for use in assigning processes to network priorities on
diff --git a/net/atm/clip.c b/net/atm/clip.c
index 8215f7cb170b..ba291ce4bdff 100644
--- a/net/atm/clip.c
+++ b/net/atm/clip.c
@@ -68,7 +68,7 @@ static int to_atmarpd(enum atmarp_ctrl_type type, int itf, __be32 ip)
sk = sk_atm(atmarpd);
skb_queue_tail(&sk->sk_receive_queue, skb);
- sk->sk_data_ready(sk, skb->len);
+ sk->sk_data_ready(sk);
return 0;
}
diff --git a/net/atm/lec.c b/net/atm/lec.c
index 5a2f602d07e1..4c5b8ba0f84f 100644
--- a/net/atm/lec.c
+++ b/net/atm/lec.c
@@ -152,7 +152,7 @@ static void lec_handle_bridge(struct sk_buff *skb, struct net_device *dev)
atm_force_charge(priv->lecd, skb2->truesize);
sk = sk_atm(priv->lecd);
skb_queue_tail(&sk->sk_receive_queue, skb2);
- sk->sk_data_ready(sk, skb2->len);
+ sk->sk_data_ready(sk);
}
}
#endif /* defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE) */
@@ -447,7 +447,7 @@ static int lec_atm_send(struct atm_vcc *vcc, struct sk_buff *skb)
atm_force_charge(priv->lecd, skb2->truesize);
sk = sk_atm(priv->lecd);
skb_queue_tail(&sk->sk_receive_queue, skb2);
- sk->sk_data_ready(sk, skb2->len);
+ sk->sk_data_ready(sk);
}
}
#endif /* defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE) */
@@ -530,13 +530,13 @@ send_to_lecd(struct lec_priv *priv, atmlec_msg_type type,
atm_force_charge(priv->lecd, skb->truesize);
sk = sk_atm(priv->lecd);
skb_queue_tail(&sk->sk_receive_queue, skb);
- sk->sk_data_ready(sk, skb->len);
+ sk->sk_data_ready(sk);
if (data != NULL) {
pr_debug("about to send %d bytes of data\n", data->len);
atm_force_charge(priv->lecd, data->truesize);
skb_queue_tail(&sk->sk_receive_queue, data);
- sk->sk_data_ready(sk, skb->len);
+ sk->sk_data_ready(sk);
}
return 0;
@@ -616,7 +616,7 @@ static void lec_push(struct atm_vcc *vcc, struct sk_buff *skb)
pr_debug("%s: To daemon\n", dev->name);
skb_queue_tail(&sk->sk_receive_queue, skb);
- sk->sk_data_ready(sk, skb->len);
+ sk->sk_data_ready(sk);
} else { /* Data frame, queue to protocol handlers */
struct lec_arp_table *entry;
unsigned char *src, *dst;
diff --git a/net/atm/mpc.c b/net/atm/mpc.c
index 91dc58f1124d..e8e0e7a8a23d 100644
--- a/net/atm/mpc.c
+++ b/net/atm/mpc.c
@@ -706,7 +706,7 @@ static void mpc_push(struct atm_vcc *vcc, struct sk_buff *skb)
dprintk("(%s) control packet arrived\n", dev->name);
/* Pass control packets to daemon */
skb_queue_tail(&sk->sk_receive_queue, skb);
- sk->sk_data_ready(sk, skb->len);
+ sk->sk_data_ready(sk);
return;
}
@@ -992,7 +992,7 @@ int msg_to_mpoad(struct k_message *mesg, struct mpoa_client *mpc)
sk = sk_atm(mpc->mpoad_vcc);
skb_queue_tail(&sk->sk_receive_queue, skb);
- sk->sk_data_ready(sk, skb->len);
+ sk->sk_data_ready(sk);
return 0;
}
@@ -1273,7 +1273,7 @@ static void purge_egress_shortcut(struct atm_vcc *vcc, eg_cache_entry *entry)
sk = sk_atm(vcc);
skb_queue_tail(&sk->sk_receive_queue, skb);
- sk->sk_data_ready(sk, skb->len);
+ sk->sk_data_ready(sk);
dprintk("exiting\n");
}
diff --git a/net/atm/raw.c b/net/atm/raw.c
index b4f7b9ff3c74..2e17e97a7a8b 100644
--- a/net/atm/raw.c
+++ b/net/atm/raw.c
@@ -25,7 +25,7 @@ static void atm_push_raw(struct atm_vcc *vcc, struct sk_buff *skb)
struct sock *sk = sk_atm(vcc);
skb_queue_tail(&sk->sk_receive_queue, skb);
- sk->sk_data_ready(sk, skb->len);
+ sk->sk_data_ready(sk);
}
}
diff --git a/net/atm/signaling.c b/net/atm/signaling.c
index 4176887e72eb..523bce72f698 100644
--- a/net/atm/signaling.c
+++ b/net/atm/signaling.c
@@ -51,7 +51,7 @@ static void sigd_put_skb(struct sk_buff *skb)
#endif
atm_force_charge(sigd, skb->truesize);
skb_queue_tail(&sk_atm(sigd)->sk_receive_queue, skb);
- sk_atm(sigd)->sk_data_ready(sk_atm(sigd), skb->len);
+ sk_atm(sigd)->sk_data_ready(sk_atm(sigd));
}
static void modify_qos(struct atm_vcc *vcc, struct atmsvc_msg *msg)
diff --git a/net/ax25/ax25_in.c b/net/ax25/ax25_in.c
index 96f4cab3a2f9..7ed8ab724819 100644
--- a/net/ax25/ax25_in.c
+++ b/net/ax25/ax25_in.c
@@ -422,7 +422,7 @@ static int ax25_rcv(struct sk_buff *skb, struct net_device *dev,
if (sk) {
if (!sock_flag(sk, SOCK_DEAD))
- sk->sk_data_ready(sk, skb->len);
+ sk->sk_data_ready(sk);
sock_put(sk);
} else {
free:
diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c
index f59e00c2daa9..ef5e5b04f34f 100644
--- a/net/bluetooth/l2cap_sock.c
+++ b/net/bluetooth/l2cap_sock.c
@@ -1271,7 +1271,7 @@ static void l2cap_sock_teardown_cb(struct l2cap_chan *chan, int err)
if (parent) {
bt_accept_unlink(sk);
- parent->sk_data_ready(parent, 0);
+ parent->sk_data_ready(parent);
} else {
sk->sk_state_change(sk);
}
@@ -1327,7 +1327,7 @@ static void l2cap_sock_ready_cb(struct l2cap_chan *chan)
sk->sk_state_change(sk);
if (parent)
- parent->sk_data_ready(parent, 0);
+ parent->sk_data_ready(parent);
release_sock(sk);
}
@@ -1340,7 +1340,7 @@ static void l2cap_sock_defer_cb(struct l2cap_chan *chan)
parent = bt_sk(sk)->parent;
if (parent)
- parent->sk_data_ready(parent, 0);
+ parent->sk_data_ready(parent);
release_sock(sk);
}
diff --git a/net/bluetooth/rfcomm/core.c b/net/bluetooth/rfcomm/core.c
index 633cceeb943e..cf620260affa 100644
--- a/net/bluetooth/rfcomm/core.c
+++ b/net/bluetooth/rfcomm/core.c
@@ -186,9 +186,9 @@ static void rfcomm_l2state_change(struct sock *sk)
rfcomm_schedule();
}
-static void rfcomm_l2data_ready(struct sock *sk, int bytes)
+static void rfcomm_l2data_ready(struct sock *sk)
{
- BT_DBG("%p bytes %d", sk, bytes);
+ BT_DBG("%p", sk);
rfcomm_schedule();
}
diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c
index eabd25ab5ad9..c603a5eb4720 100644
--- a/net/bluetooth/rfcomm/sock.c
+++ b/net/bluetooth/rfcomm/sock.c
@@ -54,7 +54,7 @@ static void rfcomm_sk_data_ready(struct rfcomm_dlc *d, struct sk_buff *skb)
atomic_add(skb->len, &sk->sk_rmem_alloc);
skb_queue_tail(&sk->sk_receive_queue, skb);
- sk->sk_data_ready(sk, skb->len);
+ sk->sk_data_ready(sk);
if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
rfcomm_dlc_throttle(d);
@@ -84,7 +84,7 @@ static void rfcomm_sk_state_change(struct rfcomm_dlc *d, int err)
sock_set_flag(sk, SOCK_ZAPPED);
bt_accept_unlink(sk);
}
- parent->sk_data_ready(parent, 0);
+ parent->sk_data_ready(parent);
} else {
if (d->state == BT_CONNECTED)
rfcomm_session_getaddr(d->session,
diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c
index ab1e6fcca4c5..c06dbd3938e8 100644
--- a/net/bluetooth/sco.c
+++ b/net/bluetooth/sco.c
@@ -1024,7 +1024,7 @@ static void sco_conn_ready(struct sco_conn *conn)
sk->sk_state = BT_CONNECTED;
/* Wake up parent */
- parent->sk_data_ready(parent, 1);
+ parent->sk_data_ready(parent);
bh_unlock_sock(parent);
diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c
index d0cca3c65f01..7985deaff52f 100644
--- a/net/bridge/br_input.c
+++ b/net/bridge/br_input.c
@@ -73,7 +73,7 @@ int br_handle_frame_finish(struct sk_buff *skb)
goto drop;
if (!br_allowed_ingress(p->br, nbp_get_vlan_info(p), skb, &vid))
- goto drop;
+ goto out;
/* insert into forwarding database after filtering to avoid spoofing */
br = p->br;
diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c
index 91510712c7a7..4a3716102789 100644
--- a/net/bridge/br_vlan.c
+++ b/net/bridge/br_vlan.c
@@ -170,7 +170,7 @@ bool br_allowed_ingress(struct net_bridge *br, struct net_port_vlans *v,
* rejected.
*/
if (!v)
- return false;
+ goto drop;
/* If vlan tx offload is disabled on bridge device and frame was
* sent from vlan device on the bridge device, it does not have
@@ -193,7 +193,7 @@ bool br_allowed_ingress(struct net_bridge *br, struct net_port_vlans *v,
* vlan untagged or priority-tagged traffic belongs to.
*/
if (pvid == VLAN_N_VID)
- return false;
+ goto drop;
/* PVID is set on this port. Any untagged or priority-tagged
* ingress frame is considered to belong to this vlan.
@@ -216,7 +216,8 @@ bool br_allowed_ingress(struct net_bridge *br, struct net_port_vlans *v,
/* Frame had a valid vlan tag. See if vlan is allowed */
if (test_bit(*vid, v->vlan_bitmap))
return true;
-
+drop:
+ kfree_skb(skb);
return false;
}
diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c
index d6be3edb7a43..e8437094d15f 100644
--- a/net/caif/caif_socket.c
+++ b/net/caif/caif_socket.c
@@ -124,7 +124,6 @@ static void caif_flow_ctrl(struct sock *sk, int mode)
static int caif_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
int err;
- int skb_len;
unsigned long flags;
struct sk_buff_head *list = &sk->sk_receive_queue;
struct caifsock *cf_sk = container_of(sk, struct caifsock, sk);
@@ -153,14 +152,13 @@ static int caif_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
* may be freed by other threads of control pulling packets
* from the queue.
*/
- skb_len = skb->len;
spin_lock_irqsave(&list->lock, flags);
if (!sock_flag(sk, SOCK_DEAD))
__skb_queue_tail(list, skb);
spin_unlock_irqrestore(&list->lock, flags);
if (!sock_flag(sk, SOCK_DEAD))
- sk->sk_data_ready(sk, skb_len);
+ sk->sk_data_ready(sk);
else
kfree_skb(skb);
return 0;
diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c
index b703790b4e44..a1ef53c04415 100644
--- a/net/ceph/crush/mapper.c
+++ b/net/ceph/crush/mapper.c
@@ -292,10 +292,12 @@ static int is_out(const struct crush_map *map,
* @outpos: our position in that vector
* @tries: number of attempts to make
* @recurse_tries: number of attempts to have recursive chooseleaf make
- * @local_tries: localized retries
- * @local_fallback_tries: localized fallback retries
+ * @local_retries: localized retries
+ * @local_fallback_retries: localized fallback retries
* @recurse_to_leaf: true if we want one device under each item of given type (chooseleaf instead of choose)
+ * @vary_r: pass r to recursive calls
* @out2: second output vector for leaf items (if @recurse_to_leaf)
+ * @parent_r: r value passed from the parent
*/
static int crush_choose_firstn(const struct crush_map *map,
struct crush_bucket *bucket,
@@ -304,10 +306,12 @@ static int crush_choose_firstn(const struct crush_map *map,
int *out, int outpos,
unsigned int tries,
unsigned int recurse_tries,
- unsigned int local_tries,
- unsigned int local_fallback_tries,
+ unsigned int local_retries,
+ unsigned int local_fallback_retries,
int recurse_to_leaf,
- int *out2)
+ unsigned int vary_r,
+ int *out2,
+ int parent_r)
{
int rep;
unsigned int ftotal, flocal;
@@ -319,8 +323,11 @@ static int crush_choose_firstn(const struct crush_map *map,
int itemtype;
int collide, reject;
- dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d\n", recurse_to_leaf ? "_LEAF" : "",
- bucket->id, x, outpos, numrep);
+ dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d tries %d recurse_tries %d local_retries %d local_fallback_retries %d parent_r %d\n",
+ recurse_to_leaf ? "_LEAF" : "",
+ bucket->id, x, outpos, numrep,
+ tries, recurse_tries, local_retries, local_fallback_retries,
+ parent_r);
for (rep = outpos; rep < numrep; rep++) {
/* keep trying until we get a non-out, non-colliding item */
@@ -335,7 +342,7 @@ static int crush_choose_firstn(const struct crush_map *map,
do {
collide = 0;
retry_bucket = 0;
- r = rep;
+ r = rep + parent_r;
/* r' = r + f_total */
r += ftotal;
@@ -344,9 +351,9 @@ static int crush_choose_firstn(const struct crush_map *map,
reject = 1;
goto reject;
}
- if (local_fallback_tries > 0 &&
+ if (local_fallback_retries > 0 &&
flocal >= (in->size>>1) &&
- flocal > local_fallback_tries)
+ flocal > local_fallback_retries)
item = bucket_perm_choose(in, x, r);
else
item = crush_bucket_choose(in, x, r);
@@ -387,16 +394,23 @@ static int crush_choose_firstn(const struct crush_map *map,
reject = 0;
if (!collide && recurse_to_leaf) {
if (item < 0) {
+ int sub_r;
+ if (vary_r)
+ sub_r = r >> (vary_r-1);
+ else
+ sub_r = 0;
if (crush_choose_firstn(map,
map->buckets[-1-item],
weight, weight_max,
x, outpos+1, 0,
out2, outpos,
recurse_tries, 0,
- local_tries,
- local_fallback_tries,
+ local_retries,
+ local_fallback_retries,
0,
- NULL) <= outpos)
+ vary_r,
+ NULL,
+ sub_r) <= outpos)
/* didn't get leaf */
reject = 1;
} else {
@@ -420,14 +434,14 @@ reject:
ftotal++;
flocal++;
- if (collide && flocal <= local_tries)
+ if (collide && flocal <= local_retries)
/* retry locally a few times */
retry_bucket = 1;
- else if (local_fallback_tries > 0 &&
- flocal <= in->size + local_fallback_tries)
+ else if (local_fallback_retries > 0 &&
+ flocal <= in->size + local_fallback_retries)
/* exhaustive bucket search */
retry_bucket = 1;
- else if (ftotal <= tries)
+ else if (ftotal < tries)
/* then retry descent */
retry_descent = 1;
else
@@ -640,10 +654,20 @@ int crush_do_rule(const struct crush_map *map,
__u32 step;
int i, j;
int numrep;
- int choose_tries = map->choose_total_tries;
- int choose_local_tries = map->choose_local_tries;
- int choose_local_fallback_tries = map->choose_local_fallback_tries;
+ /*
+ * the original choose_total_tries value was off by one (it
+ * counted "retries" and not "tries"). add one.
+ */
+ int choose_tries = map->choose_total_tries + 1;
int choose_leaf_tries = 0;
+ /*
+ * the local tries values were counted as "retries", though,
+ * and need no adjustment
+ */
+ int choose_local_retries = map->choose_local_tries;
+ int choose_local_fallback_retries = map->choose_local_fallback_tries;
+
+ int vary_r = map->chooseleaf_vary_r;
if ((__u32)ruleno >= map->max_rules) {
dprintk(" bad ruleno %d\n", ruleno);
@@ -676,13 +700,18 @@ int crush_do_rule(const struct crush_map *map,
break;
case CRUSH_RULE_SET_CHOOSE_LOCAL_TRIES:
- if (curstep->arg1 > 0)
- choose_local_tries = curstep->arg1;
+ if (curstep->arg1 >= 0)
+ choose_local_retries = curstep->arg1;
break;
case CRUSH_RULE_SET_CHOOSE_LOCAL_FALLBACK_TRIES:
- if (curstep->arg1 > 0)
- choose_local_fallback_tries = curstep->arg1;
+ if (curstep->arg1 >= 0)
+ choose_local_fallback_retries = curstep->arg1;
+ break;
+
+ case CRUSH_RULE_SET_CHOOSELEAF_VARY_R:
+ if (curstep->arg1 >= 0)
+ vary_r = curstep->arg1;
break;
case CRUSH_RULE_CHOOSELEAF_FIRSTN:
@@ -734,10 +763,12 @@ int crush_do_rule(const struct crush_map *map,
o+osize, j,
choose_tries,
recurse_tries,
- choose_local_tries,
- choose_local_fallback_tries,
+ choose_local_retries,
+ choose_local_fallback_retries,
recurse_to_leaf,
- c+osize);
+ vary_r,
+ c+osize,
+ 0);
} else {
crush_choose_indep(
map,
diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c
index 258a382e75ed..10421a4b76f8 100644
--- a/net/ceph/debugfs.c
+++ b/net/ceph/debugfs.c
@@ -53,34 +53,55 @@ static int osdmap_show(struct seq_file *s, void *p)
{
int i;
struct ceph_client *client = s->private;
+ struct ceph_osdmap *map = client->osdc.osdmap;
struct rb_node *n;
- if (client->osdc.osdmap == NULL)
+ if (map == NULL)
return 0;
- seq_printf(s, "epoch %d\n", client->osdc.osdmap->epoch);
+
+ seq_printf(s, "epoch %d\n", map->epoch);
seq_printf(s, "flags%s%s\n",
- (client->osdc.osdmap->flags & CEPH_OSDMAP_NEARFULL) ?
- " NEARFULL" : "",
- (client->osdc.osdmap->flags & CEPH_OSDMAP_FULL) ?
- " FULL" : "");
- for (n = rb_first(&client->osdc.osdmap->pg_pools); n; n = rb_next(n)) {
+ (map->flags & CEPH_OSDMAP_NEARFULL) ? " NEARFULL" : "",
+ (map->flags & CEPH_OSDMAP_FULL) ? " FULL" : "");
+
+ for (n = rb_first(&map->pg_pools); n; n = rb_next(n)) {
struct ceph_pg_pool_info *pool =
rb_entry(n, struct ceph_pg_pool_info, node);
- seq_printf(s, "pg_pool %llu pg_num %d / %d\n",
- (unsigned long long)pool->id, pool->pg_num,
- pool->pg_num_mask);
+
+ seq_printf(s, "pool %lld pg_num %u (%d) read_tier %lld write_tier %lld\n",
+ pool->id, pool->pg_num, pool->pg_num_mask,
+ pool->read_tier, pool->write_tier);
}
- for (i = 0; i < client->osdc.osdmap->max_osd; i++) {
- struct ceph_entity_addr *addr =
- &client->osdc.osdmap->osd_addr[i];
- int state = client->osdc.osdmap->osd_state[i];
+ for (i = 0; i < map->max_osd; i++) {
+ struct ceph_entity_addr *addr = &map->osd_addr[i];
+ int state = map->osd_state[i];
char sb[64];
- seq_printf(s, "\tosd%d\t%s\t%3d%%\t(%s)\n",
+ seq_printf(s, "osd%d\t%s\t%3d%%\t(%s)\t%3d%%\n",
i, ceph_pr_addr(&addr->in_addr),
- ((client->osdc.osdmap->osd_weight[i]*100) >> 16),
- ceph_osdmap_state_str(sb, sizeof(sb), state));
+ ((map->osd_weight[i]*100) >> 16),
+ ceph_osdmap_state_str(sb, sizeof(sb), state),
+ ((ceph_get_primary_affinity(map, i)*100) >> 16));
+ }
+ for (n = rb_first(&map->pg_temp); n; n = rb_next(n)) {
+ struct ceph_pg_mapping *pg =
+ rb_entry(n, struct ceph_pg_mapping, node);
+
+ seq_printf(s, "pg_temp %llu.%x [", pg->pgid.pool,
+ pg->pgid.seed);
+ for (i = 0; i < pg->pg_temp.len; i++)
+ seq_printf(s, "%s%d", (i == 0 ? "" : ","),
+ pg->pg_temp.osds[i]);
+ seq_printf(s, "]\n");
}
+ for (n = rb_first(&map->primary_temp); n; n = rb_next(n)) {
+ struct ceph_pg_mapping *pg =
+ rb_entry(n, struct ceph_pg_mapping, node);
+
+ seq_printf(s, "primary_temp %llu.%x %d\n", pg->pgid.pool,
+ pg->pgid.seed, pg->primary_temp.osd);
+ }
+
return 0;
}
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 30efc5c18622..dac7f9b98687 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -383,7 +383,7 @@ static void con_sock_state_closed(struct ceph_connection *con)
*/
/* data available on socket, or listen socket received a connect */
-static void ceph_sock_data_ready(struct sock *sk, int count_unused)
+static void ceph_sock_data_ready(struct sock *sk)
{
struct ceph_connection *con = sk->sk_user_data;
if (atomic_read(&con->msgr->stopping)) {
@@ -919,6 +919,9 @@ static bool ceph_msg_data_pages_advance(struct ceph_msg_data_cursor *cursor,
if (!bytes || cursor->page_offset)
return false; /* more bytes to process in the current page */
+ if (!cursor->resid)
+ return false; /* no more data */
+
/* Move on to the next page; offset is already at 0 */
BUG_ON(cursor->page_index >= cursor->page_count);
@@ -1004,6 +1007,9 @@ static bool ceph_msg_data_pagelist_advance(struct ceph_msg_data_cursor *cursor,
if (!bytes || cursor->offset & ~PAGE_MASK)
return false; /* more bytes to process in the current page */
+ if (!cursor->resid)
+ return false; /* no more data */
+
/* Move on to the next page */
BUG_ON(list_is_last(&cursor->page->lru, &pagelist->head));
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 82750f915865..b0dfce77656a 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -436,6 +436,7 @@ static bool osd_req_opcode_valid(u16 opcode)
case CEPH_OSD_OP_OMAPCLEAR:
case CEPH_OSD_OP_OMAPRMKEYS:
case CEPH_OSD_OP_OMAP_CMP:
+ case CEPH_OSD_OP_SETALLOCHINT:
case CEPH_OSD_OP_CLONERANGE:
case CEPH_OSD_OP_ASSERT_SRC_VERSION:
case CEPH_OSD_OP_SRC_CMPXATTR:
@@ -591,6 +592,26 @@ void osd_req_op_watch_init(struct ceph_osd_request *osd_req,
}
EXPORT_SYMBOL(osd_req_op_watch_init);
+void osd_req_op_alloc_hint_init(struct ceph_osd_request *osd_req,
+ unsigned int which,
+ u64 expected_object_size,
+ u64 expected_write_size)
+{
+ struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which,
+ CEPH_OSD_OP_SETALLOCHINT);
+
+ op->alloc_hint.expected_object_size = expected_object_size;
+ op->alloc_hint.expected_write_size = expected_write_size;
+
+ /*
+ * CEPH_OSD_OP_SETALLOCHINT op is advisory and therefore deemed
+ * not worth a feature bit. Set FAILOK per-op flag to make
+ * sure older osds don't trip over an unsupported opcode.
+ */
+ op->flags |= CEPH_OSD_OP_FLAG_FAILOK;
+}
+EXPORT_SYMBOL(osd_req_op_alloc_hint_init);
+
static void ceph_osdc_msg_data_add(struct ceph_msg *msg,
struct ceph_osd_data *osd_data)
{
@@ -681,6 +702,12 @@ static u64 osd_req_encode_op(struct ceph_osd_request *req,
dst->watch.ver = cpu_to_le64(src->watch.ver);
dst->watch.flag = src->watch.flag;
break;
+ case CEPH_OSD_OP_SETALLOCHINT:
+ dst->alloc_hint.expected_object_size =
+ cpu_to_le64(src->alloc_hint.expected_object_size);
+ dst->alloc_hint.expected_write_size =
+ cpu_to_le64(src->alloc_hint.expected_write_size);
+ break;
default:
pr_err("unsupported osd opcode %s\n",
ceph_osd_op_name(src->op));
@@ -688,7 +715,9 @@ static u64 osd_req_encode_op(struct ceph_osd_request *req,
return 0;
}
+
dst->op = cpu_to_le16(src->op);
+ dst->flags = cpu_to_le32(src->flags);
dst->payload_len = cpu_to_le32(src->payload_len);
return request_data_len;
@@ -1304,7 +1333,7 @@ static int __map_request(struct ceph_osd_client *osdc,
{
struct ceph_pg pgid;
int acting[CEPH_PG_MAX_SIZE];
- int o = -1, num = 0;
+ int num, o;
int err;
bool was_paused;
@@ -1317,11 +1346,9 @@ static int __map_request(struct ceph_osd_client *osdc,
}
req->r_pgid = pgid;
- err = ceph_calc_pg_acting(osdc->osdmap, pgid, acting);
- if (err > 0) {
- o = acting[0];
- num = err;
- }
+ num = ceph_calc_pg_acting(osdc->osdmap, pgid, acting, &o);
+ if (num < 0)
+ num = 0;
was_paused = req->r_paused;
req->r_paused = __req_should_be_paused(osdc, req);
@@ -2033,7 +2060,7 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
int skipped_map = 0;
dout("taking full map %u len %d\n", epoch, maplen);
- newmap = osdmap_decode(&p, p+maplen);
+ newmap = ceph_osdmap_decode(&p, p+maplen);
if (IS_ERR(newmap)) {
err = PTR_ERR(newmap);
goto bad;
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index aade4a5c1c07..e632b5a52f5b 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -343,7 +343,7 @@ bad:
/*
* rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid
- * to a set of osds)
+ * to a set of osds) and primary_temp (explicit primary setting)
*/
static int pgid_cmp(struct ceph_pg l, struct ceph_pg r)
{
@@ -506,7 +506,7 @@ static void __remove_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *pi)
kfree(pi);
}
-static int __decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi)
+static int decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi)
{
u8 ev, cv;
unsigned len, num;
@@ -587,7 +587,7 @@ bad:
return -EINVAL;
}
-static int __decode_pool_names(void **p, void *end, struct ceph_osdmap *map)
+static int decode_pool_names(void **p, void *end, struct ceph_osdmap *map)
{
struct ceph_pg_pool_info *pi;
u32 num, len;
@@ -633,6 +633,13 @@ void ceph_osdmap_destroy(struct ceph_osdmap *map)
rb_erase(&pg->node, &map->pg_temp);
kfree(pg);
}
+ while (!RB_EMPTY_ROOT(&map->primary_temp)) {
+ struct ceph_pg_mapping *pg =
+ rb_entry(rb_first(&map->primary_temp),
+ struct ceph_pg_mapping, node);
+ rb_erase(&pg->node, &map->primary_temp);
+ kfree(pg);
+ }
while (!RB_EMPTY_ROOT(&map->pg_pools)) {
struct ceph_pg_pool_info *pi =
rb_entry(rb_first(&map->pg_pools),
@@ -642,186 +649,516 @@ void ceph_osdmap_destroy(struct ceph_osdmap *map)
kfree(map->osd_state);
kfree(map->osd_weight);
kfree(map->osd_addr);
+ kfree(map->osd_primary_affinity);
kfree(map);
}
/*
- * adjust max osd value. reallocate arrays.
+ * Adjust max_osd value, (re)allocate arrays.
+ *
+ * The new elements are properly initialized.
*/
static int osdmap_set_max_osd(struct ceph_osdmap *map, int max)
{
u8 *state;
- struct ceph_entity_addr *addr;
u32 *weight;
+ struct ceph_entity_addr *addr;
+ int i;
- state = kcalloc(max, sizeof(*state), GFP_NOFS);
- addr = kcalloc(max, sizeof(*addr), GFP_NOFS);
- weight = kcalloc(max, sizeof(*weight), GFP_NOFS);
- if (state == NULL || addr == NULL || weight == NULL) {
+ state = krealloc(map->osd_state, max*sizeof(*state), GFP_NOFS);
+ weight = krealloc(map->osd_weight, max*sizeof(*weight), GFP_NOFS);
+ addr = krealloc(map->osd_addr, max*sizeof(*addr), GFP_NOFS);
+ if (!state || !weight || !addr) {
kfree(state);
- kfree(addr);
kfree(weight);
+ kfree(addr);
+
return -ENOMEM;
}
- /* copy old? */
- if (map->osd_state) {
- memcpy(state, map->osd_state, map->max_osd*sizeof(*state));
- memcpy(addr, map->osd_addr, map->max_osd*sizeof(*addr));
- memcpy(weight, map->osd_weight, map->max_osd*sizeof(*weight));
- kfree(map->osd_state);
- kfree(map->osd_addr);
- kfree(map->osd_weight);
+ for (i = map->max_osd; i < max; i++) {
+ state[i] = 0;
+ weight[i] = CEPH_OSD_OUT;
+ memset(addr + i, 0, sizeof(*addr));
}
map->osd_state = state;
map->osd_weight = weight;
map->osd_addr = addr;
+
+ if (map->osd_primary_affinity) {
+ u32 *affinity;
+
+ affinity = krealloc(map->osd_primary_affinity,
+ max*sizeof(*affinity), GFP_NOFS);
+ if (!affinity)
+ return -ENOMEM;
+
+ for (i = map->max_osd; i < max; i++)
+ affinity[i] = CEPH_OSD_DEFAULT_PRIMARY_AFFINITY;
+
+ map->osd_primary_affinity = affinity;
+ }
+
map->max_osd = max;
+
return 0;
}
+#define OSDMAP_WRAPPER_COMPAT_VER 7
+#define OSDMAP_CLIENT_DATA_COMPAT_VER 1
+
/*
- * decode a full map.
+ * Return 0 or error. On success, *v is set to 0 for old (v6) osdmaps,
+ * to struct_v of the client_data section for new (v7 and above)
+ * osdmaps.
*/
-struct ceph_osdmap *osdmap_decode(void **p, void *end)
+static int get_osdmap_client_data_v(void **p, void *end,
+ const char *prefix, u8 *v)
{
- struct ceph_osdmap *map;
- u16 version;
- u32 len, max, i;
- int err = -EINVAL;
- void *start = *p;
- struct ceph_pg_pool_info *pi;
+ u8 struct_v;
+
+ ceph_decode_8_safe(p, end, struct_v, e_inval);
+ if (struct_v >= 7) {
+ u8 struct_compat;
+
+ ceph_decode_8_safe(p, end, struct_compat, e_inval);
+ if (struct_compat > OSDMAP_WRAPPER_COMPAT_VER) {
+ pr_warning("got v %d cv %d > %d of %s ceph_osdmap\n",
+ struct_v, struct_compat,
+ OSDMAP_WRAPPER_COMPAT_VER, prefix);
+ return -EINVAL;
+ }
+ *p += 4; /* ignore wrapper struct_len */
+
+ ceph_decode_8_safe(p, end, struct_v, e_inval);
+ ceph_decode_8_safe(p, end, struct_compat, e_inval);
+ if (struct_compat > OSDMAP_CLIENT_DATA_COMPAT_VER) {
+ pr_warning("got v %d cv %d > %d of %s ceph_osdmap client data\n",
+ struct_v, struct_compat,
+ OSDMAP_CLIENT_DATA_COMPAT_VER, prefix);
+ return -EINVAL;
+ }
+ *p += 4; /* ignore client data struct_len */
+ } else {
+ u16 version;
+
+ *p -= 1;
+ ceph_decode_16_safe(p, end, version, e_inval);
+ if (version < 6) {
+ pr_warning("got v %d < 6 of %s ceph_osdmap\n", version,
+ prefix);
+ return -EINVAL;
+ }
- dout("osdmap_decode %p to %p len %d\n", *p, end, (int)(end - *p));
+ /* old osdmap enconding */
+ struct_v = 0;
+ }
- map = kzalloc(sizeof(*map), GFP_NOFS);
- if (map == NULL)
- return ERR_PTR(-ENOMEM);
- map->pg_temp = RB_ROOT;
+ *v = struct_v;
+ return 0;
- ceph_decode_16_safe(p, end, version, bad);
- if (version > 6) {
- pr_warning("got unknown v %d > 6 of osdmap\n", version);
- goto bad;
+e_inval:
+ return -EINVAL;
+}
+
+static int __decode_pools(void **p, void *end, struct ceph_osdmap *map,
+ bool incremental)
+{
+ u32 n;
+
+ ceph_decode_32_safe(p, end, n, e_inval);
+ while (n--) {
+ struct ceph_pg_pool_info *pi;
+ u64 pool;
+ int ret;
+
+ ceph_decode_64_safe(p, end, pool, e_inval);
+
+ pi = __lookup_pg_pool(&map->pg_pools, pool);
+ if (!incremental || !pi) {
+ pi = kzalloc(sizeof(*pi), GFP_NOFS);
+ if (!pi)
+ return -ENOMEM;
+
+ pi->id = pool;
+
+ ret = __insert_pg_pool(&map->pg_pools, pi);
+ if (ret) {
+ kfree(pi);
+ return ret;
+ }
+ }
+
+ ret = decode_pool(p, end, pi);
+ if (ret)
+ return ret;
}
- if (version < 6) {
- pr_warning("got old v %d < 6 of osdmap\n", version);
- goto bad;
+
+ return 0;
+
+e_inval:
+ return -EINVAL;
+}
+
+static int decode_pools(void **p, void *end, struct ceph_osdmap *map)
+{
+ return __decode_pools(p, end, map, false);
+}
+
+static int decode_new_pools(void **p, void *end, struct ceph_osdmap *map)
+{
+ return __decode_pools(p, end, map, true);
+}
+
+static int __decode_pg_temp(void **p, void *end, struct ceph_osdmap *map,
+ bool incremental)
+{
+ u32 n;
+
+ ceph_decode_32_safe(p, end, n, e_inval);
+ while (n--) {
+ struct ceph_pg pgid;
+ u32 len, i;
+ int ret;
+
+ ret = ceph_decode_pgid(p, end, &pgid);
+ if (ret)
+ return ret;
+
+ ceph_decode_32_safe(p, end, len, e_inval);
+
+ ret = __remove_pg_mapping(&map->pg_temp, pgid);
+ BUG_ON(!incremental && ret != -ENOENT);
+
+ if (!incremental || len > 0) {
+ struct ceph_pg_mapping *pg;
+
+ ceph_decode_need(p, end, len*sizeof(u32), e_inval);
+
+ if (len > (UINT_MAX - sizeof(*pg)) / sizeof(u32))
+ return -EINVAL;
+
+ pg = kzalloc(sizeof(*pg) + len*sizeof(u32), GFP_NOFS);
+ if (!pg)
+ return -ENOMEM;
+
+ pg->pgid = pgid;
+ pg->pg_temp.len = len;
+ for (i = 0; i < len; i++)
+ pg->pg_temp.osds[i] = ceph_decode_32(p);
+
+ ret = __insert_pg_mapping(pg, &map->pg_temp);
+ if (ret) {
+ kfree(pg);
+ return ret;
+ }
+ }
}
- ceph_decode_need(p, end, 2*sizeof(u64)+6*sizeof(u32), bad);
+ return 0;
+
+e_inval:
+ return -EINVAL;
+}
+
+static int decode_pg_temp(void **p, void *end, struct ceph_osdmap *map)
+{
+ return __decode_pg_temp(p, end, map, false);
+}
+
+static int decode_new_pg_temp(void **p, void *end, struct ceph_osdmap *map)
+{
+ return __decode_pg_temp(p, end, map, true);
+}
+
+static int __decode_primary_temp(void **p, void *end, struct ceph_osdmap *map,
+ bool incremental)
+{
+ u32 n;
+
+ ceph_decode_32_safe(p, end, n, e_inval);
+ while (n--) {
+ struct ceph_pg pgid;
+ u32 osd;
+ int ret;
+
+ ret = ceph_decode_pgid(p, end, &pgid);
+ if (ret)
+ return ret;
+
+ ceph_decode_32_safe(p, end, osd, e_inval);
+
+ ret = __remove_pg_mapping(&map->primary_temp, pgid);
+ BUG_ON(!incremental && ret != -ENOENT);
+
+ if (!incremental || osd != (u32)-1) {
+ struct ceph_pg_mapping *pg;
+
+ pg = kzalloc(sizeof(*pg), GFP_NOFS);
+ if (!pg)
+ return -ENOMEM;
+
+ pg->pgid = pgid;
+ pg->primary_temp.osd = osd;
+
+ ret = __insert_pg_mapping(pg, &map->primary_temp);
+ if (ret) {
+ kfree(pg);
+ return ret;
+ }
+ }
+ }
+
+ return 0;
+
+e_inval:
+ return -EINVAL;
+}
+
+static int decode_primary_temp(void **p, void *end, struct ceph_osdmap *map)
+{
+ return __decode_primary_temp(p, end, map, false);
+}
+
+static int decode_new_primary_temp(void **p, void *end,
+ struct ceph_osdmap *map)
+{
+ return __decode_primary_temp(p, end, map, true);
+}
+
+u32 ceph_get_primary_affinity(struct ceph_osdmap *map, int osd)
+{
+ BUG_ON(osd >= map->max_osd);
+
+ if (!map->osd_primary_affinity)
+ return CEPH_OSD_DEFAULT_PRIMARY_AFFINITY;
+
+ return map->osd_primary_affinity[osd];
+}
+
+static int set_primary_affinity(struct ceph_osdmap *map, int osd, u32 aff)
+{
+ BUG_ON(osd >= map->max_osd);
+
+ if (!map->osd_primary_affinity) {
+ int i;
+
+ map->osd_primary_affinity = kmalloc(map->max_osd*sizeof(u32),
+ GFP_NOFS);
+ if (!map->osd_primary_affinity)
+ return -ENOMEM;
+
+ for (i = 0; i < map->max_osd; i++)
+ map->osd_primary_affinity[i] =
+ CEPH_OSD_DEFAULT_PRIMARY_AFFINITY;
+ }
+
+ map->osd_primary_affinity[osd] = aff;
+
+ return 0;
+}
+
+static int decode_primary_affinity(void **p, void *end,
+ struct ceph_osdmap *map)
+{
+ u32 len, i;
+
+ ceph_decode_32_safe(p, end, len, e_inval);
+ if (len == 0) {
+ kfree(map->osd_primary_affinity);
+ map->osd_primary_affinity = NULL;
+ return 0;
+ }
+ if (len != map->max_osd)
+ goto e_inval;
+
+ ceph_decode_need(p, end, map->max_osd*sizeof(u32), e_inval);
+
+ for (i = 0; i < map->max_osd; i++) {
+ int ret;
+
+ ret = set_primary_affinity(map, i, ceph_decode_32(p));
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+
+e_inval:
+ return -EINVAL;
+}
+
+static int decode_new_primary_affinity(void **p, void *end,
+ struct ceph_osdmap *map)
+{
+ u32 n;
+
+ ceph_decode_32_safe(p, end, n, e_inval);
+ while (n--) {
+ u32 osd, aff;
+ int ret;
+
+ ceph_decode_32_safe(p, end, osd, e_inval);
+ ceph_decode_32_safe(p, end, aff, e_inval);
+
+ ret = set_primary_affinity(map, osd, aff);
+ if (ret)
+ return ret;
+
+ pr_info("osd%d primary-affinity 0x%x\n", osd, aff);
+ }
+
+ return 0;
+
+e_inval:
+ return -EINVAL;
+}
+
+/*
+ * decode a full map.
+ */
+static int osdmap_decode(void **p, void *end, struct ceph_osdmap *map)
+{
+ u8 struct_v;
+ u32 epoch = 0;
+ void *start = *p;
+ u32 max;
+ u32 len, i;
+ int err;
+
+ dout("%s %p to %p len %d\n", __func__, *p, end, (int)(end - *p));
+
+ err = get_osdmap_client_data_v(p, end, "full", &struct_v);
+ if (err)
+ goto bad;
+
+ /* fsid, epoch, created, modified */
+ ceph_decode_need(p, end, sizeof(map->fsid) + sizeof(u32) +
+ sizeof(map->created) + sizeof(map->modified), e_inval);
ceph_decode_copy(p, &map->fsid, sizeof(map->fsid));
- map->epoch = ceph_decode_32(p);
+ epoch = map->epoch = ceph_decode_32(p);
ceph_decode_copy(p, &map->created, sizeof(map->created));
ceph_decode_copy(p, &map->modified, sizeof(map->modified));
- ceph_decode_32_safe(p, end, max, bad);
- while (max--) {
- ceph_decode_need(p, end, 8 + 2, bad);
- err = -ENOMEM;
- pi = kzalloc(sizeof(*pi), GFP_NOFS);
- if (!pi)
- goto bad;
- pi->id = ceph_decode_64(p);
- err = __decode_pool(p, end, pi);
- if (err < 0) {
- kfree(pi);
- goto bad;
- }
- __insert_pg_pool(&map->pg_pools, pi);
- }
+ /* pools */
+ err = decode_pools(p, end, map);
+ if (err)
+ goto bad;
- err = __decode_pool_names(p, end, map);
- if (err < 0) {
- dout("fail to decode pool names");
+ /* pool_name */
+ err = decode_pool_names(p, end, map);
+ if (err)
goto bad;
- }
- ceph_decode_32_safe(p, end, map->pool_max, bad);
+ ceph_decode_32_safe(p, end, map->pool_max, e_inval);
- ceph_decode_32_safe(p, end, map->flags, bad);
+ ceph_decode_32_safe(p, end, map->flags, e_inval);
- max = ceph_decode_32(p);
+ /* max_osd */
+ ceph_decode_32_safe(p, end, max, e_inval);
/* (re)alloc osd arrays */
err = osdmap_set_max_osd(map, max);
- if (err < 0)
+ if (err)
goto bad;
- dout("osdmap_decode max_osd = %d\n", map->max_osd);
- /* osds */
- err = -EINVAL;
+ /* osd_state, osd_weight, osd_addrs->client_addr */
ceph_decode_need(p, end, 3*sizeof(u32) +
map->max_osd*(1 + sizeof(*map->osd_weight) +
- sizeof(*map->osd_addr)), bad);
- *p += 4; /* skip length field (should match max) */
+ sizeof(*map->osd_addr)), e_inval);
+
+ if (ceph_decode_32(p) != map->max_osd)
+ goto e_inval;
+
ceph_decode_copy(p, map->osd_state, map->max_osd);
- *p += 4; /* skip length field (should match max) */
+ if (ceph_decode_32(p) != map->max_osd)
+ goto e_inval;
+
for (i = 0; i < map->max_osd; i++)
map->osd_weight[i] = ceph_decode_32(p);
- *p += 4; /* skip length field (should match max) */
+ if (ceph_decode_32(p) != map->max_osd)
+ goto e_inval;
+
ceph_decode_copy(p, map->osd_addr, map->max_osd*sizeof(*map->osd_addr));
for (i = 0; i < map->max_osd; i++)
ceph_decode_addr(&map->osd_addr[i]);
/* pg_temp */
- ceph_decode_32_safe(p, end, len, bad);
- for (i = 0; i < len; i++) {
- int n, j;
- struct ceph_pg pgid;
- struct ceph_pg_mapping *pg;
+ err = decode_pg_temp(p, end, map);
+ if (err)
+ goto bad;
- err = ceph_decode_pgid(p, end, &pgid);
+ /* primary_temp */
+ if (struct_v >= 1) {
+ err = decode_primary_temp(p, end, map);
if (err)
goto bad;
- ceph_decode_need(p, end, sizeof(u32), bad);
- n = ceph_decode_32(p);
- err = -EINVAL;
- if (n > (UINT_MAX - sizeof(*pg)) / sizeof(u32))
- goto bad;
- ceph_decode_need(p, end, n * sizeof(u32), bad);
- err = -ENOMEM;
- pg = kmalloc(sizeof(*pg) + n*sizeof(u32), GFP_NOFS);
- if (!pg)
- goto bad;
- pg->pgid = pgid;
- pg->len = n;
- for (j = 0; j < n; j++)
- pg->osds[j] = ceph_decode_32(p);
+ }
- err = __insert_pg_mapping(pg, &map->pg_temp);
+ /* primary_affinity */
+ if (struct_v >= 2) {
+ err = decode_primary_affinity(p, end, map);
if (err)
goto bad;
- dout(" added pg_temp %lld.%x len %d\n", pgid.pool, pgid.seed,
- len);
+ } else {
+ /* XXX can this happen? */
+ kfree(map->osd_primary_affinity);
+ map->osd_primary_affinity = NULL;
}
/* crush */
- ceph_decode_32_safe(p, end, len, bad);
- dout("osdmap_decode crush len %d from off 0x%x\n", len,
- (int)(*p - start));
- ceph_decode_need(p, end, len, bad);
- map->crush = crush_decode(*p, end);
- *p += len;
+ ceph_decode_32_safe(p, end, len, e_inval);
+ map->crush = crush_decode(*p, min(*p + len, end));
if (IS_ERR(map->crush)) {
err = PTR_ERR(map->crush);
map->crush = NULL;
goto bad;
}
+ *p += len;
- /* ignore the rest of the map */
+ /* ignore the rest */
*p = end;
- dout("osdmap_decode done %p %p\n", *p, end);
- return map;
+ dout("full osdmap epoch %d max_osd %d\n", map->epoch, map->max_osd);
+ return 0;
+e_inval:
+ err = -EINVAL;
bad:
- dout("osdmap_decode fail err %d\n", err);
- ceph_osdmap_destroy(map);
- return ERR_PTR(err);
+ pr_err("corrupt full osdmap (%d) epoch %d off %d (%p of %p-%p)\n",
+ err, epoch, (int)(*p - start), *p, start, end);
+ print_hex_dump(KERN_DEBUG, "osdmap: ",
+ DUMP_PREFIX_OFFSET, 16, 1,
+ start, end - start, true);
+ return err;
+}
+
+/*
+ * Allocate and decode a full map.
+ */
+struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end)
+{
+ struct ceph_osdmap *map;
+ int ret;
+
+ map = kzalloc(sizeof(*map), GFP_NOFS);
+ if (!map)
+ return ERR_PTR(-ENOMEM);
+
+ map->pg_temp = RB_ROOT;
+ map->primary_temp = RB_ROOT;
+ mutex_init(&map->crush_scratch_mutex);
+
+ ret = osdmap_decode(p, end, map);
+ if (ret) {
+ ceph_osdmap_destroy(map);
+ return ERR_PTR(ret);
+ }
+
+ return map;
}
/*
@@ -840,17 +1177,18 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
__s64 new_pool_max;
__s32 new_flags, max;
void *start = *p;
- int err = -EINVAL;
- u16 version;
+ int err;
+ u8 struct_v;
+
+ dout("%s %p to %p len %d\n", __func__, *p, end, (int)(end - *p));
- ceph_decode_16_safe(p, end, version, bad);
- if (version != 6) {
- pr_warning("got unknown v %d != 6 of inc osdmap\n", version);
+ err = get_osdmap_client_data_v(p, end, "inc", &struct_v);
+ if (err)
goto bad;
- }
- ceph_decode_need(p, end, sizeof(fsid)+sizeof(modified)+2*sizeof(u32),
- bad);
+ /* fsid, epoch, modified, new_pool_max, new_flags */
+ ceph_decode_need(p, end, sizeof(fsid) + sizeof(u32) + sizeof(modified) +
+ sizeof(u64) + sizeof(u32), e_inval);
ceph_decode_copy(p, &fsid, sizeof(fsid));
epoch = ceph_decode_32(p);
BUG_ON(epoch != map->epoch+1);
@@ -859,21 +1197,22 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
new_flags = ceph_decode_32(p);
/* full map? */
- ceph_decode_32_safe(p, end, len, bad);
+ ceph_decode_32_safe(p, end, len, e_inval);
if (len > 0) {
dout("apply_incremental full map len %d, %p to %p\n",
len, *p, end);
- return osdmap_decode(p, min(*p+len, end));
+ return ceph_osdmap_decode(p, min(*p+len, end));
}
/* new crush? */
- ceph_decode_32_safe(p, end, len, bad);
+ ceph_decode_32_safe(p, end, len, e_inval);
if (len > 0) {
- dout("apply_incremental new crush map len %d, %p to %p\n",
- len, *p, end);
newcrush = crush_decode(*p, min(*p+len, end));
- if (IS_ERR(newcrush))
- return ERR_CAST(newcrush);
+ if (IS_ERR(newcrush)) {
+ err = PTR_ERR(newcrush);
+ newcrush = NULL;
+ goto bad;
+ }
*p += len;
}
@@ -883,13 +1222,11 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
if (new_pool_max >= 0)
map->pool_max = new_pool_max;
- ceph_decode_need(p, end, 5*sizeof(u32), bad);
-
/* new max? */
- max = ceph_decode_32(p);
+ ceph_decode_32_safe(p, end, max, e_inval);
if (max >= 0) {
err = osdmap_set_max_osd(map, max);
- if (err < 0)
+ if (err)
goto bad;
}
@@ -902,51 +1239,34 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
newcrush = NULL;
}
- /* new_pool */
- ceph_decode_32_safe(p, end, len, bad);
- while (len--) {
- struct ceph_pg_pool_info *pi;
+ /* new_pools */
+ err = decode_new_pools(p, end, map);
+ if (err)
+ goto bad;
- ceph_decode_64_safe(p, end, pool, bad);
- pi = __lookup_pg_pool(&map->pg_pools, pool);
- if (!pi) {
- pi = kzalloc(sizeof(*pi), GFP_NOFS);
- if (!pi) {
- err = -ENOMEM;
- goto bad;
- }
- pi->id = pool;
- __insert_pg_pool(&map->pg_pools, pi);
- }
- err = __decode_pool(p, end, pi);
- if (err < 0)
- goto bad;
- }
- if (version >= 5) {
- err = __decode_pool_names(p, end, map);
- if (err < 0)
- goto bad;
- }
+ /* new_pool_names */
+ err = decode_pool_names(p, end, map);
+ if (err)
+ goto bad;
/* old_pool */
- ceph_decode_32_safe(p, end, len, bad);
+ ceph_decode_32_safe(p, end, len, e_inval);
while (len--) {
struct ceph_pg_pool_info *pi;
- ceph_decode_64_safe(p, end, pool, bad);
+ ceph_decode_64_safe(p, end, pool, e_inval);
pi = __lookup_pg_pool(&map->pg_pools, pool);
if (pi)
__remove_pg_pool(&map->pg_pools, pi);
}
/* new_up */
- err = -EINVAL;
- ceph_decode_32_safe(p, end, len, bad);
+ ceph_decode_32_safe(p, end, len, e_inval);
while (len--) {
u32 osd;
struct ceph_entity_addr addr;
- ceph_decode_32_safe(p, end, osd, bad);
- ceph_decode_copy_safe(p, end, &addr, sizeof(addr), bad);
+ ceph_decode_32_safe(p, end, osd, e_inval);
+ ceph_decode_copy_safe(p, end, &addr, sizeof(addr), e_inval);
ceph_decode_addr(&addr);
pr_info("osd%d up\n", osd);
BUG_ON(osd >= map->max_osd);
@@ -955,11 +1275,11 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
}
/* new_state */
- ceph_decode_32_safe(p, end, len, bad);
+ ceph_decode_32_safe(p, end, len, e_inval);
while (len--) {
u32 osd;
u8 xorstate;
- ceph_decode_32_safe(p, end, osd, bad);
+ ceph_decode_32_safe(p, end, osd, e_inval);
xorstate = **(u8 **)p;
(*p)++; /* clean flag */
if (xorstate == 0)
@@ -971,10 +1291,10 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
}
/* new_weight */
- ceph_decode_32_safe(p, end, len, bad);
+ ceph_decode_32_safe(p, end, len, e_inval);
while (len--) {
u32 osd, off;
- ceph_decode_need(p, end, sizeof(u32)*2, bad);
+ ceph_decode_need(p, end, sizeof(u32)*2, e_inval);
osd = ceph_decode_32(p);
off = ceph_decode_32(p);
pr_info("osd%d weight 0x%x %s\n", osd, off,
@@ -985,56 +1305,35 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
}
/* new_pg_temp */
- ceph_decode_32_safe(p, end, len, bad);
- while (len--) {
- struct ceph_pg_mapping *pg;
- int j;
- struct ceph_pg pgid;
- u32 pglen;
+ err = decode_new_pg_temp(p, end, map);
+ if (err)
+ goto bad;
- err = ceph_decode_pgid(p, end, &pgid);
+ /* new_primary_temp */
+ if (struct_v >= 1) {
+ err = decode_new_primary_temp(p, end, map);
if (err)
goto bad;
- ceph_decode_need(p, end, sizeof(u32), bad);
- pglen = ceph_decode_32(p);
- if (pglen) {
- ceph_decode_need(p, end, pglen*sizeof(u32), bad);
-
- /* removing existing (if any) */
- (void) __remove_pg_mapping(&map->pg_temp, pgid);
+ }
- /* insert */
- err = -EINVAL;
- if (pglen > (UINT_MAX - sizeof(*pg)) / sizeof(u32))
- goto bad;
- err = -ENOMEM;
- pg = kmalloc(sizeof(*pg) + sizeof(u32)*pglen, GFP_NOFS);
- if (!pg)
- goto bad;
- pg->pgid = pgid;
- pg->len = pglen;
- for (j = 0; j < pglen; j++)
- pg->osds[j] = ceph_decode_32(p);
- err = __insert_pg_mapping(pg, &map->pg_temp);
- if (err) {
- kfree(pg);
- goto bad;
- }
- dout(" added pg_temp %lld.%x len %d\n", pgid.pool,
- pgid.seed, pglen);
- } else {
- /* remove */
- __remove_pg_mapping(&map->pg_temp, pgid);
- }
+ /* new_primary_affinity */
+ if (struct_v >= 2) {
+ err = decode_new_primary_affinity(p, end, map);
+ if (err)
+ goto bad;
}
/* ignore the rest */
*p = end;
+
+ dout("inc osdmap epoch %d max_osd %d\n", map->epoch, map->max_osd);
return map;
+e_inval:
+ err = -EINVAL;
bad:
- pr_err("corrupt inc osdmap epoch %d off %d (%p of %p-%p)\n",
- epoch, (int)(*p - start), *p, start, end);
+ pr_err("corrupt inc osdmap (%d) epoch %d off %d (%p of %p-%p)\n",
+ err, epoch, (int)(*p - start), *p, start, end);
print_hex_dump(KERN_DEBUG, "osdmap: ",
DUMP_PREFIX_OFFSET, 16, 1,
start, end - start, true);
@@ -1142,61 +1441,249 @@ int ceph_oloc_oid_to_pg(struct ceph_osdmap *osdmap,
}
EXPORT_SYMBOL(ceph_oloc_oid_to_pg);
-static int crush_do_rule_ary(const struct crush_map *map, int ruleno, int x,
- int *result, int result_max,
- const __u32 *weight, int weight_max)
+static int do_crush(struct ceph_osdmap *map, int ruleno, int x,
+ int *result, int result_max,
+ const __u32 *weight, int weight_max)
{
- int scratch[result_max * 3];
+ int r;
- return crush_do_rule(map, ruleno, x, result, result_max,
- weight, weight_max, scratch);
+ BUG_ON(result_max > CEPH_PG_MAX_SIZE);
+
+ mutex_lock(&map->crush_scratch_mutex);
+ r = crush_do_rule(map->crush, ruleno, x, result, result_max,
+ weight, weight_max, map->crush_scratch_ary);
+ mutex_unlock(&map->crush_scratch_mutex);
+
+ return r;
}
/*
- * Calculate raw osd vector for the given pgid. Return pointer to osd
- * array, or NULL on failure.
+ * Calculate raw (crush) set for given pgid.
+ *
+ * Return raw set length, or error.
*/
-static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
- int *osds, int *num)
+static int pg_to_raw_osds(struct ceph_osdmap *osdmap,
+ struct ceph_pg_pool_info *pool,
+ struct ceph_pg pgid, u32 pps, int *osds)
{
- struct ceph_pg_mapping *pg;
- struct ceph_pg_pool_info *pool;
int ruleno;
- int r;
- u32 pps;
+ int len;
- pool = __lookup_pg_pool(&osdmap->pg_pools, pgid.pool);
- if (!pool)
- return NULL;
+ /* crush */
+ ruleno = crush_find_rule(osdmap->crush, pool->crush_ruleset,
+ pool->type, pool->size);
+ if (ruleno < 0) {
+ pr_err("no crush rule: pool %lld ruleset %d type %d size %d\n",
+ pgid.pool, pool->crush_ruleset, pool->type,
+ pool->size);
+ return -ENOENT;
+ }
- /* pg_temp? */
+ len = do_crush(osdmap, ruleno, pps, osds,
+ min_t(int, pool->size, CEPH_PG_MAX_SIZE),
+ osdmap->osd_weight, osdmap->max_osd);
+ if (len < 0) {
+ pr_err("error %d from crush rule %d: pool %lld ruleset %d type %d size %d\n",
+ len, ruleno, pgid.pool, pool->crush_ruleset,
+ pool->type, pool->size);
+ return len;
+ }
+
+ return len;
+}
+
+/*
+ * Given raw set, calculate up set and up primary.
+ *
+ * Return up set length. *primary is set to up primary osd id, or -1
+ * if up set is empty.
+ */
+static int raw_to_up_osds(struct ceph_osdmap *osdmap,
+ struct ceph_pg_pool_info *pool,
+ int *osds, int len, int *primary)
+{
+ int up_primary = -1;
+ int i;
+
+ if (ceph_can_shift_osds(pool)) {
+ int removed = 0;
+
+ for (i = 0; i < len; i++) {
+ if (ceph_osd_is_down(osdmap, osds[i])) {
+ removed++;
+ continue;
+ }
+ if (removed)
+ osds[i - removed] = osds[i];
+ }
+
+ len -= removed;
+ if (len > 0)
+ up_primary = osds[0];
+ } else {
+ for (i = len - 1; i >= 0; i--) {
+ if (ceph_osd_is_down(osdmap, osds[i]))
+ osds[i] = CRUSH_ITEM_NONE;
+ else
+ up_primary = osds[i];
+ }
+ }
+
+ *primary = up_primary;
+ return len;
+}
+
+static void apply_primary_affinity(struct ceph_osdmap *osdmap, u32 pps,
+ struct ceph_pg_pool_info *pool,
+ int *osds, int len, int *primary)
+{
+ int i;
+ int pos = -1;
+
+ /*
+ * Do we have any non-default primary_affinity values for these
+ * osds?
+ */
+ if (!osdmap->osd_primary_affinity)
+ return;
+
+ for (i = 0; i < len; i++) {
+ if (osds[i] != CRUSH_ITEM_NONE &&
+ osdmap->osd_primary_affinity[i] !=
+ CEPH_OSD_DEFAULT_PRIMARY_AFFINITY) {
+ break;
+ }
+ }
+ if (i == len)
+ return;
+
+ /*
+ * Pick the primary. Feed both the seed (for the pg) and the
+ * osd into the hash/rng so that a proportional fraction of an
+ * osd's pgs get rejected as primary.
+ */
+ for (i = 0; i < len; i++) {
+ int osd;
+ u32 aff;
+
+ osd = osds[i];
+ if (osd == CRUSH_ITEM_NONE)
+ continue;
+
+ aff = osdmap->osd_primary_affinity[osd];
+ if (aff < CEPH_OSD_MAX_PRIMARY_AFFINITY &&
+ (crush_hash32_2(CRUSH_HASH_RJENKINS1,
+ pps, osd) >> 16) >= aff) {
+ /*
+ * We chose not to use this primary. Note it
+ * anyway as a fallback in case we don't pick
+ * anyone else, but keep looking.
+ */
+ if (pos < 0)
+ pos = i;
+ } else {
+ pos = i;
+ break;
+ }
+ }
+ if (pos < 0)
+ return;
+
+ *primary = osds[pos];
+
+ if (ceph_can_shift_osds(pool) && pos > 0) {
+ /* move the new primary to the front */
+ for (i = pos; i > 0; i--)
+ osds[i] = osds[i - 1];
+ osds[0] = *primary;
+ }
+}
+
+/*
+ * Given up set, apply pg_temp and primary_temp mappings.
+ *
+ * Return acting set length. *primary is set to acting primary osd id,
+ * or -1 if acting set is empty.
+ */
+static int apply_temps(struct ceph_osdmap *osdmap,
+ struct ceph_pg_pool_info *pool, struct ceph_pg pgid,
+ int *osds, int len, int *primary)
+{
+ struct ceph_pg_mapping *pg;
+ int temp_len;
+ int temp_primary;
+ int i;
+
+ /* raw_pg -> pg */
pgid.seed = ceph_stable_mod(pgid.seed, pool->pg_num,
pool->pg_num_mask);
+
+ /* pg_temp? */
pg = __lookup_pg_mapping(&osdmap->pg_temp, pgid);
if (pg) {
- *num = pg->len;
- return pg->osds;
+ temp_len = 0;
+ temp_primary = -1;
+
+ for (i = 0; i < pg->pg_temp.len; i++) {
+ if (ceph_osd_is_down(osdmap, pg->pg_temp.osds[i])) {
+ if (ceph_can_shift_osds(pool))
+ continue;
+ else
+ osds[temp_len++] = CRUSH_ITEM_NONE;
+ } else {
+ osds[temp_len++] = pg->pg_temp.osds[i];
+ }
+ }
+
+ /* apply pg_temp's primary */
+ for (i = 0; i < temp_len; i++) {
+ if (osds[i] != CRUSH_ITEM_NONE) {
+ temp_primary = osds[i];
+ break;
+ }
+ }
+ } else {
+ temp_len = len;
+ temp_primary = *primary;
}
- /* crush */
- ruleno = crush_find_rule(osdmap->crush, pool->crush_ruleset,
- pool->type, pool->size);
- if (ruleno < 0) {
- pr_err("no crush rule pool %lld ruleset %d type %d size %d\n",
- pgid.pool, pool->crush_ruleset, pool->type,
- pool->size);
- return NULL;
+ /* primary_temp? */
+ pg = __lookup_pg_mapping(&osdmap->primary_temp, pgid);
+ if (pg)
+ temp_primary = pg->primary_temp.osd;
+
+ *primary = temp_primary;
+ return temp_len;
+}
+
+/*
+ * Calculate acting set for given pgid.
+ *
+ * Return acting set length, or error. *primary is set to acting
+ * primary osd id, or -1 if acting set is empty or on error.
+ */
+int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
+ int *osds, int *primary)
+{
+ struct ceph_pg_pool_info *pool;
+ u32 pps;
+ int len;
+
+ pool = __lookup_pg_pool(&osdmap->pg_pools, pgid.pool);
+ if (!pool) {
+ *primary = -1;
+ return -ENOENT;
}
if (pool->flags & CEPH_POOL_FLAG_HASHPSPOOL) {
- /* hash pool id and seed sothat pool PGs do not overlap */
+ /* hash pool id and seed so that pool PGs do not overlap */
pps = crush_hash32_2(CRUSH_HASH_RJENKINS1,
ceph_stable_mod(pgid.seed, pool->pgp_num,
pool->pgp_num_mask),
pgid.pool);
} else {
/*
- * legacy ehavior: add ps and pool together. this is
+ * legacy behavior: add ps and pool together. this is
* not a great approach because the PGs from each pool
* will overlap on top of each other: 0.5 == 1.4 ==
* 2.3 == ...
@@ -1205,38 +1692,20 @@ static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
pool->pgp_num_mask) +
(unsigned)pgid.pool;
}
- r = crush_do_rule_ary(osdmap->crush, ruleno, pps,
- osds, min_t(int, pool->size, *num),
- osdmap->osd_weight, osdmap->max_osd);
- if (r < 0) {
- pr_err("error %d from crush rule: pool %lld ruleset %d type %d"
- " size %d\n", r, pgid.pool, pool->crush_ruleset,
- pool->type, pool->size);
- return NULL;
+
+ len = pg_to_raw_osds(osdmap, pool, pgid, pps, osds);
+ if (len < 0) {
+ *primary = -1;
+ return len;
}
- *num = r;
- return osds;
-}
-/*
- * Return acting set for given pgid.
- */
-int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
- int *acting)
-{
- int rawosds[CEPH_PG_MAX_SIZE], *osds;
- int i, o, num = CEPH_PG_MAX_SIZE;
+ len = raw_to_up_osds(osdmap, pool, osds, len, primary);
- osds = calc_pg_raw(osdmap, pgid, rawosds, &num);
- if (!osds)
- return -1;
+ apply_primary_affinity(osdmap, pps, pool, osds, len, primary);
- /* primary is first up osd */
- o = 0;
- for (i = 0; i < num; i++)
- if (ceph_osd_is_up(osdmap, osds[i]))
- acting[o++] = osds[i];
- return o;
+ len = apply_temps(osdmap, pool, pgid, osds, len, primary);
+
+ return len;
}
/*
@@ -1244,17 +1713,11 @@ int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
*/
int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, struct ceph_pg pgid)
{
- int rawosds[CEPH_PG_MAX_SIZE], *osds;
- int i, num = CEPH_PG_MAX_SIZE;
+ int osds[CEPH_PG_MAX_SIZE];
+ int primary;
- osds = calc_pg_raw(osdmap, pgid, rawosds, &num);
- if (!osds)
- return -1;
+ ceph_calc_pg_acting(osdmap, pgid, osds, &primary);
- /* primary is first up osd */
- for (i = 0; i < num; i++)
- if (ceph_osd_is_up(osdmap, osds[i]))
- return osds[i];
- return -1;
+ return primary;
}
EXPORT_SYMBOL(ceph_calc_pg_primary);
diff --git a/net/core/dev.c b/net/core/dev.c
index 14dac0654f28..5b3042e69f85 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2284,7 +2284,7 @@ EXPORT_SYMBOL(skb_checksum_help);
__be16 skb_network_protocol(struct sk_buff *skb, int *depth)
{
__be16 type = skb->protocol;
- int vlan_depth = ETH_HLEN;
+ int vlan_depth = skb->mac_len;
/* Tunnel gso handlers can set protocol to ethernet. */
if (type == htons(ETH_P_TEB)) {
diff --git a/net/core/filter.c b/net/core/filter.c
index e08b3822c72a..cd58614660cf 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -600,6 +600,9 @@ static u64 __skb_get_nlattr(u64 ctx, u64 A, u64 X, u64 r4, u64 r5)
if (skb_is_nonlinear(skb))
return 0;
+ if (skb->len < sizeof(struct nlattr))
+ return 0;
+
if (A > skb->len - sizeof(struct nlattr))
return 0;
@@ -618,11 +621,14 @@ static u64 __skb_get_nlattr_nest(u64 ctx, u64 A, u64 X, u64 r4, u64 r5)
if (skb_is_nonlinear(skb))
return 0;
+ if (skb->len < sizeof(struct nlattr))
+ return 0;
+
if (A > skb->len - sizeof(struct nlattr))
return 0;
nla = (struct nlattr *) &skb->data[A];
- if (nla->nla_len > A - skb->len)
+ if (nla->nla_len > skb->len - A)
return 0;
nla = nla_find_nested(nla, X);
@@ -1737,7 +1743,6 @@ void sk_decode_filter(struct sock_filter *filt, struct sock_filter *to)
[BPF_S_ANC_RXHASH] = BPF_LD|BPF_B|BPF_ABS,
[BPF_S_ANC_CPU] = BPF_LD|BPF_B|BPF_ABS,
[BPF_S_ANC_ALU_XOR_X] = BPF_LD|BPF_B|BPF_ABS,
- [BPF_S_ANC_SECCOMP_LD_W] = BPF_LD|BPF_B|BPF_ABS,
[BPF_S_ANC_VLAN_TAG] = BPF_LD|BPF_B|BPF_ABS,
[BPF_S_ANC_VLAN_TAG_PRESENT] = BPF_LD|BPF_B|BPF_ABS,
[BPF_S_ANC_PAY_OFFSET] = BPF_LD|BPF_B|BPF_ABS,
diff --git a/net/core/flow.c b/net/core/flow.c
index 31cfb365e0c6..a0348fde1fdf 100644
--- a/net/core/flow.c
+++ b/net/core/flow.c
@@ -455,6 +455,8 @@ int flow_cache_init(struct net *net)
if (!fc->percpu)
return -ENOMEM;
+ cpu_notifier_register_begin();
+
for_each_online_cpu(i) {
if (flow_cache_cpu_prepare(fc, i))
goto err;
@@ -462,7 +464,9 @@ int flow_cache_init(struct net *net)
fc->hotcpu_notifier = (struct notifier_block){
.notifier_call = flow_cache_cpu,
};
- register_hotcpu_notifier(&fc->hotcpu_notifier);
+ __register_hotcpu_notifier(&fc->hotcpu_notifier);
+
+ cpu_notifier_register_done();
setup_timer(&fc->rnd_timer, flow_cache_new_hashrnd,
(unsigned long) fc);
@@ -478,6 +482,8 @@ err:
fcp->hash_table = NULL;
}
+ cpu_notifier_register_done();
+
free_percpu(fc->percpu);
fc->percpu = NULL;
diff --git a/net/core/netclassid_cgroup.c b/net/core/netclassid_cgroup.c
index 719efd541668..22931e1b99b4 100644
--- a/net/core/netclassid_cgroup.c
+++ b/net/core/netclassid_cgroup.c
@@ -23,7 +23,7 @@ static inline struct cgroup_cls_state *css_cls_state(struct cgroup_subsys_state
struct cgroup_cls_state *task_cls_state(struct task_struct *p)
{
- return css_cls_state(task_css(p, net_cls_subsys_id));
+ return css_cls_state(task_css(p, net_cls_cgrp_id));
}
EXPORT_SYMBOL_GPL(task_cls_state);
@@ -73,7 +73,7 @@ static void cgrp_attach(struct cgroup_subsys_state *css,
void *v = (void *)(unsigned long)cs->classid;
struct task_struct *p;
- cgroup_taskset_for_each(p, css, tset) {
+ cgroup_taskset_for_each(p, tset) {
task_lock(p);
iterate_fd(p->files, 0, update_classid, v);
task_unlock(p);
@@ -102,19 +102,10 @@ static struct cftype ss_files[] = {
{ } /* terminate */
};
-struct cgroup_subsys net_cls_subsys = {
- .name = "net_cls",
+struct cgroup_subsys net_cls_cgrp_subsys = {
.css_alloc = cgrp_css_alloc,
.css_online = cgrp_css_online,
.css_free = cgrp_css_free,
.attach = cgrp_attach,
- .subsys_id = net_cls_subsys_id,
.base_cftypes = ss_files,
- .module = THIS_MODULE,
};
-
-static int __init init_netclassid_cgroup(void)
-{
- return cgroup_load_subsys(&net_cls_subsys);
-}
-__initcall(init_netclassid_cgroup);
diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c
index 9043caedcd08..3825f669147b 100644
--- a/net/core/netprio_cgroup.c
+++ b/net/core/netprio_cgroup.c
@@ -186,7 +186,7 @@ static int read_priomap(struct seq_file *sf, void *v)
}
static int write_priomap(struct cgroup_subsys_state *css, struct cftype *cft,
- const char *buffer)
+ char *buffer)
{
char devname[IFNAMSIZ + 1];
struct net_device *dev;
@@ -224,7 +224,7 @@ static void net_prio_attach(struct cgroup_subsys_state *css,
struct task_struct *p;
void *v = (void *)(unsigned long)css->cgroup->id;
- cgroup_taskset_for_each(p, css, tset) {
+ cgroup_taskset_for_each(p, tset) {
task_lock(p);
iterate_fd(p->files, 0, update_netprio, v);
task_unlock(p);
@@ -244,15 +244,12 @@ static struct cftype ss_files[] = {
{ } /* terminate */
};
-struct cgroup_subsys net_prio_subsys = {
- .name = "net_prio",
+struct cgroup_subsys net_prio_cgrp_subsys = {
.css_alloc = cgrp_css_alloc,
.css_online = cgrp_css_online,
.css_free = cgrp_css_free,
.attach = net_prio_attach,
- .subsys_id = net_prio_subsys_id,
.base_cftypes = ss_files,
- .module = THIS_MODULE,
};
static int netprio_device_event(struct notifier_block *unused,
@@ -283,37 +280,9 @@ static struct notifier_block netprio_device_notifier = {
static int __init init_cgroup_netprio(void)
{
- int ret;
-
- ret = cgroup_load_subsys(&net_prio_subsys);
- if (ret)
- goto out;
-
register_netdevice_notifier(&netprio_device_notifier);
-
-out:
- return ret;
-}
-
-static void __exit exit_cgroup_netprio(void)
-{
- struct netprio_map *old;
- struct net_device *dev;
-
- unregister_netdevice_notifier(&netprio_device_notifier);
-
- cgroup_unload_subsys(&net_prio_subsys);
-
- rtnl_lock();
- for_each_netdev(&init_net, dev) {
- old = rtnl_dereference(dev->priomap);
- RCU_INIT_POINTER(dev->priomap, NULL);
- if (old)
- kfree_rcu(old, rcu);
- }
- rtnl_unlock();
+ return 0;
}
-module_init(init_cgroup_netprio);
-module_exit(exit_cgroup_netprio);
+subsys_initcall(init_cgroup_netprio);
MODULE_LICENSE("GPL v2");
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index d068ec25db1e..0304f981f7ff 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -3338,7 +3338,9 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev)
queue_map = skb_get_queue_mapping(pkt_dev->skb);
txq = netdev_get_tx_queue(odev, queue_map);
- __netif_tx_lock_bh(txq);
+ local_bh_disable();
+
+ HARD_TX_LOCK(odev, txq, smp_processor_id());
if (unlikely(netif_xmit_frozen_or_drv_stopped(txq))) {
ret = NETDEV_TX_BUSY;
@@ -3374,7 +3376,9 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev)
pkt_dev->last_ok = 0;
}
unlock:
- __netif_tx_unlock_bh(txq);
+ HARD_TX_UNLOCK(odev, txq);
+
+ local_bh_enable();
/* If pkt_dev->count is zero, then run forever */
if ((pkt_dev->count != 0) && (pkt_dev->sofar >= pkt_dev->count)) {
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 30c7d35dd862..1b62343f5837 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -3458,8 +3458,6 @@ static void sock_rmem_free(struct sk_buff *skb)
*/
int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb)
{
- int len = skb->len;
-
if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
(unsigned int)sk->sk_rcvbuf)
return -ENOMEM;
@@ -3474,7 +3472,7 @@ int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb)
skb_queue_tail(&sk->sk_error_queue, skb);
if (!sock_flag(sk, SOCK_DEAD))
- sk->sk_data_ready(sk, len);
+ sk->sk_data_ready(sk);
return 0;
}
EXPORT_SYMBOL(sock_queue_err_skb);
@@ -3937,12 +3935,14 @@ EXPORT_SYMBOL_GPL(skb_scrub_packet);
unsigned int skb_gso_transport_seglen(const struct sk_buff *skb)
{
const struct skb_shared_info *shinfo = skb_shinfo(skb);
- unsigned int hdr_len;
if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
- hdr_len = tcp_hdrlen(skb);
- else
- hdr_len = sizeof(struct udphdr);
- return hdr_len + shinfo->gso_size;
+ return tcp_hdrlen(skb) + shinfo->gso_size;
+
+ /* UFO sets gso_size to the size of the fragmentation
+ * payload, i.e. the size of the L4 (UDP) header is already
+ * accounted for.
+ */
+ return shinfo->gso_size;
}
EXPORT_SYMBOL_GPL(skb_gso_transport_seglen);
diff --git a/net/core/sock.c b/net/core/sock.c
index c0fc6bdad1e3..b4fff008136f 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -428,7 +428,7 @@ int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
spin_unlock_irqrestore(&list->lock, flags);
if (!sock_flag(sk, SOCK_DEAD))
- sk->sk_data_ready(sk, skb_len);
+ sk->sk_data_ready(sk);
return 0;
}
EXPORT_SYMBOL(sock_queue_rcv_skb);
@@ -2196,7 +2196,7 @@ static void sock_def_error_report(struct sock *sk)
rcu_read_unlock();
}
-static void sock_def_readable(struct sock *sk, int len)
+static void sock_def_readable(struct sock *sk)
{
struct socket_wq *wq;
diff --git a/net/dccp/input.c b/net/dccp/input.c
index 14cdafad7a90..3c8ec7d4a34e 100644
--- a/net/dccp/input.c
+++ b/net/dccp/input.c
@@ -28,7 +28,7 @@ static void dccp_enqueue_skb(struct sock *sk, struct sk_buff *skb)
__skb_pull(skb, dccp_hdr(skb)->dccph_doff * 4);
__skb_queue_tail(&sk->sk_receive_queue, skb);
skb_set_owner_r(skb, sk);
- sk->sk_data_ready(sk, 0);
+ sk->sk_data_ready(sk);
}
static void dccp_fin(struct sock *sk, struct sk_buff *skb)
diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c
index 9e2f78bc1553..c69eb9c4fbb8 100644
--- a/net/dccp/minisocks.c
+++ b/net/dccp/minisocks.c
@@ -237,7 +237,7 @@ int dccp_child_process(struct sock *parent, struct sock *child,
/* Wakeup parent, send SIGIO */
if (state == DCCP_RESPOND && child->sk_state != state)
- parent->sk_data_ready(parent, 0);
+ parent->sk_data_ready(parent);
} else {
/* Alas, it is possible again, because we do lookup
* in main socket hash table and lock on listening
diff --git a/net/decnet/dn_nsp_in.c b/net/decnet/dn_nsp_in.c
index c344163e6ac0..fe5f01485d33 100644
--- a/net/decnet/dn_nsp_in.c
+++ b/net/decnet/dn_nsp_in.c
@@ -585,7 +585,6 @@ out:
static __inline__ int dn_queue_skb(struct sock *sk, struct sk_buff *skb, int sig, struct sk_buff_head *queue)
{
int err;
- int skb_len;
/* Cast skb->rcvbuf to unsigned... It's pointless, but reduces
number of warnings when compiling with -W --ANK
@@ -600,12 +599,11 @@ static __inline__ int dn_queue_skb(struct sock *sk, struct sk_buff *skb, int sig
if (err)
goto out;
- skb_len = skb->len;
skb_set_owner_r(skb, sk);
skb_queue_tail(queue, skb);
if (!sock_flag(sk, SOCK_DEAD))
- sk->sk_data_ready(sk, skb_len);
+ sk->sk_data_ready(sk);
out:
return err;
}
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index ec4f762efda5..94213c891565 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -463,6 +463,7 @@ static const struct net_device_ops ipgre_netdev_ops = {
static void ipgre_tunnel_setup(struct net_device *dev)
{
dev->netdev_ops = &ipgre_netdev_ops;
+ dev->type = ARPHRD_IPGRE;
ip_tunnel_setup(dev, ipgre_net_id);
}
@@ -501,7 +502,6 @@ static int ipgre_tunnel_init(struct net_device *dev)
memcpy(dev->dev_addr, &iph->saddr, 4);
memcpy(dev->broadcast, &iph->daddr, 4);
- dev->type = ARPHRD_IPGRE;
dev->flags = IFF_NOARP;
dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
dev->addr_len = 4;
diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c
index 687ddef4e574..afcee51b90ed 100644
--- a/net/ipv4/ip_vti.c
+++ b/net/ipv4/ip_vti.c
@@ -337,6 +337,7 @@ static const struct net_device_ops vti_netdev_ops = {
static void vti_tunnel_setup(struct net_device *dev)
{
dev->netdev_ops = &vti_netdev_ops;
+ dev->type = ARPHRD_TUNNEL;
ip_tunnel_setup(dev, vti_net_id);
}
@@ -348,7 +349,6 @@ static int vti_tunnel_init(struct net_device *dev)
memcpy(dev->dev_addr, &iph->saddr, 4);
memcpy(dev->broadcast, &iph->daddr, 4);
- dev->type = ARPHRD_TUNNEL;
dev->hard_header_len = LL_MAX_HEADER + sizeof(struct iphdr);
dev->mtu = ETH_DATA_LEN;
dev->flags = IFF_NOARP;
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index f4b19e5dde54..8210964a9f19 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -252,26 +252,33 @@ int ping_init_sock(struct sock *sk)
{
struct net *net = sock_net(sk);
kgid_t group = current_egid();
- struct group_info *group_info = get_current_groups();
- int i, j, count = group_info->ngroups;
+ struct group_info *group_info;
+ int i, j, count;
kgid_t low, high;
+ int ret = 0;
inet_get_ping_group_range_net(net, &low, &high);
if (gid_lte(low, group) && gid_lte(group, high))
return 0;
+ group_info = get_current_groups();
+ count = group_info->ngroups;
for (i = 0; i < group_info->nblocks; i++) {
int cp_count = min_t(int, NGROUPS_PER_BLOCK, count);
for (j = 0; j < cp_count; j++) {
kgid_t gid = group_info->blocks[i][j];
if (gid_lte(low, gid) && gid_lte(gid, high))
- return 0;
+ goto out_release_group;
}
count -= cp_count;
}
- return -EACCES;
+ ret = -EACCES;
+
+out_release_group:
+ put_group_info(group_info);
+ return ret;
}
EXPORT_SYMBOL_GPL(ping_init_sock);
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 1be9e990514d..20a59c388e6e 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -188,7 +188,7 @@ const __u8 ip_tos2prio[16] = {
EXPORT_SYMBOL(ip_tos2prio);
static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
-#define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
+#define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)
#ifdef CONFIG_PROC_FS
static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
@@ -2357,7 +2357,7 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
}
} else
#endif
- if (nla_put_u32(skb, RTA_IIF, rt->rt_iif))
+ if (nla_put_u32(skb, RTA_IIF, skb->dev->ifindex))
goto nla_put_failure;
}
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index e1661f46fd19..d6b46eb2f94c 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -4413,7 +4413,7 @@ queue_and_out:
if (eaten > 0)
kfree_skb_partial(skb, fragstolen);
if (!sock_flag(sk, SOCK_DEAD))
- sk->sk_data_ready(sk, 0);
+ sk->sk_data_ready(sk);
return;
}
@@ -4914,7 +4914,7 @@ static void tcp_urg(struct sock *sk, struct sk_buff *skb, const struct tcphdr *t
BUG();
tp->urg_data = TCP_URG_VALID | tmp;
if (!sock_flag(sk, SOCK_DEAD))
- sk->sk_data_ready(sk, 0);
+ sk->sk_data_ready(sk);
}
}
}
@@ -5000,11 +5000,11 @@ static bool tcp_dma_try_early_copy(struct sock *sk, struct sk_buff *skb,
(tcp_flag_word(tcp_hdr(skb)) & TCP_FLAG_PSH) ||
(atomic_read(&sk->sk_rmem_alloc) > (sk->sk_rcvbuf >> 1))) {
tp->ucopy.wakeup = 1;
- sk->sk_data_ready(sk, 0);
+ sk->sk_data_ready(sk);
}
} else if (chunk > 0) {
tp->ucopy.wakeup = 1;
- sk->sk_data_ready(sk, 0);
+ sk->sk_data_ready(sk);
}
out:
return copied_early;
@@ -5275,7 +5275,7 @@ no_ack:
#endif
if (eaten)
kfree_skb_partial(skb, fragstolen);
- sk->sk_data_ready(sk, 0);
+ sk->sk_data_ready(sk);
return;
}
}
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 6379894ec210..438f3b95143d 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1434,7 +1434,7 @@ static int tcp_v4_conn_req_fastopen(struct sock *sk,
tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
tp->syn_data_acked = 1;
}
- sk->sk_data_ready(sk, 0);
+ sk->sk_data_ready(sk);
bh_unlock_sock(child);
sock_put(child);
WARN_ON(req->sk == NULL);
diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c
index f7e522c558ba..d4f015ad6c84 100644
--- a/net/ipv4/tcp_memcontrol.c
+++ b/net/ipv4/tcp_memcontrol.c
@@ -103,7 +103,7 @@ static int tcp_update_limit(struct mem_cgroup *memcg, u64 val)
}
static int tcp_cgroup_write(struct cgroup_subsys_state *css, struct cftype *cft,
- const char *buffer)
+ char *buffer)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
unsigned long long val;
@@ -219,7 +219,7 @@ static struct cftype tcp_files[] = {
static int __init tcp_memcontrol_init(void)
{
- WARN_ON(cgroup_add_cftypes(&mem_cgroup_subsys, tcp_files));
+ WARN_ON(cgroup_add_cftypes(&memory_cgrp_subsys, tcp_files));
return 0;
}
__initcall(tcp_memcontrol_init);
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index ca788ada5bd3..05c1b155251d 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -745,7 +745,7 @@ int tcp_child_process(struct sock *parent, struct sock *child,
skb->len);
/* Wakeup parent, send SIGIO */
if (state == TCP_SYN_RECV && child->sk_state != state)
- parent->sk_data_ready(parent, 0);
+ parent->sk_data_ready(parent);
} else {
/* Alas, it is possible again, because we do lookup
* in main socket hash table and lock on listening
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 5015c50a5ba7..5ea462eacd9f 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1338,7 +1338,7 @@ static unsigned int ip6_mtu(const struct dst_entry *dst)
unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
if (mtu)
- return mtu;
+ goto out;
mtu = IPV6_MIN_MTU;
@@ -1348,7 +1348,8 @@ static unsigned int ip6_mtu(const struct dst_entry *dst)
mtu = idev->cnf.mtu6;
rcu_read_unlock();
- return mtu;
+out:
+ return min_t(unsigned int, mtu, IP6_MAX_MTU);
}
static struct dst_entry *icmp6_dst_gc_list;
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 5ca56cee2dae..e289830ed6e3 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -798,7 +798,7 @@ static void tcp_v6_send_response(struct sk_buff *skb, u32 seq, u32 ack, u32 win,
__tcp_v6_send_check(buff, &fl6.saddr, &fl6.daddr);
fl6.flowi6_proto = IPPROTO_TCP;
- if (rt6_need_strict(&fl6.daddr) || !oif)
+ if (rt6_need_strict(&fl6.daddr) && !oif)
fl6.flowi6_oif = inet6_iif(skb);
else
fl6.flowi6_oif = oif;
diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c
index a5e03119107a..01e77b0ae075 100644
--- a/net/iucv/af_iucv.c
+++ b/net/iucv/af_iucv.c
@@ -1757,7 +1757,7 @@ static int iucv_callback_connreq(struct iucv_path *path,
/* Wake up accept */
nsk->sk_state = IUCV_CONNECTED;
- sk->sk_data_ready(sk, 1);
+ sk->sk_data_ready(sk);
err = 0;
fail:
bh_unlock_sock(sk);
@@ -1968,7 +1968,7 @@ static int afiucv_hs_callback_syn(struct sock *sk, struct sk_buff *skb)
if (!err) {
iucv_accept_enqueue(sk, nsk);
nsk->sk_state = IUCV_CONNECTED;
- sk->sk_data_ready(sk, 1);
+ sk->sk_data_ready(sk);
} else
iucv_sock_kill(nsk);
bh_unlock_sock(sk);
diff --git a/net/iucv/iucv.c b/net/iucv/iucv.c
index cd5b8ec9be04..da787930df0a 100644
--- a/net/iucv/iucv.c
+++ b/net/iucv/iucv.c
@@ -621,6 +621,42 @@ static void iucv_disable(void)
put_online_cpus();
}
+static void free_iucv_data(int cpu)
+{
+ kfree(iucv_param_irq[cpu]);
+ iucv_param_irq[cpu] = NULL;
+ kfree(iucv_param[cpu]);
+ iucv_param[cpu] = NULL;
+ kfree(iucv_irq_data[cpu]);
+ iucv_irq_data[cpu] = NULL;
+}
+
+static int alloc_iucv_data(int cpu)
+{
+ /* Note: GFP_DMA used to get memory below 2G */
+ iucv_irq_data[cpu] = kmalloc_node(sizeof(struct iucv_irq_data),
+ GFP_KERNEL|GFP_DMA, cpu_to_node(cpu));
+ if (!iucv_irq_data[cpu])
+ goto out_free;
+
+ /* Allocate parameter blocks. */
+ iucv_param[cpu] = kmalloc_node(sizeof(union iucv_param),
+ GFP_KERNEL|GFP_DMA, cpu_to_node(cpu));
+ if (!iucv_param[cpu])
+ goto out_free;
+
+ iucv_param_irq[cpu] = kmalloc_node(sizeof(union iucv_param),
+ GFP_KERNEL|GFP_DMA, cpu_to_node(cpu));
+ if (!iucv_param_irq[cpu])
+ goto out_free;
+
+ return 0;
+
+out_free:
+ free_iucv_data(cpu);
+ return -ENOMEM;
+}
+
static int iucv_cpu_notify(struct notifier_block *self,
unsigned long action, void *hcpu)
{
@@ -630,38 +666,14 @@ static int iucv_cpu_notify(struct notifier_block *self,
switch (action) {
case CPU_UP_PREPARE:
case CPU_UP_PREPARE_FROZEN:
- iucv_irq_data[cpu] = kmalloc_node(sizeof(struct iucv_irq_data),
- GFP_KERNEL|GFP_DMA, cpu_to_node(cpu));
- if (!iucv_irq_data[cpu])
- return notifier_from_errno(-ENOMEM);
-
- iucv_param[cpu] = kmalloc_node(sizeof(union iucv_param),
- GFP_KERNEL|GFP_DMA, cpu_to_node(cpu));
- if (!iucv_param[cpu]) {
- kfree(iucv_irq_data[cpu]);
- iucv_irq_data[cpu] = NULL;
+ if (alloc_iucv_data(cpu))
return notifier_from_errno(-ENOMEM);
- }
- iucv_param_irq[cpu] = kmalloc_node(sizeof(union iucv_param),
- GFP_KERNEL|GFP_DMA, cpu_to_node(cpu));
- if (!iucv_param_irq[cpu]) {
- kfree(iucv_param[cpu]);
- iucv_param[cpu] = NULL;
- kfree(iucv_irq_data[cpu]);
- iucv_irq_data[cpu] = NULL;
- return notifier_from_errno(-ENOMEM);
- }
break;
case CPU_UP_CANCELED:
case CPU_UP_CANCELED_FROZEN:
case CPU_DEAD:
case CPU_DEAD_FROZEN:
- kfree(iucv_param_irq[cpu]);
- iucv_param_irq[cpu] = NULL;
- kfree(iucv_param[cpu]);
- iucv_param[cpu] = NULL;
- kfree(iucv_irq_data[cpu]);
- iucv_irq_data[cpu] = NULL;
+ free_iucv_data(cpu);
break;
case CPU_ONLINE:
case CPU_ONLINE_FROZEN:
@@ -2016,7 +2028,7 @@ static int __init iucv_init(void)
rc = iucv_query_maxconn();
if (rc)
goto out_ctl;
- rc = register_external_interrupt(0x4000, iucv_external_interrupt);
+ rc = register_external_irq(EXT_IRQ_IUCV, iucv_external_interrupt);
if (rc)
goto out_ctl;
iucv_root = root_device_register("iucv");
@@ -2025,33 +2037,20 @@ static int __init iucv_init(void)
goto out_int;
}
- for_each_online_cpu(cpu) {
- /* Note: GFP_DMA used to get memory below 2G */
- iucv_irq_data[cpu] = kmalloc_node(sizeof(struct iucv_irq_data),
- GFP_KERNEL|GFP_DMA, cpu_to_node(cpu));
- if (!iucv_irq_data[cpu]) {
- rc = -ENOMEM;
- goto out_free;
- }
+ cpu_notifier_register_begin();
- /* Allocate parameter blocks. */
- iucv_param[cpu] = kmalloc_node(sizeof(union iucv_param),
- GFP_KERNEL|GFP_DMA, cpu_to_node(cpu));
- if (!iucv_param[cpu]) {
- rc = -ENOMEM;
- goto out_free;
- }
- iucv_param_irq[cpu] = kmalloc_node(sizeof(union iucv_param),
- GFP_KERNEL|GFP_DMA, cpu_to_node(cpu));
- if (!iucv_param_irq[cpu]) {
+ for_each_online_cpu(cpu) {
+ if (alloc_iucv_data(cpu)) {
rc = -ENOMEM;
goto out_free;
}
-
}
- rc = register_hotcpu_notifier(&iucv_cpu_notifier);
+ rc = __register_hotcpu_notifier(&iucv_cpu_notifier);
if (rc)
goto out_free;
+
+ cpu_notifier_register_done();
+
rc = register_reboot_notifier(&iucv_reboot_notifier);
if (rc)
goto out_cpu;
@@ -2069,19 +2068,17 @@ static int __init iucv_init(void)
out_reboot:
unregister_reboot_notifier(&iucv_reboot_notifier);
out_cpu:
- unregister_hotcpu_notifier(&iucv_cpu_notifier);
+ cpu_notifier_register_begin();
+ __unregister_hotcpu_notifier(&iucv_cpu_notifier);
out_free:
- for_each_possible_cpu(cpu) {
- kfree(iucv_param_irq[cpu]);
- iucv_param_irq[cpu] = NULL;
- kfree(iucv_param[cpu]);
- iucv_param[cpu] = NULL;
- kfree(iucv_irq_data[cpu]);
- iucv_irq_data[cpu] = NULL;
- }
+ for_each_possible_cpu(cpu)
+ free_iucv_data(cpu);
+
+ cpu_notifier_register_done();
+
root_device_unregister(iucv_root);
out_int:
- unregister_external_interrupt(0x4000, iucv_external_interrupt);
+ unregister_external_irq(EXT_IRQ_IUCV, iucv_external_interrupt);
out_ctl:
ctl_clear_bit(0, 1);
out:
@@ -2105,18 +2102,14 @@ static void __exit iucv_exit(void)
kfree(p);
spin_unlock_irq(&iucv_queue_lock);
unregister_reboot_notifier(&iucv_reboot_notifier);
- unregister_hotcpu_notifier(&iucv_cpu_notifier);
- for_each_possible_cpu(cpu) {
- kfree(iucv_param_irq[cpu]);
- iucv_param_irq[cpu] = NULL;
- kfree(iucv_param[cpu]);
- iucv_param[cpu] = NULL;
- kfree(iucv_irq_data[cpu]);
- iucv_irq_data[cpu] = NULL;
- }
+ cpu_notifier_register_begin();
+ __unregister_hotcpu_notifier(&iucv_cpu_notifier);
+ for_each_possible_cpu(cpu)
+ free_iucv_data(cpu);
+ cpu_notifier_register_done();
root_device_unregister(iucv_root);
bus_unregister(&iucv_bus);
- unregister_external_interrupt(0x4000, iucv_external_interrupt);
+ unregister_external_irq(EXT_IRQ_IUCV, iucv_external_interrupt);
}
subsys_initcall(iucv_init);
diff --git a/net/key/af_key.c b/net/key/af_key.c
index e72589a8400d..f3c83073afc4 100644
--- a/net/key/af_key.c
+++ b/net/key/af_key.c
@@ -205,7 +205,7 @@ static int pfkey_broadcast_one(struct sk_buff *skb, struct sk_buff **skb2,
if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf) {
skb_set_owner_r(*skb2, sk);
skb_queue_tail(&sk->sk_receive_queue, *skb2);
- sk->sk_data_ready(sk, (*skb2)->len);
+ sk->sk_data_ready(sk);
*skb2 = NULL;
err = 0;
}
diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c
index d276e2d4a589..950909f04ee6 100644
--- a/net/l2tp/l2tp_ppp.c
+++ b/net/l2tp/l2tp_ppp.c
@@ -753,9 +753,9 @@ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr,
session->deref = pppol2tp_session_sock_put;
/* If PMTU discovery was enabled, use the MTU that was discovered */
- dst = sk_dst_get(sk);
+ dst = sk_dst_get(tunnel->sock);
if (dst != NULL) {
- u32 pmtu = dst_mtu(__sk_dst_get(sk));
+ u32 pmtu = dst_mtu(__sk_dst_get(tunnel->sock));
if (pmtu != 0)
session->mtu = session->mru = pmtu -
PPPOL2TP_HEADER_OVERHEAD;
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index c2d585c4f7c5..894cda0206bb 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -1653,7 +1653,7 @@ static int __netlink_sendskb(struct sock *sk, struct sk_buff *skb)
else
#endif /* CONFIG_NETLINK_MMAP */
skb_queue_tail(&sk->sk_receive_queue, skb);
- sk->sk_data_ready(sk, len);
+ sk->sk_data_ready(sk);
return len;
}
@@ -2394,7 +2394,7 @@ out:
return err ? : copied;
}
-static void netlink_data_ready(struct sock *sk, int len)
+static void netlink_data_ready(struct sock *sk)
{
BUG();
}
diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c
index b74aa0755521..ede50d197e10 100644
--- a/net/netrom/af_netrom.c
+++ b/net/netrom/af_netrom.c
@@ -1011,7 +1011,7 @@ int nr_rx_frame(struct sk_buff *skb, struct net_device *dev)
skb_queue_head(&sk->sk_receive_queue, skb);
if (!sock_flag(sk, SOCK_DEAD))
- sk->sk_data_ready(sk, skb->len);
+ sk->sk_data_ready(sk);
bh_unlock_sock(sk);
diff --git a/net/nfc/llcp_core.c b/net/nfc/llcp_core.c
index b486f12ae243..b4671958fcf9 100644
--- a/net/nfc/llcp_core.c
+++ b/net/nfc/llcp_core.c
@@ -976,7 +976,7 @@ static void nfc_llcp_recv_connect(struct nfc_llcp_local *local,
new_sk->sk_state = LLCP_CONNECTED;
/* Wake the listening processes */
- parent->sk_data_ready(parent, 0);
+ parent->sk_data_ready(parent);
/* Send CC */
nfc_llcp_send_cc(new_sock);
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 72e0c71fb01d..b85c67ccb797 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -1848,7 +1848,7 @@ static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
skb->dropcount = atomic_read(&sk->sk_drops);
__skb_queue_tail(&sk->sk_receive_queue, skb);
spin_unlock(&sk->sk_receive_queue.lock);
- sk->sk_data_ready(sk, skb->len);
+ sk->sk_data_ready(sk);
return 0;
drop_n_acct:
@@ -2054,7 +2054,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
else
prb_clear_blk_fill_status(&po->rx_ring);
- sk->sk_data_ready(sk, 0);
+ sk->sk_data_ready(sk);
drop_n_restore:
if (skb_head != skb->data && skb_shared(skb)) {
@@ -2069,7 +2069,7 @@ ring_is_full:
po->stats.stats1.tp_drops++;
spin_unlock(&sk->sk_receive_queue.lock);
- sk->sk_data_ready(sk, 0);
+ sk->sk_data_ready(sk);
kfree_skb(copy_skb);
goto drop_n_restore;
}
diff --git a/net/phonet/pep-gprs.c b/net/phonet/pep-gprs.c
index a2fba7edfd1f..66dc65e7c6a1 100644
--- a/net/phonet/pep-gprs.c
+++ b/net/phonet/pep-gprs.c
@@ -37,7 +37,7 @@
struct gprs_dev {
struct sock *sk;
void (*old_state_change)(struct sock *);
- void (*old_data_ready)(struct sock *, int);
+ void (*old_data_ready)(struct sock *);
void (*old_write_space)(struct sock *);
struct net_device *dev;
@@ -146,7 +146,7 @@ drop:
return err;
}
-static void gprs_data_ready(struct sock *sk, int len)
+static void gprs_data_ready(struct sock *sk)
{
struct gprs_dev *gp = sk->sk_user_data;
struct sk_buff *skb;
diff --git a/net/phonet/pep.c b/net/phonet/pep.c
index e77411735de8..70a547ea5177 100644
--- a/net/phonet/pep.c
+++ b/net/phonet/pep.c
@@ -462,10 +462,9 @@ out:
queue:
skb->dev = NULL;
skb_set_owner_r(skb, sk);
- err = skb->len;
skb_queue_tail(queue, skb);
if (!sock_flag(sk, SOCK_DEAD))
- sk->sk_data_ready(sk, err);
+ sk->sk_data_ready(sk);
return NET_RX_SUCCESS;
}
@@ -587,10 +586,9 @@ static int pipe_handler_do_rcv(struct sock *sk, struct sk_buff *skb)
pn->rx_credits--;
skb->dev = NULL;
skb_set_owner_r(skb, sk);
- err = skb->len;
skb_queue_tail(&sk->sk_receive_queue, skb);
if (!sock_flag(sk, SOCK_DEAD))
- sk->sk_data_ready(sk, err);
+ sk->sk_data_ready(sk);
return NET_RX_SUCCESS;
case PNS_PEP_CONNECT_RESP:
@@ -698,7 +696,7 @@ static int pep_do_rcv(struct sock *sk, struct sk_buff *skb)
skb_queue_head(&sk->sk_receive_queue, skb);
sk_acceptq_added(sk);
if (!sock_flag(sk, SOCK_DEAD))
- sk->sk_data_ready(sk, 0);
+ sk->sk_data_ready(sk);
return NET_RX_SUCCESS;
case PNS_PEP_DISCONNECT_REQ:
diff --git a/net/rds/tcp.h b/net/rds/tcp.h
index 9cf2927d0021..65637491f728 100644
--- a/net/rds/tcp.h
+++ b/net/rds/tcp.h
@@ -61,12 +61,12 @@ void rds_tcp_state_change(struct sock *sk);
/* tcp_listen.c */
int rds_tcp_listen_init(void);
void rds_tcp_listen_stop(void);
-void rds_tcp_listen_data_ready(struct sock *sk, int bytes);
+void rds_tcp_listen_data_ready(struct sock *sk);
/* tcp_recv.c */
int rds_tcp_recv_init(void);
void rds_tcp_recv_exit(void);
-void rds_tcp_data_ready(struct sock *sk, int bytes);
+void rds_tcp_data_ready(struct sock *sk);
int rds_tcp_recv(struct rds_connection *conn);
void rds_tcp_inc_free(struct rds_incoming *inc);
int rds_tcp_inc_copy_to_user(struct rds_incoming *inc, struct iovec *iov,
diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c
index 7787537e9c2e..4e638f851185 100644
--- a/net/rds/tcp_listen.c
+++ b/net/rds/tcp_listen.c
@@ -108,9 +108,9 @@ static void rds_tcp_accept_worker(struct work_struct *work)
cond_resched();
}
-void rds_tcp_listen_data_ready(struct sock *sk, int bytes)
+void rds_tcp_listen_data_ready(struct sock *sk)
{
- void (*ready)(struct sock *sk, int bytes);
+ void (*ready)(struct sock *sk);
rdsdebug("listen data ready sk %p\n", sk);
@@ -132,7 +132,7 @@ void rds_tcp_listen_data_ready(struct sock *sk, int bytes)
out:
read_unlock(&sk->sk_callback_lock);
- ready(sk, bytes);
+ ready(sk);
}
int rds_tcp_listen_init(void)
diff --git a/net/rds/tcp_recv.c b/net/rds/tcp_recv.c
index 4fac4f2bb9dc..9ae6e0a264ec 100644
--- a/net/rds/tcp_recv.c
+++ b/net/rds/tcp_recv.c
@@ -314,13 +314,13 @@ int rds_tcp_recv(struct rds_connection *conn)
return ret;
}
-void rds_tcp_data_ready(struct sock *sk, int bytes)
+void rds_tcp_data_ready(struct sock *sk)
{
- void (*ready)(struct sock *sk, int bytes);
+ void (*ready)(struct sock *sk);
struct rds_connection *conn;
struct rds_tcp_connection *tc;
- rdsdebug("data ready sk %p bytes %d\n", sk, bytes);
+ rdsdebug("data ready sk %p\n", sk);
read_lock(&sk->sk_callback_lock);
conn = sk->sk_user_data;
@@ -337,7 +337,7 @@ void rds_tcp_data_ready(struct sock *sk, int bytes)
queue_delayed_work(rds_wq, &conn->c_recv_w, 0);
out:
read_unlock(&sk->sk_callback_lock);
- ready(sk, bytes);
+ ready(sk);
}
int rds_tcp_recv_init(void)
diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c
index c2cca2ee6aef..8451c8cdc9de 100644
--- a/net/rose/af_rose.c
+++ b/net/rose/af_rose.c
@@ -1041,7 +1041,7 @@ int rose_rx_call_request(struct sk_buff *skb, struct net_device *dev, struct ros
rose_start_heartbeat(make);
if (!sock_flag(sk, SOCK_DEAD))
- sk->sk_data_ready(sk, skb->len);
+ sk->sk_data_ready(sk);
return 1;
}
diff --git a/net/rxrpc/ar-input.c b/net/rxrpc/ar-input.c
index 73742647c135..63b21e580de9 100644
--- a/net/rxrpc/ar-input.c
+++ b/net/rxrpc/ar-input.c
@@ -113,7 +113,7 @@ int rxrpc_queue_rcv_skb(struct rxrpc_call *call, struct sk_buff *skb,
spin_unlock_bh(&sk->sk_receive_queue.lock);
if (!sock_flag(sk, SOCK_DEAD))
- sk->sk_data_ready(sk, skb_len);
+ sk->sk_data_ready(sk);
}
skb = NULL;
} else {
@@ -632,14 +632,14 @@ cant_find_conn:
* handle data received on the local endpoint
* - may be called in interrupt context
*/
-void rxrpc_data_ready(struct sock *sk, int count)
+void rxrpc_data_ready(struct sock *sk)
{
struct rxrpc_skb_priv *sp;
struct rxrpc_local *local;
struct sk_buff *skb;
int ret;
- _enter("%p, %d", sk, count);
+ _enter("%p", sk);
ASSERT(!irqs_disabled());
diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h
index c831d44b0841..ba9fd36d3f15 100644
--- a/net/rxrpc/ar-internal.h
+++ b/net/rxrpc/ar-internal.h
@@ -518,7 +518,7 @@ void rxrpc_UDP_error_handler(struct work_struct *);
*/
extern const char *rxrpc_pkts[];
-void rxrpc_data_ready(struct sock *, int);
+void rxrpc_data_ready(struct sock *);
int rxrpc_queue_rcv_skb(struct rxrpc_call *, struct sk_buff *, bool, bool);
void rxrpc_fast_process_packet(struct rxrpc_call *, struct sk_buff *);
diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index 4f6d6f9d1274..39579c3e0d14 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -1395,35 +1395,44 @@ static inline bool sctp_peer_needs_update(struct sctp_association *asoc)
return false;
}
-/* Update asoc's rwnd for the approximated state in the buffer,
- * and check whether SACK needs to be sent.
- */
-void sctp_assoc_rwnd_update(struct sctp_association *asoc, bool update_peer)
+/* Increase asoc's rwnd by len and send any window update SACK if needed. */
+void sctp_assoc_rwnd_increase(struct sctp_association *asoc, unsigned int len)
{
- int rx_count;
struct sctp_chunk *sack;
struct timer_list *timer;
- if (asoc->ep->rcvbuf_policy)
- rx_count = atomic_read(&asoc->rmem_alloc);
- else
- rx_count = atomic_read(&asoc->base.sk->sk_rmem_alloc);
+ if (asoc->rwnd_over) {
+ if (asoc->rwnd_over >= len) {
+ asoc->rwnd_over -= len;
+ } else {
+ asoc->rwnd += (len - asoc->rwnd_over);
+ asoc->rwnd_over = 0;
+ }
+ } else {
+ asoc->rwnd += len;
+ }
- if ((asoc->base.sk->sk_rcvbuf - rx_count) > 0)
- asoc->rwnd = (asoc->base.sk->sk_rcvbuf - rx_count) >> 1;
- else
- asoc->rwnd = 0;
+ /* If we had window pressure, start recovering it
+ * once our rwnd had reached the accumulated pressure
+ * threshold. The idea is to recover slowly, but up
+ * to the initial advertised window.
+ */
+ if (asoc->rwnd_press && asoc->rwnd >= asoc->rwnd_press) {
+ int change = min(asoc->pathmtu, asoc->rwnd_press);
+ asoc->rwnd += change;
+ asoc->rwnd_press -= change;
+ }
- pr_debug("%s: asoc:%p rwnd=%u, rx_count=%d, sk_rcvbuf=%d\n",
- __func__, asoc, asoc->rwnd, rx_count,
- asoc->base.sk->sk_rcvbuf);
+ pr_debug("%s: asoc:%p rwnd increased by %d to (%u, %u) - %u\n",
+ __func__, asoc, len, asoc->rwnd, asoc->rwnd_over,
+ asoc->a_rwnd);
/* Send a window update SACK if the rwnd has increased by at least the
* minimum of the association's PMTU and half of the receive buffer.
* The algorithm used is similar to the one described in
* Section 4.2.3.3 of RFC 1122.
*/
- if (update_peer && sctp_peer_needs_update(asoc)) {
+ if (sctp_peer_needs_update(asoc)) {
asoc->a_rwnd = asoc->rwnd;
pr_debug("%s: sending window update SACK- asoc:%p rwnd:%u "
@@ -1445,6 +1454,45 @@ void sctp_assoc_rwnd_update(struct sctp_association *asoc, bool update_peer)
}
}
+/* Decrease asoc's rwnd by len. */
+void sctp_assoc_rwnd_decrease(struct sctp_association *asoc, unsigned int len)
+{
+ int rx_count;
+ int over = 0;
+
+ if (unlikely(!asoc->rwnd || asoc->rwnd_over))
+ pr_debug("%s: association:%p has asoc->rwnd:%u, "
+ "asoc->rwnd_over:%u!\n", __func__, asoc,
+ asoc->rwnd, asoc->rwnd_over);
+
+ if (asoc->ep->rcvbuf_policy)
+ rx_count = atomic_read(&asoc->rmem_alloc);
+ else
+ rx_count = atomic_read(&asoc->base.sk->sk_rmem_alloc);
+
+ /* If we've reached or overflowed our receive buffer, announce
+ * a 0 rwnd if rwnd would still be positive. Store the
+ * the potential pressure overflow so that the window can be restored
+ * back to original value.
+ */
+ if (rx_count >= asoc->base.sk->sk_rcvbuf)
+ over = 1;
+
+ if (asoc->rwnd >= len) {
+ asoc->rwnd -= len;
+ if (over) {
+ asoc->rwnd_press += asoc->rwnd;
+ asoc->rwnd = 0;
+ }
+ } else {
+ asoc->rwnd_over = len - asoc->rwnd;
+ asoc->rwnd = 0;
+ }
+
+ pr_debug("%s: asoc:%p rwnd decreased by %d to (%u, %u, %u)\n",
+ __func__, asoc, len, asoc->rwnd, asoc->rwnd_over,
+ asoc->rwnd_press);
+}
/* Build the bind address list for the association based on info from the
* local endpoint and the remote peer.
diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c
index 01e002430c85..ae9fbeba40b0 100644
--- a/net/sctp/sm_statefuns.c
+++ b/net/sctp/sm_statefuns.c
@@ -6178,7 +6178,7 @@ static int sctp_eat_data(const struct sctp_association *asoc,
* PMTU. In cases, such as loopback, this might be a rather
* large spill over.
*/
- if ((!chunk->data_accepted) && (!asoc->rwnd ||
+ if ((!chunk->data_accepted) && (!asoc->rwnd || asoc->rwnd_over ||
(datalen > asoc->rwnd + asoc->frag_point))) {
/* If this is the next TSN, consider reneging to make
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 981aaf8b6ace..ff20e2dbbbc7 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -2115,6 +2115,12 @@ static int sctp_recvmsg(struct kiocb *iocb, struct sock *sk,
sctp_skb_pull(skb, copied);
skb_queue_head(&sk->sk_receive_queue, skb);
+ /* When only partial message is copied to the user, increase
+ * rwnd by that amount. If all the data in the skb is read,
+ * rwnd is updated when the event is freed.
+ */
+ if (!sctp_ulpevent_is_notification(event))
+ sctp_assoc_rwnd_increase(event->asoc, copied);
goto out;
} else if ((event->msg_flags & MSG_NOTIFICATION) ||
(event->msg_flags & MSG_EOR))
@@ -6593,6 +6599,46 @@ static void __sctp_write_space(struct sctp_association *asoc)
}
}
+static void sctp_wake_up_waiters(struct sock *sk,
+ struct sctp_association *asoc)
+{
+ struct sctp_association *tmp = asoc;
+
+ /* We do accounting for the sndbuf space per association,
+ * so we only need to wake our own association.
+ */
+ if (asoc->ep->sndbuf_policy)
+ return __sctp_write_space(asoc);
+
+ /* If association goes down and is just flushing its
+ * outq, then just normally notify others.
+ */
+ if (asoc->base.dead)
+ return sctp_write_space(sk);
+
+ /* Accounting for the sndbuf space is per socket, so we
+ * need to wake up others, try to be fair and in case of
+ * other associations, let them have a go first instead
+ * of just doing a sctp_write_space() call.
+ *
+ * Note that we reach sctp_wake_up_waiters() only when
+ * associations free up queued chunks, thus we are under
+ * lock and the list of associations on a socket is
+ * guaranteed not to change.
+ */
+ for (tmp = list_next_entry(tmp, asocs); 1;
+ tmp = list_next_entry(tmp, asocs)) {
+ /* Manually skip the head element. */
+ if (&tmp->asocs == &((sctp_sk(sk))->ep->asocs))
+ continue;
+ /* Wake up association. */
+ __sctp_write_space(tmp);
+ /* We've reached the end. */
+ if (tmp == asoc)
+ break;
+ }
+}
+
/* Do accounting for the sndbuf space.
* Decrement the used sndbuf space of the corresponding association by the
* data size which was just transmitted(freed).
@@ -6620,7 +6666,7 @@ static void sctp_wfree(struct sk_buff *skb)
sk_mem_uncharge(sk, skb->truesize);
sock_wfree(skb);
- __sctp_write_space(asoc);
+ sctp_wake_up_waiters(sk, asoc);
sctp_association_put(asoc);
}
@@ -6705,7 +6751,7 @@ do_nonblock:
goto out;
}
-void sctp_data_ready(struct sock *sk, int len)
+void sctp_data_ready(struct sock *sk)
{
struct socket_wq *wq;
diff --git a/net/sctp/ulpevent.c b/net/sctp/ulpevent.c
index 8d198ae03606..85c64658bd0b 100644
--- a/net/sctp/ulpevent.c
+++ b/net/sctp/ulpevent.c
@@ -989,7 +989,7 @@ static void sctp_ulpevent_receive_data(struct sctp_ulpevent *event,
skb = sctp_event2skb(event);
/* Set the owner and charge rwnd for bytes received. */
sctp_ulpevent_set_owner(event, asoc);
- sctp_assoc_rwnd_update(asoc, false);
+ sctp_assoc_rwnd_decrease(asoc, skb_headlen(skb));
if (!skb->data_len)
return;
@@ -1011,7 +1011,6 @@ static void sctp_ulpevent_release_data(struct sctp_ulpevent *event)
{
struct sk_buff *skb, *frag;
unsigned int len;
- struct sctp_association *asoc;
/* Current stack structures assume that the rcv buffer is
* per socket. For UDP style sockets this is not true as
@@ -1036,11 +1035,8 @@ static void sctp_ulpevent_release_data(struct sctp_ulpevent *event)
}
done:
- asoc = event->asoc;
- sctp_association_hold(asoc);
+ sctp_assoc_rwnd_increase(event->asoc, len);
sctp_ulpevent_release_owner(event);
- sctp_assoc_rwnd_update(asoc, true);
- sctp_association_put(asoc);
}
static void sctp_ulpevent_release_frag_data(struct sctp_ulpevent *event)
diff --git a/net/sctp/ulpqueue.c b/net/sctp/ulpqueue.c
index 5dc94117e9d4..7144eb6a1b95 100644
--- a/net/sctp/ulpqueue.c
+++ b/net/sctp/ulpqueue.c
@@ -259,7 +259,7 @@ int sctp_ulpq_tail_event(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event)
sctp_ulpq_clear_pd(ulpq);
if (queue == &sk->sk_receive_queue)
- sk->sk_data_ready(sk, 0);
+ sk->sk_data_ready(sk);
return 1;
out_free:
@@ -1135,5 +1135,5 @@ void sctp_ulpq_abort_pd(struct sctp_ulpq *ulpq, gfp_t gfp)
/* If there is data waiting, send it up the socket now. */
if (sctp_ulpq_clear_pd(ulpq) || ev)
- sk->sk_data_ready(sk, 0);
+ sk->sk_data_ready(sk);
}
diff --git a/net/sunrpc/Kconfig b/net/sunrpc/Kconfig
index 241b54f30204..0754d0f466d2 100644
--- a/net/sunrpc/Kconfig
+++ b/net/sunrpc/Kconfig
@@ -9,19 +9,6 @@ config SUNRPC_BACKCHANNEL
bool
depends on SUNRPC
-config SUNRPC_XPRT_RDMA
- tristate
- depends on SUNRPC && INFINIBAND && INFINIBAND_ADDR_TRANS
- default SUNRPC && INFINIBAND
- help
- This option allows the NFS client and server to support
- an RDMA-enabled transport.
-
- To compile RPC client RDMA transport support as a module,
- choose M here: the module will be called xprtrdma.
-
- If unsure, say N.
-
config SUNRPC_SWAP
bool
depends on SUNRPC
@@ -57,3 +44,29 @@ config SUNRPC_DEBUG
but makes troubleshooting NFS issues significantly harder.
If unsure, say Y.
+
+config SUNRPC_XPRT_RDMA_CLIENT
+ tristate "RPC over RDMA Client Support"
+ depends on SUNRPC && INFINIBAND && INFINIBAND_ADDR_TRANS
+ default SUNRPC && INFINIBAND
+ help
+ This option allows the NFS client to support an RDMA-enabled
+ transport.
+
+ To compile RPC client RDMA transport support as a module,
+ choose M here: the module will be called xprtrdma.
+
+ If unsure, say N.
+
+config SUNRPC_XPRT_RDMA_SERVER
+ tristate "RPC over RDMA Server Support"
+ depends on SUNRPC && INFINIBAND && INFINIBAND_ADDR_TRANS
+ default SUNRPC && INFINIBAND
+ help
+ This option allows the NFS server to support an RDMA-enabled
+ transport.
+
+ To compile RPC server RDMA transport support as a module,
+ choose M here: the module will be called svcrdma.
+
+ If unsure, say N.
diff --git a/net/sunrpc/Makefile b/net/sunrpc/Makefile
index 8209a0411bca..e5a7a1cac8f3 100644
--- a/net/sunrpc/Makefile
+++ b/net/sunrpc/Makefile
@@ -5,7 +5,8 @@
obj-$(CONFIG_SUNRPC) += sunrpc.o
obj-$(CONFIG_SUNRPC_GSS) += auth_gss/
-obj-$(CONFIG_SUNRPC_XPRT_RDMA) += xprtrdma/
+
+obj-y += xprtrdma/
sunrpc-y := clnt.o xprt.o socklib.o xprtsock.o sched.o \
auth.o auth_null.o auth_unix.o auth_generic.o \
diff --git a/net/sunrpc/backchannel_rqst.c b/net/sunrpc/backchannel_rqst.c
index e860d4f7ed2a..3513d559bc45 100644
--- a/net/sunrpc/backchannel_rqst.c
+++ b/net/sunrpc/backchannel_rqst.c
@@ -212,39 +212,23 @@ out:
}
EXPORT_SYMBOL_GPL(xprt_destroy_backchannel);
-/*
- * One or more rpc_rqst structure have been preallocated during the
- * backchannel setup. Buffer space for the send and private XDR buffers
- * has been preallocated as well. Use xprt_alloc_bc_request to allocate
- * to this request. Use xprt_free_bc_request to return it.
- *
- * We know that we're called in soft interrupt context, grab the spin_lock
- * since there is no need to grab the bottom half spin_lock.
- *
- * Return an available rpc_rqst, otherwise NULL if non are available.
- */
-struct rpc_rqst *xprt_alloc_bc_request(struct rpc_xprt *xprt)
+static struct rpc_rqst *xprt_alloc_bc_request(struct rpc_xprt *xprt, __be32 xid)
{
- struct rpc_rqst *req;
+ struct rpc_rqst *req = NULL;
dprintk("RPC: allocate a backchannel request\n");
- spin_lock(&xprt->bc_pa_lock);
- if (!list_empty(&xprt->bc_pa_list)) {
- req = list_first_entry(&xprt->bc_pa_list, struct rpc_rqst,
- rq_bc_pa_list);
- list_del(&req->rq_bc_pa_list);
- } else {
- req = NULL;
- }
- spin_unlock(&xprt->bc_pa_lock);
+ if (list_empty(&xprt->bc_pa_list))
+ goto not_found;
- if (req != NULL) {
- set_bit(RPC_BC_PA_IN_USE, &req->rq_bc_pa_state);
- req->rq_reply_bytes_recvd = 0;
- req->rq_bytes_sent = 0;
- memcpy(&req->rq_private_buf, &req->rq_rcv_buf,
+ req = list_first_entry(&xprt->bc_pa_list, struct rpc_rqst,
+ rq_bc_pa_list);
+ req->rq_reply_bytes_recvd = 0;
+ req->rq_bytes_sent = 0;
+ memcpy(&req->rq_private_buf, &req->rq_rcv_buf,
sizeof(req->rq_private_buf));
- }
+ req->rq_xid = xid;
+ req->rq_connect_cookie = xprt->connect_cookie;
+not_found:
dprintk("RPC: backchannel req=%p\n", req);
return req;
}
@@ -259,6 +243,7 @@ void xprt_free_bc_request(struct rpc_rqst *req)
dprintk("RPC: free backchannel req=%p\n", req);
+ req->rq_connect_cookie = xprt->connect_cookie - 1;
smp_mb__before_clear_bit();
WARN_ON_ONCE(!test_bit(RPC_BC_PA_IN_USE, &req->rq_bc_pa_state));
clear_bit(RPC_BC_PA_IN_USE, &req->rq_bc_pa_state);
@@ -281,7 +266,57 @@ void xprt_free_bc_request(struct rpc_rqst *req)
* may be reused by a new callback request.
*/
spin_lock_bh(&xprt->bc_pa_lock);
- list_add(&req->rq_bc_pa_list, &xprt->bc_pa_list);
+ list_add_tail(&req->rq_bc_pa_list, &xprt->bc_pa_list);
spin_unlock_bh(&xprt->bc_pa_lock);
}
+/*
+ * One or more rpc_rqst structure have been preallocated during the
+ * backchannel setup. Buffer space for the send and private XDR buffers
+ * has been preallocated as well. Use xprt_alloc_bc_request to allocate
+ * to this request. Use xprt_free_bc_request to return it.
+ *
+ * We know that we're called in soft interrupt context, grab the spin_lock
+ * since there is no need to grab the bottom half spin_lock.
+ *
+ * Return an available rpc_rqst, otherwise NULL if non are available.
+ */
+struct rpc_rqst *xprt_lookup_bc_request(struct rpc_xprt *xprt, __be32 xid)
+{
+ struct rpc_rqst *req;
+
+ spin_lock(&xprt->bc_pa_lock);
+ list_for_each_entry(req, &xprt->bc_pa_list, rq_bc_pa_list) {
+ if (req->rq_connect_cookie != xprt->connect_cookie)
+ continue;
+ if (req->rq_xid == xid)
+ goto found;
+ }
+ req = xprt_alloc_bc_request(xprt, xid);
+found:
+ spin_unlock(&xprt->bc_pa_lock);
+ return req;
+}
+
+/*
+ * Add callback request to callback list. The callback
+ * service sleeps on the sv_cb_waitq waiting for new
+ * requests. Wake it up after adding enqueing the
+ * request.
+ */
+void xprt_complete_bc_request(struct rpc_rqst *req, uint32_t copied)
+{
+ struct rpc_xprt *xprt = req->rq_xprt;
+ struct svc_serv *bc_serv = xprt->bc_serv;
+
+ req->rq_private_buf.len = copied;
+ set_bit(RPC_BC_PA_IN_USE, &req->rq_bc_pa_state);
+
+ dprintk("RPC: add callback request to list\n");
+ spin_lock(&bc_serv->sv_cb_lock);
+ list_del(&req->rq_bc_pa_list);
+ list_add(&req->rq_bc_list, &bc_serv->sv_cb_list);
+ wake_up(&bc_serv->sv_cb_waitq);
+ spin_unlock(&bc_serv->sv_cb_lock);
+}
+
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 0edada973434..2e6ab10734f6 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -438,6 +438,38 @@ out_no_rpciod:
return ERR_PTR(err);
}
+struct rpc_clnt *rpc_create_xprt(struct rpc_create_args *args,
+ struct rpc_xprt *xprt)
+{
+ struct rpc_clnt *clnt = NULL;
+
+ clnt = rpc_new_client(args, xprt, NULL);
+ if (IS_ERR(clnt))
+ return clnt;
+
+ if (!(args->flags & RPC_CLNT_CREATE_NOPING)) {
+ int err = rpc_ping(clnt);
+ if (err != 0) {
+ rpc_shutdown_client(clnt);
+ return ERR_PTR(err);
+ }
+ }
+
+ clnt->cl_softrtry = 1;
+ if (args->flags & RPC_CLNT_CREATE_HARDRTRY)
+ clnt->cl_softrtry = 0;
+
+ if (args->flags & RPC_CLNT_CREATE_AUTOBIND)
+ clnt->cl_autobind = 1;
+ if (args->flags & RPC_CLNT_CREATE_DISCRTRY)
+ clnt->cl_discrtry = 1;
+ if (!(args->flags & RPC_CLNT_CREATE_QUIET))
+ clnt->cl_chatty = 1;
+
+ return clnt;
+}
+EXPORT_SYMBOL_GPL(rpc_create_xprt);
+
/**
* rpc_create - create an RPC client and transport with one call
* @args: rpc_clnt create argument structure
@@ -451,7 +483,6 @@ out_no_rpciod:
struct rpc_clnt *rpc_create(struct rpc_create_args *args)
{
struct rpc_xprt *xprt;
- struct rpc_clnt *clnt;
struct xprt_create xprtargs = {
.net = args->net,
.ident = args->protocol,
@@ -515,30 +546,7 @@ struct rpc_clnt *rpc_create(struct rpc_create_args *args)
if (args->flags & RPC_CLNT_CREATE_NONPRIVPORT)
xprt->resvport = 0;
- clnt = rpc_new_client(args, xprt, NULL);
- if (IS_ERR(clnt))
- return clnt;
-
- if (!(args->flags & RPC_CLNT_CREATE_NOPING)) {
- int err = rpc_ping(clnt);
- if (err != 0) {
- rpc_shutdown_client(clnt);
- return ERR_PTR(err);
- }
- }
-
- clnt->cl_softrtry = 1;
- if (args->flags & RPC_CLNT_CREATE_HARDRTRY)
- clnt->cl_softrtry = 0;
-
- if (args->flags & RPC_CLNT_CREATE_AUTOBIND)
- clnt->cl_autobind = 1;
- if (args->flags & RPC_CLNT_CREATE_DISCRTRY)
- clnt->cl_discrtry = 1;
- if (!(args->flags & RPC_CLNT_CREATE_QUIET))
- clnt->cl_chatty = 1;
-
- return clnt;
+ return rpc_create_xprt(args, xprt);
}
EXPORT_SYMBOL_GPL(rpc_create);
@@ -1363,6 +1371,7 @@ rpc_restart_call_prepare(struct rpc_task *task)
if (RPC_ASSASSINATED(task))
return 0;
task->tk_action = call_start;
+ task->tk_status = 0;
if (task->tk_ops->rpc_call_prepare != NULL)
task->tk_action = rpc_prepare_task;
return 1;
@@ -1379,6 +1388,7 @@ rpc_restart_call(struct rpc_task *task)
if (RPC_ASSASSINATED(task))
return 0;
task->tk_action = call_start;
+ task->tk_status = 0;
return 1;
}
EXPORT_SYMBOL_GPL(rpc_restart_call);
@@ -1728,9 +1738,7 @@ call_bind_status(struct rpc_task *task)
case -EPROTONOSUPPORT:
dprintk("RPC: %5u remote rpcbind version unavailable, retrying\n",
task->tk_pid);
- task->tk_status = 0;
- task->tk_action = call_bind;
- return;
+ goto retry_timeout;
case -ECONNREFUSED: /* connection problems */
case -ECONNRESET:
case -ECONNABORTED:
@@ -1756,6 +1764,7 @@ call_bind_status(struct rpc_task *task)
return;
retry_timeout:
+ task->tk_status = 0;
task->tk_action = call_timeout;
}
@@ -1798,21 +1807,19 @@ call_connect_status(struct rpc_task *task)
trace_rpc_connect_status(task, status);
task->tk_status = 0;
switch (status) {
- /* if soft mounted, test if we've timed out */
- case -ETIMEDOUT:
- task->tk_action = call_timeout;
- return;
case -ECONNREFUSED:
case -ECONNRESET:
case -ECONNABORTED:
case -ENETUNREACH:
case -EHOSTUNREACH:
- /* retry with existing socket, after a delay */
- rpc_delay(task, 3*HZ);
if (RPC_IS_SOFTCONN(task))
break;
+ /* retry with existing socket, after a delay */
+ rpc_delay(task, 3*HZ);
case -EAGAIN:
- task->tk_action = call_bind;
+ /* Check for timeouts before looping back to call_bind */
+ case -ETIMEDOUT:
+ task->tk_action = call_timeout;
return;
case 0:
clnt->cl_stats->netreconn++;
@@ -2007,6 +2014,10 @@ call_status(struct rpc_task *task)
case -EHOSTDOWN:
case -EHOSTUNREACH:
case -ENETUNREACH:
+ if (RPC_IS_SOFTCONN(task)) {
+ rpc_exit(task, status);
+ break;
+ }
/*
* Delay any retries for 3 seconds, then handle as if it
* were a timeout.
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index ff3cc4bf4b24..25578afe1548 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -637,7 +637,8 @@ static void __rpc_queue_timer_fn(unsigned long ptr)
static void __rpc_atrun(struct rpc_task *task)
{
- task->tk_status = 0;
+ if (task->tk_status == -ETIMEDOUT)
+ task->tk_status = 0;
}
/*
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index b6e59f0a9475..43bcb4699d69 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -60,7 +60,7 @@
static struct svc_sock *svc_setup_socket(struct svc_serv *, struct socket *,
int flags);
-static void svc_udp_data_ready(struct sock *, int);
+static void svc_udp_data_ready(struct sock *);
static int svc_udp_recvfrom(struct svc_rqst *);
static int svc_udp_sendto(struct svc_rqst *);
static void svc_sock_detach(struct svc_xprt *);
@@ -403,14 +403,14 @@ static void svc_sock_setbufsize(struct socket *sock, unsigned int snd,
/*
* INET callback when data has been received on the socket.
*/
-static void svc_udp_data_ready(struct sock *sk, int count)
+static void svc_udp_data_ready(struct sock *sk)
{
struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data;
wait_queue_head_t *wq = sk_sleep(sk);
if (svsk) {
- dprintk("svc: socket %p(inet %p), count=%d, busy=%d\n",
- svsk, sk, count,
+ dprintk("svc: socket %p(inet %p), busy=%d\n",
+ svsk, sk,
test_bit(XPT_BUSY, &svsk->sk_xprt.xpt_flags));
set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
svc_xprt_enqueue(&svsk->sk_xprt);
@@ -731,7 +731,7 @@ static void svc_udp_init(struct svc_sock *svsk, struct svc_serv *serv)
* A data_ready event on a listening socket means there's a connection
* pending. Do not use state_change as a substitute for it.
*/
-static void svc_tcp_listen_data_ready(struct sock *sk, int count_unused)
+static void svc_tcp_listen_data_ready(struct sock *sk)
{
struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data;
wait_queue_head_t *wq;
@@ -783,7 +783,7 @@ static void svc_tcp_state_change(struct sock *sk)
wake_up_interruptible_all(wq);
}
-static void svc_tcp_data_ready(struct sock *sk, int count)
+static void svc_tcp_data_ready(struct sock *sk)
{
struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data;
wait_queue_head_t *wq = sk_sleep(sk);
@@ -1397,6 +1397,22 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv,
return svsk;
}
+bool svc_alien_sock(struct net *net, int fd)
+{
+ int err;
+ struct socket *sock = sockfd_lookup(fd, &err);
+ bool ret = false;
+
+ if (!sock)
+ goto out;
+ if (sock_net(sock->sk) != net)
+ ret = true;
+ sockfd_put(sock);
+out:
+ return ret;
+}
+EXPORT_SYMBOL_GPL(svc_alien_sock);
+
/**
* svc_addsock - add a listener socket to an RPC service
* @serv: pointer to RPC service to which to add a new listener
diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c
index 1504bb11e4f3..dd97ba3c4456 100644
--- a/net/sunrpc/xdr.c
+++ b/net/sunrpc/xdr.c
@@ -833,8 +833,20 @@ xdr_buf_from_iov(struct kvec *iov, struct xdr_buf *buf)
}
EXPORT_SYMBOL_GPL(xdr_buf_from_iov);
-/* Sets subbuf to the portion of buf of length len beginning base bytes
- * from the start of buf. Returns -1 if base of length are out of bounds. */
+/**
+ * xdr_buf_subsegment - set subbuf to a portion of buf
+ * @buf: an xdr buffer
+ * @subbuf: the result buffer
+ * @base: beginning of range in bytes
+ * @len: length of range in bytes
+ *
+ * sets @subbuf to an xdr buffer representing the portion of @buf of
+ * length @len starting at offset @base.
+ *
+ * @buf and @subbuf may be pointers to the same struct xdr_buf.
+ *
+ * Returns -1 if base of length are out of bounds.
+ */
int
xdr_buf_subsegment(struct xdr_buf *buf, struct xdr_buf *subbuf,
unsigned int base, unsigned int len)
@@ -847,9 +859,8 @@ xdr_buf_subsegment(struct xdr_buf *buf, struct xdr_buf *subbuf,
len -= subbuf->head[0].iov_len;
base = 0;
} else {
- subbuf->head[0].iov_base = NULL;
- subbuf->head[0].iov_len = 0;
base -= buf->head[0].iov_len;
+ subbuf->head[0].iov_len = 0;
}
if (base < buf->page_len) {
@@ -871,9 +882,8 @@ xdr_buf_subsegment(struct xdr_buf *buf, struct xdr_buf *subbuf,
len -= subbuf->tail[0].iov_len;
base = 0;
} else {
- subbuf->tail[0].iov_base = NULL;
- subbuf->tail[0].iov_len = 0;
base -= buf->tail[0].iov_len;
+ subbuf->tail[0].iov_len = 0;
}
if (base || len)
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index 7d4df99f761f..d173f79947c6 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -1383,15 +1383,3 @@ void xprt_put(struct rpc_xprt *xprt)
if (atomic_dec_and_test(&xprt->count))
xprt_destroy(xprt);
}
-
-/**
- * xprt_get - return a reference to an RPC transport.
- * @xprt: pointer to the transport
- *
- */
-struct rpc_xprt *xprt_get(struct rpc_xprt *xprt)
-{
- if (atomic_inc_not_zero(&xprt->count))
- return xprt;
- return NULL;
-}
diff --git a/net/sunrpc/xprtrdma/Makefile b/net/sunrpc/xprtrdma/Makefile
index 5a8f268bdd30..da5136fd5694 100644
--- a/net/sunrpc/xprtrdma/Makefile
+++ b/net/sunrpc/xprtrdma/Makefile
@@ -1,8 +1,8 @@
-obj-$(CONFIG_SUNRPC_XPRT_RDMA) += xprtrdma.o
+obj-$(CONFIG_SUNRPC_XPRT_RDMA_CLIENT) += xprtrdma.o
xprtrdma-y := transport.o rpc_rdma.o verbs.o
-obj-$(CONFIG_SUNRPC_XPRT_RDMA) += svcrdma.o
+obj-$(CONFIG_SUNRPC_XPRT_RDMA_SERVER) += svcrdma.o
svcrdma-y := svc_rdma.o svc_rdma_transport.o \
svc_rdma_marshal.o svc_rdma_sendto.o svc_rdma_recvfrom.o
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index e03725bfe2b8..96ead526b125 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -649,9 +649,7 @@ rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad)
break;
page_base = 0;
}
- rqst->rq_rcv_buf.page_len = olen - copy_len;
- } else
- rqst->rq_rcv_buf.page_len = 0;
+ }
if (copy_len && rqst->rq_rcv_buf.tail[0].iov_len) {
curlen = copy_len;
diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
index 0ce75524ed21..8d904e4eef15 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
@@ -90,6 +90,7 @@ static void rdma_build_arg_xdr(struct svc_rqst *rqstp,
sge_no++;
}
rqstp->rq_respages = &rqstp->rq_pages[sge_no];
+ rqstp->rq_next_page = rqstp->rq_respages + 1;
/* We should never run out of SGE because the limit is defined to
* support the max allowed RPC data length
@@ -169,6 +170,7 @@ static int map_read_chunks(struct svcxprt_rdma *xprt,
*/
head->arg.pages[page_no] = rqstp->rq_arg.pages[page_no];
rqstp->rq_respages = &rqstp->rq_arg.pages[page_no+1];
+ rqstp->rq_next_page = rqstp->rq_respages + 1;
byte_count -= sge_bytes;
ch_bytes -= sge_bytes;
@@ -276,6 +278,7 @@ static int fast_reg_read_chunks(struct svcxprt_rdma *xprt,
/* rq_respages points one past arg pages */
rqstp->rq_respages = &rqstp->rq_arg.pages[page_no];
+ rqstp->rq_next_page = rqstp->rq_respages + 1;
/* Create the reply and chunk maps */
offset = 0;
@@ -520,13 +523,6 @@ next_sge:
for (ch_no = 0; &rqstp->rq_pages[ch_no] < rqstp->rq_respages; ch_no++)
rqstp->rq_pages[ch_no] = NULL;
- /*
- * Detach res pages. If svc_release sees any it will attempt to
- * put them.
- */
- while (rqstp->rq_next_page != rqstp->rq_respages)
- *(--rqstp->rq_next_page) = NULL;
-
return err;
}
@@ -550,7 +546,7 @@ static int rdma_read_complete(struct svc_rqst *rqstp,
/* rq_respages starts after the last arg page */
rqstp->rq_respages = &rqstp->rq_arg.pages[page_no];
- rqstp->rq_next_page = &rqstp->rq_arg.pages[page_no];
+ rqstp->rq_next_page = rqstp->rq_respages + 1;
/* Rebuild rq_arg head and tail. */
rqstp->rq_arg.head[0] = head->arg.head[0];
diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
index c1d124dc772b..7e024a51617e 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
@@ -265,6 +265,7 @@ static dma_addr_t dma_map_xdr(struct svcxprt_rdma *xprt,
xdr_off -= xdr->head[0].iov_len;
if (xdr_off < xdr->page_len) {
/* This offset is in the page list */
+ xdr_off += xdr->page_base;
page = xdr->pages[xdr_off >> PAGE_SHIFT];
xdr_off &= ~PAGE_MASK;
} else {
@@ -625,6 +626,7 @@ static int send_reply(struct svcxprt_rdma *rdma,
if (page_no+1 >= sge_no)
ctxt->sge[page_no+1].length = 0;
}
+ rqstp->rq_next_page = rqstp->rq_respages + 1;
BUG_ON(sge_no > rdma->sc_max_sge);
memset(&send_wr, 0, sizeof send_wr);
ctxt->wr_op = IB_WR_SEND;
diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index 62e4f9bcc387..25688fa2207f 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -477,8 +477,7 @@ struct page *svc_rdma_get_page(void)
while ((page = alloc_page(GFP_KERNEL)) == NULL) {
/* If we can't get memory, wait a bit and try again */
- printk(KERN_INFO "svcrdma: out of memory...retrying in 1000 "
- "jiffies.\n");
+ printk(KERN_INFO "svcrdma: out of memory...retrying in 1s\n");
schedule_timeout_uninterruptible(msecs_to_jiffies(1000));
}
return page;
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index 285dc0884115..1eb9c468d0c9 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -733,7 +733,7 @@ static void __exit xprt_rdma_cleanup(void)
{
int rc;
- dprintk(KERN_INFO "RPCRDMA Module Removed, deregister RPC RDMA transport\n");
+ dprintk("RPCRDMA Module Removed, deregister RPC RDMA transport\n");
#ifdef RPC_DEBUG
if (sunrpc_table_header) {
unregister_sysctl_table(sunrpc_table_header);
@@ -755,14 +755,14 @@ static int __init xprt_rdma_init(void)
if (rc)
return rc;
- dprintk(KERN_INFO "RPCRDMA Module Init, register RPC RDMA transport\n");
+ dprintk("RPCRDMA Module Init, register RPC RDMA transport\n");
- dprintk(KERN_INFO "Defaults:\n");
- dprintk(KERN_INFO "\tSlots %d\n"
+ dprintk("Defaults:\n");
+ dprintk("\tSlots %d\n"
"\tMaxInlineRead %d\n\tMaxInlineWrite %d\n",
xprt_rdma_slot_table_entries,
xprt_rdma_max_inline_read, xprt_rdma_max_inline_write);
- dprintk(KERN_INFO "\tPadding %d\n\tMemreg %d\n",
+ dprintk("\tPadding %d\n\tMemreg %d\n",
xprt_rdma_inline_write_padding, xprt_rdma_memreg_strategy);
#ifdef RPC_DEBUG
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 0addefca8e77..25a3dcf15cae 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -254,7 +254,7 @@ struct sock_xprt {
/*
* Saved socket callback addresses
*/
- void (*old_data_ready)(struct sock *, int);
+ void (*old_data_ready)(struct sock *);
void (*old_state_change)(struct sock *);
void (*old_write_space)(struct sock *);
void (*old_error_report)(struct sock *);
@@ -909,6 +909,12 @@ static void xs_tcp_close(struct rpc_xprt *xprt)
xs_tcp_shutdown(xprt);
}
+static void xs_xprt_free(struct rpc_xprt *xprt)
+{
+ xs_free_peer_addresses(xprt);
+ xprt_free(xprt);
+}
+
/**
* xs_destroy - prepare to shutdown a transport
* @xprt: doomed transport
@@ -919,8 +925,7 @@ static void xs_destroy(struct rpc_xprt *xprt)
dprintk("RPC: xs_destroy xprt %p\n", xprt);
xs_close(xprt);
- xs_free_peer_addresses(xprt);
- xprt_free(xprt);
+ xs_xprt_free(xprt);
module_put(THIS_MODULE);
}
@@ -946,7 +951,7 @@ static int xs_local_copy_to_xdr(struct xdr_buf *xdr, struct sk_buff *skb)
*
* Currently this assumes we can read the whole reply in a single gulp.
*/
-static void xs_local_data_ready(struct sock *sk, int len)
+static void xs_local_data_ready(struct sock *sk)
{
struct rpc_task *task;
struct rpc_xprt *xprt;
@@ -1009,7 +1014,7 @@ static void xs_local_data_ready(struct sock *sk, int len)
* @len: how much data to read
*
*/
-static void xs_udp_data_ready(struct sock *sk, int len)
+static void xs_udp_data_ready(struct sock *sk)
{
struct rpc_task *task;
struct rpc_xprt *xprt;
@@ -1306,41 +1311,29 @@ static inline int xs_tcp_read_reply(struct rpc_xprt *xprt,
* If we're unable to obtain the rpc_rqst we schedule the closing of the
* connection and return -1.
*/
-static inline int xs_tcp_read_callback(struct rpc_xprt *xprt,
+static int xs_tcp_read_callback(struct rpc_xprt *xprt,
struct xdr_skb_reader *desc)
{
struct sock_xprt *transport =
container_of(xprt, struct sock_xprt, xprt);
struct rpc_rqst *req;
- req = xprt_alloc_bc_request(xprt);
+ /* Look up and lock the request corresponding to the given XID */
+ spin_lock(&xprt->transport_lock);
+ req = xprt_lookup_bc_request(xprt, transport->tcp_xid);
if (req == NULL) {
+ spin_unlock(&xprt->transport_lock);
printk(KERN_WARNING "Callback slot table overflowed\n");
xprt_force_disconnect(xprt);
return -1;
}
- req->rq_xid = transport->tcp_xid;
dprintk("RPC: read callback XID %08x\n", ntohl(req->rq_xid));
xs_tcp_read_common(xprt, desc, req);
- if (!(transport->tcp_flags & TCP_RCV_COPY_DATA)) {
- struct svc_serv *bc_serv = xprt->bc_serv;
-
- /*
- * Add callback request to callback list. The callback
- * service sleeps on the sv_cb_waitq waiting for new
- * requests. Wake it up after adding enqueing the
- * request.
- */
- dprintk("RPC: add callback request to list\n");
- spin_lock(&bc_serv->sv_cb_lock);
- list_add(&req->rq_bc_list, &bc_serv->sv_cb_list);
- spin_unlock(&bc_serv->sv_cb_lock);
- wake_up(&bc_serv->sv_cb_waitq);
- }
-
- req->rq_private_buf.len = transport->tcp_copied;
+ if (!(transport->tcp_flags & TCP_RCV_COPY_DATA))
+ xprt_complete_bc_request(req, transport->tcp_copied);
+ spin_unlock(&xprt->transport_lock);
return 0;
}
@@ -1444,7 +1437,7 @@ static int xs_tcp_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb, uns
* @bytes: how much data to read
*
*/
-static void xs_tcp_data_ready(struct sock *sk, int bytes)
+static void xs_tcp_data_ready(struct sock *sk)
{
struct rpc_xprt *xprt;
read_descriptor_t rd_desc;
@@ -2544,6 +2537,10 @@ static void bc_close(struct rpc_xprt *xprt)
static void bc_destroy(struct rpc_xprt *xprt)
{
+ dprintk("RPC: bc_destroy xprt %p\n", xprt);
+
+ xs_xprt_free(xprt);
+ module_put(THIS_MODULE);
}
static struct rpc_xprt_ops xs_local_ops = {
@@ -2744,7 +2741,7 @@ static struct rpc_xprt *xs_setup_local(struct xprt_create *args)
return xprt;
ret = ERR_PTR(-EINVAL);
out_err:
- xprt_free(xprt);
+ xs_xprt_free(xprt);
return ret;
}
@@ -2822,7 +2819,7 @@ static struct rpc_xprt *xs_setup_udp(struct xprt_create *args)
return xprt;
ret = ERR_PTR(-EINVAL);
out_err:
- xprt_free(xprt);
+ xs_xprt_free(xprt);
return ret;
}
@@ -2897,12 +2894,11 @@ static struct rpc_xprt *xs_setup_tcp(struct xprt_create *args)
xprt->address_strings[RPC_DISPLAY_ADDR],
xprt->address_strings[RPC_DISPLAY_PROTO]);
-
if (try_module_get(THIS_MODULE))
return xprt;
ret = ERR_PTR(-EINVAL);
out_err:
- xprt_free(xprt);
+ xs_xprt_free(xprt);
return ret;
}
@@ -2919,15 +2915,6 @@ static struct rpc_xprt *xs_setup_bc_tcp(struct xprt_create *args)
struct svc_sock *bc_sock;
struct rpc_xprt *ret;
- if (args->bc_xprt->xpt_bc_xprt) {
- /*
- * This server connection already has a backchannel
- * transport; we can't create a new one, as we wouldn't
- * be able to match replies based on xid any more. So,
- * reuse the already-existing one:
- */
- return args->bc_xprt->xpt_bc_xprt;
- }
xprt = xs_setup_xprt(args, xprt_tcp_slot_table_entries,
xprt_tcp_slot_table_entries);
if (IS_ERR(xprt))
@@ -2985,13 +2972,14 @@ static struct rpc_xprt *xs_setup_bc_tcp(struct xprt_create *args)
*/
xprt_set_connected(xprt);
-
if (try_module_get(THIS_MODULE))
return xprt;
+
+ args->bc_xprt->xpt_bc_xprt = NULL;
xprt_put(xprt);
ret = ERR_PTR(-EINVAL);
out_err:
- xprt_free(xprt);
+ xs_xprt_free(xprt);
return ret;
}
diff --git a/net/tipc/server.c b/net/tipc/server.c
index 646a930eefbf..a538a02f869b 100644
--- a/net/tipc/server.c
+++ b/net/tipc/server.c
@@ -119,7 +119,7 @@ static struct tipc_conn *tipc_conn_lookup(struct tipc_server *s, int conid)
return con;
}
-static void sock_data_ready(struct sock *sk, int unused)
+static void sock_data_ready(struct sock *sk)
{
struct tipc_conn *con;
@@ -297,7 +297,7 @@ static int tipc_accept_from_sock(struct tipc_conn *con)
newcon->usr_data = s->tipc_conn_new(newcon->conid);
/* Wake up receive process in case of 'SYN+' message */
- newsock->sk->sk_data_ready(newsock->sk, 0);
+ newsock->sk->sk_data_ready(newsock->sk);
return ret;
}
diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index adc12e227303..3c0256962f7d 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -45,7 +45,7 @@
#define CONN_TIMEOUT_DEFAULT 8000 /* default connect timeout = 8s */
static int backlog_rcv(struct sock *sk, struct sk_buff *skb);
-static void tipc_data_ready(struct sock *sk, int len);
+static void tipc_data_ready(struct sock *sk);
static void tipc_write_space(struct sock *sk);
static int tipc_release(struct socket *sock);
static int tipc_accept(struct socket *sock, struct socket *new_sock, int flags);
@@ -1248,7 +1248,7 @@ static void tipc_write_space(struct sock *sk)
* @sk: socket
* @len: the length of messages
*/
-static void tipc_data_ready(struct sock *sk, int len)
+static void tipc_data_ready(struct sock *sk)
{
struct socket_wq *wq;
@@ -1410,7 +1410,7 @@ static u32 filter_rcv(struct sock *sk, struct sk_buff *buf)
__skb_queue_tail(&sk->sk_receive_queue, buf);
skb_set_owner_r(buf, sk);
- sk->sk_data_ready(sk, 0);
+ sk->sk_data_ready(sk);
return TIPC_OK;
}
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 94404f19f9de..bb7e8ba821f4 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -1217,7 +1217,7 @@ restart:
__skb_queue_tail(&other->sk_receive_queue, skb);
spin_unlock(&other->sk_receive_queue.lock);
unix_state_unlock(other);
- other->sk_data_ready(other, 0);
+ other->sk_data_ready(other);
sock_put(other);
return 0;
@@ -1600,7 +1600,7 @@ restart:
if (max_level > unix_sk(other)->recursion_level)
unix_sk(other)->recursion_level = max_level;
unix_state_unlock(other);
- other->sk_data_ready(other, len);
+ other->sk_data_ready(other);
sock_put(other);
scm_destroy(siocb->scm);
return len;
@@ -1706,7 +1706,7 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock,
if (max_level > unix_sk(other)->recursion_level)
unix_sk(other)->recursion_level = max_level;
unix_state_unlock(other);
- other->sk_data_ready(other, size);
+ other->sk_data_ready(other);
sent += size;
}
diff --git a/net/vmw_vsock/vmci_transport_notify.c b/net/vmw_vsock/vmci_transport_notify.c
index 9a730744e7bc..9b7f207f2bee 100644
--- a/net/vmw_vsock/vmci_transport_notify.c
+++ b/net/vmw_vsock/vmci_transport_notify.c
@@ -315,7 +315,7 @@ vmci_transport_handle_wrote(struct sock *sk,
struct vsock_sock *vsk = vsock_sk(sk);
PKT_FIELD(vsk, sent_waiting_read) = false;
#endif
- sk->sk_data_ready(sk, 0);
+ sk->sk_data_ready(sk);
}
static void vmci_transport_notify_pkt_socket_init(struct sock *sk)
diff --git a/net/vmw_vsock/vmci_transport_notify_qstate.c b/net/vmw_vsock/vmci_transport_notify_qstate.c
index 622bd7aa1016..dc9c7929a2f9 100644
--- a/net/vmw_vsock/vmci_transport_notify_qstate.c
+++ b/net/vmw_vsock/vmci_transport_notify_qstate.c
@@ -92,7 +92,7 @@ vmci_transport_handle_wrote(struct sock *sk,
bool bottom_half,
struct sockaddr_vm *dst, struct sockaddr_vm *src)
{
- sk->sk_data_ready(sk, 0);
+ sk->sk_data_ready(sk);
}
static void vsock_block_update_write_window(struct sock *sk)
@@ -290,7 +290,7 @@ vmci_transport_notify_pkt_recv_post_dequeue(
/* See the comment in
* vmci_transport_notify_pkt_send_post_enqueue().
*/
- sk->sk_data_ready(sk, 0);
+ sk->sk_data_ready(sk);
}
return err;
diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c
index 6177479c7de9..5ad4418ef093 100644
--- a/net/x25/af_x25.c
+++ b/net/x25/af_x25.c
@@ -1064,7 +1064,7 @@ int x25_rx_call_request(struct sk_buff *skb, struct x25_neigh *nb,
x25_start_heartbeat(make);
if (!sock_flag(sk, SOCK_DEAD))
- sk->sk_data_ready(sk, skb->len);
+ sk->sk_data_ready(sk);
rc = 1;
sock_put(sk);
out:
diff --git a/net/x25/x25_in.c b/net/x25/x25_in.c
index d1b0dc79bb6f..7ac50098a375 100644
--- a/net/x25/x25_in.c
+++ b/net/x25/x25_in.c
@@ -79,7 +79,7 @@ static int x25_queue_rx_frame(struct sock *sk, struct sk_buff *skb, int more)
skb_set_owner_r(skbn, sk);
skb_queue_tail(&sk->sk_receive_queue, skbn);
if (!sock_flag(sk, SOCK_DEAD))
- sk->sk_data_ready(sk, skbn->len);
+ sk->sk_data_ready(sk);
return 0;
}