 include/net/tcp.h                                 |   2
 net/ipv4/tcp.c                                    |   2
 net/mptcp/mib.c                                   |   5
 net/mptcp/mib.h                                   |   5
 net/mptcp/pm_netlink.c                            |  39
 net/mptcp/protocol.c                              | 511
 net/mptcp/protocol.h                              |  21
 net/mptcp/subflow.c                               |  99
 tools/testing/selftests/net/mptcp/Makefile        |   3
 tools/testing/selftests/net/mptcp/simult_flows.sh | 293
 10 files changed, 798 insertions(+), 182 deletions(-)
diff --git a/include/net/tcp.h b/include/net/tcp.h
index e85d564446c6..852f0d71dd40 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1414,6 +1414,8 @@ static inline int tcp_full_space(const struct sock *sk)
return tcp_win_from_space(sk, READ_ONCE(sk->sk_rcvbuf));
}
+void tcp_cleanup_rbuf(struct sock *sk, int copied);
+
/* We provision sk_rcvbuf around 200% of sk_rcvlowat.
* If 87.5 % (7/8) of the space has been consumed, we want to override
* SO_RCVLOWAT constraint, since we are receiving skbs with too small
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 57a568875539..d3781b6087cb 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1527,7 +1527,7 @@ static int tcp_peek_sndq(struct sock *sk, struct msghdr *msg, int len)
* calculation of whether or not we must ACK for the sake of
* a window update.
*/
-static void tcp_cleanup_rbuf(struct sock *sk, int copied)
+void tcp_cleanup_rbuf(struct sock *sk, int copied)
{
struct tcp_sock *tp = tcp_sk(sk);
bool time_to_ack = false;
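
Exporting tcp_cleanup_rbuf() lets MPTCP run TCP's ACK/window-update heuristics after it consumes data from a subflow's receive queue outside of tcp_recvmsg(). A minimal sketch of the intended calling pattern (example_after_move() is illustrative, not part of the patch):

    #include <net/tcp.h>

    /* After the MPTCP layer pulls skbs off the subflow receive queue,
     * give TCP a chance to send a window-updating ACK.  "ssk" is the
     * subflow TCP socket, locked by the caller; "moved" is the number
     * of bytes just consumed.
     */
    static void example_after_move(struct sock *ssk, unsigned int moved)
    {
    	if (moved)
    		tcp_cleanup_rbuf(ssk, moved);
    }

This is exactly how __mptcp_move_skbs_from_subflow() below uses the helper.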
diff --git a/net/mptcp/mib.c b/net/mptcp/mib.c
index 0a6a15f3456d..056986c7a228 100644
--- a/net/mptcp/mib.c
+++ b/net/mptcp/mib.c
@@ -22,6 +22,11 @@ static const struct snmp_mib mptcp_snmp_list[] = {
SNMP_MIB_ITEM("MPJoinAckHMacFailure", MPTCP_MIB_JOINACKMAC),
SNMP_MIB_ITEM("DSSNotMatching", MPTCP_MIB_DSSNOMATCH),
SNMP_MIB_ITEM("InfiniteMapRx", MPTCP_MIB_INFINITEMAPRX),
+ SNMP_MIB_ITEM("OFOQueueTail", MPTCP_MIB_OFOQUEUETAIL),
+ SNMP_MIB_ITEM("OFOQueue", MPTCP_MIB_OFOQUEUE),
+ SNMP_MIB_ITEM("OFOMerge", MPTCP_MIB_OFOMERGE),
+ SNMP_MIB_ITEM("NoDSSInWindow", MPTCP_MIB_NODSSWINDOW),
+ SNMP_MIB_ITEM("DuplicateData", MPTCP_MIB_DUPDATA),
SNMP_MIB_SENTINEL
};
diff --git a/net/mptcp/mib.h b/net/mptcp/mib.h
index d7de340fc997..937a177729f1 100644
--- a/net/mptcp/mib.h
+++ b/net/mptcp/mib.h
@@ -15,6 +15,11 @@ enum linux_mptcp_mib_field {
MPTCP_MIB_JOINACKMAC, /* HMAC was wrong on ACK + MP_JOIN */
MPTCP_MIB_DSSNOMATCH, /* Received a new mapping that did not match the previous one */
MPTCP_MIB_INFINITEMAPRX, /* Received an infinite mapping */
+ MPTCP_MIB_OFOQUEUETAIL, /* Segments inserted into OoO queue tail */
+ MPTCP_MIB_OFOQUEUE, /* Segments inserted into OoO queue */
+ MPTCP_MIB_OFOMERGE, /* Segments merged in OoO queue */
+ MPTCP_MIB_NODSSWINDOW, /* Segments not in the MPTCP window */
+ MPTCP_MIB_DUPDATA, /* Segments discarded due to duplicate DSS */
__MPTCP_MIB_MAX
};
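
These counters are bumped from the data path with the existing MPTCP_INC_STATS() helper and surface to userspace with the other MPTcpExt counters (e.g. via nstat). A minimal sketch (example_count_dup() is illustrative):

    #include "mib.h"

    /* account one segment dropped as a duplicate; it shows up in
     * userspace as the "DuplicateData" entry
     */
    static void example_count_dup(const struct sock *sk)
    {
    	MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DUPDATA);
    }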
diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c
index 2c208d2e65cd..6947f4fee6b9 100644
--- a/net/mptcp/pm_netlink.c
+++ b/net/mptcp/pm_netlink.c
@@ -23,8 +23,6 @@ static int pm_nl_pernet_id;
struct mptcp_pm_addr_entry {
struct list_head list;
- unsigned int flags;
- int ifindex;
struct mptcp_addr_info addr;
struct rcu_head rcu;
};
@@ -119,7 +117,7 @@ select_local_address(const struct pm_nl_pernet *pernet,
rcu_read_lock();
spin_lock_bh(&msk->join_list_lock);
list_for_each_entry_rcu(entry, &pernet->local_addr_list, list) {
- if (!(entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW))
+ if (!(entry->addr.flags & MPTCP_PM_ADDR_FLAG_SUBFLOW))
continue;
/* avoid any address already in use by subflows and
@@ -150,7 +148,7 @@ select_signal_address(struct pm_nl_pernet *pernet, unsigned int pos)
* can lead to additional addresses not being announced.
*/
list_for_each_entry_rcu(entry, &pernet->local_addr_list, list) {
- if (!(entry->flags & MPTCP_PM_ADDR_FLAG_SIGNAL))
+ if (!(entry->addr.flags & MPTCP_PM_ADDR_FLAG_SIGNAL))
continue;
if (i++ == pos) {
ret = entry;
@@ -210,8 +208,7 @@ static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk)
msk->pm.subflows++;
check_work_pending(msk);
spin_unlock_bh(&msk->pm.lock);
- __mptcp_subflow_connect(sk, local->ifindex,
- &local->addr, &remote);
+ __mptcp_subflow_connect(sk, &local->addr, &remote);
spin_lock_bh(&msk->pm.lock);
return;
}
@@ -257,13 +254,13 @@ void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk)
local.family = remote.family;
spin_unlock_bh(&msk->pm.lock);
- __mptcp_subflow_connect((struct sock *)msk, 0, &local, &remote);
+ __mptcp_subflow_connect((struct sock *)msk, &local, &remote);
spin_lock_bh(&msk->pm.lock);
}
static bool address_use_port(struct mptcp_pm_addr_entry *entry)
{
- return (entry->flags &
+ return (entry->addr.flags &
(MPTCP_PM_ADDR_FLAG_SIGNAL | MPTCP_PM_ADDR_FLAG_SUBFLOW)) ==
MPTCP_PM_ADDR_FLAG_SIGNAL;
}
@@ -293,9 +290,9 @@ static int mptcp_pm_nl_append_new_local_addr(struct pm_nl_pernet *pernet,
goto out;
}
- if (entry->flags & MPTCP_PM_ADDR_FLAG_SIGNAL)
+ if (entry->addr.flags & MPTCP_PM_ADDR_FLAG_SIGNAL)
pernet->add_addr_signal_max++;
- if (entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW)
+ if (entry->addr.flags & MPTCP_PM_ADDR_FLAG_SUBFLOW)
pernet->local_addr_max++;
entry->addr.id = pernet->next_id++;
@@ -345,8 +342,9 @@ int mptcp_pm_nl_get_local_id(struct mptcp_sock *msk, struct sock_common *skc)
if (!entry)
return -ENOMEM;
- entry->flags = 0;
entry->addr = skc_local;
+ entry->addr.ifindex = 0;
+ entry->addr.flags = 0;
ret = mptcp_pm_nl_append_new_local_addr(pernet, entry);
if (ret < 0)
kfree(entry);
@@ -460,14 +458,17 @@ static int mptcp_pm_parse_addr(struct nlattr *attr, struct genl_info *info,
entry->addr.addr.s_addr = nla_get_in_addr(tb[addr_addr]);
skip_family:
- if (tb[MPTCP_PM_ADDR_ATTR_IF_IDX])
- entry->ifindex = nla_get_s32(tb[MPTCP_PM_ADDR_ATTR_IF_IDX]);
+ if (tb[MPTCP_PM_ADDR_ATTR_IF_IDX]) {
+ u32 val = nla_get_s32(tb[MPTCP_PM_ADDR_ATTR_IF_IDX]);
+
+ entry->addr.ifindex = val;
+ }
if (tb[MPTCP_PM_ADDR_ATTR_ID])
entry->addr.id = nla_get_u8(tb[MPTCP_PM_ADDR_ATTR_ID]);
if (tb[MPTCP_PM_ADDR_ATTR_FLAGS])
- entry->flags = nla_get_u32(tb[MPTCP_PM_ADDR_ATTR_FLAGS]);
+ entry->addr.flags = nla_get_u32(tb[MPTCP_PM_ADDR_ATTR_FLAGS]);
return 0;
}
@@ -535,9 +536,9 @@ static int mptcp_nl_cmd_del_addr(struct sk_buff *skb, struct genl_info *info)
ret = -EINVAL;
goto out;
}
- if (entry->flags & MPTCP_PM_ADDR_FLAG_SIGNAL)
+ if (entry->addr.flags & MPTCP_PM_ADDR_FLAG_SIGNAL)
pernet->add_addr_signal_max--;
- if (entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW)
+ if (entry->addr.flags & MPTCP_PM_ADDR_FLAG_SUBFLOW)
pernet->local_addr_max--;
pernet->addrs--;
@@ -593,10 +594,10 @@ static int mptcp_nl_fill_addr(struct sk_buff *skb,
goto nla_put_failure;
if (nla_put_u8(skb, MPTCP_PM_ADDR_ATTR_ID, addr->id))
goto nla_put_failure;
- if (nla_put_u32(skb, MPTCP_PM_ADDR_ATTR_FLAGS, entry->flags))
+ if (nla_put_u32(skb, MPTCP_PM_ADDR_ATTR_FLAGS, entry->addr.flags))
goto nla_put_failure;
- if (entry->ifindex &&
- nla_put_s32(skb, MPTCP_PM_ADDR_ATTR_IF_IDX, entry->ifindex))
+ if (entry->addr.ifindex &&
+ nla_put_s32(skb, MPTCP_PM_ADDR_ATTR_IF_IDX, entry->addr.ifindex))
goto nla_put_failure;
if (addr->family == AF_INET &&
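
With flags and ifindex folded into struct mptcp_addr_info (see the protocol.h hunk below), the path manager no longer passes them as separate arguments. A sketch of how a configured endpoint now reaches subflow creation (example_open_subflow() is illustrative):

    #include "protocol.h"

    /* entry->addr now carries id, family, address, flags and ifindex,
     * so a single pointer is enough for __mptcp_subflow_connect()
     */
    static int example_open_subflow(struct sock *sk,
    				    const struct mptcp_pm_addr_entry *entry,
    				    const struct mptcp_addr_info *remote)
    {
    	return __mptcp_subflow_connect(sk, &entry->addr, remote);
    }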
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 683196225f91..ef0dd2f23482 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -32,6 +32,8 @@ struct mptcp6_sock {
#endif
struct mptcp_skb_cb {
+ u64 map_seq;
+ u64 end_seq;
u32 offset;
};
@@ -110,64 +112,204 @@ static int __mptcp_socket_create(struct mptcp_sock *msk)
return 0;
}
-static void __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk,
- struct sk_buff *skb,
- unsigned int offset, size_t copy_len)
+static void mptcp_drop(struct sock *sk, struct sk_buff *skb)
+{
+ sk_drops_add(sk, skb);
+ __kfree_skb(skb);
+}
+
+static bool mptcp_try_coalesce(struct sock *sk, struct sk_buff *to,
+ struct sk_buff *from)
+{
+ bool fragstolen;
+ int delta;
+
+ if (MPTCP_SKB_CB(from)->offset ||
+ !skb_try_coalesce(to, from, &fragstolen, &delta))
+ return false;
+
+ pr_debug("colesced seq %llx into %llx new len %d new end seq %llx",
+ MPTCP_SKB_CB(from)->map_seq, MPTCP_SKB_CB(to)->map_seq,
+ to->len, MPTCP_SKB_CB(from)->end_seq);
+ MPTCP_SKB_CB(to)->end_seq = MPTCP_SKB_CB(from)->end_seq;
+ kfree_skb_partial(from, fragstolen);
+ atomic_add(delta, &sk->sk_rmem_alloc);
+ sk_mem_charge(sk, delta);
+ return true;
+}
+
+static bool mptcp_ooo_try_coalesce(struct mptcp_sock *msk, struct sk_buff *to,
+ struct sk_buff *from)
+{
+ if (MPTCP_SKB_CB(from)->map_seq != MPTCP_SKB_CB(to)->end_seq)
+ return false;
+
+ return mptcp_try_coalesce((struct sock *)msk, to, from);
+}
+
+/* "inspired" by tcp_data_queue_ofo(), main differences:
+ * - use mptcp seqs
+ * - don't cope with sacks
+ */
+static void mptcp_data_queue_ofo(struct mptcp_sock *msk, struct sk_buff *skb)
{
struct sock *sk = (struct sock *)msk;
- struct sk_buff *tail;
+ struct rb_node **p, *parent;
+ u64 seq, end_seq, max_seq;
+ struct sk_buff *skb1;
+
+ seq = MPTCP_SKB_CB(skb)->map_seq;
+ end_seq = MPTCP_SKB_CB(skb)->end_seq;
+ max_seq = tcp_space(sk);
+ max_seq = max_seq > 0 ? max_seq + msk->ack_seq : msk->ack_seq;
+
+ pr_debug("msk=%p seq=%llx limit=%llx empty=%d", msk, seq, max_seq,
+ RB_EMPTY_ROOT(&msk->out_of_order_queue));
+ if (after64(seq, max_seq)) {
+ /* out of window */
+ mptcp_drop(sk, skb);
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_NODSSWINDOW);
+ return;
+ }
- __skb_unlink(skb, &ssk->sk_receive_queue);
+ p = &msk->out_of_order_queue.rb_node;
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_OFOQUEUE);
+ if (RB_EMPTY_ROOT(&msk->out_of_order_queue)) {
+ rb_link_node(&skb->rbnode, NULL, p);
+ rb_insert_color(&skb->rbnode, &msk->out_of_order_queue);
+ msk->ooo_last_skb = skb;
+ goto end;
+ }
- skb_ext_reset(skb);
- skb_orphan(skb);
- msk->ack_seq += copy_len;
+ /* with 2 subflows, adding at end of ooo queue is quite likely
+ * Use of ooo_last_skb avoids the O(Log(N)) rbtree lookup.
+ */
+ if (mptcp_ooo_try_coalesce(msk, msk->ooo_last_skb, skb)) {
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_OFOMERGE);
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_OFOQUEUETAIL);
+ return;
+ }
- tail = skb_peek_tail(&sk->sk_receive_queue);
- if (offset == 0 && tail) {
- bool fragstolen;
- int delta;
+ /* Can avoid an rbtree lookup if we are adding skb after ooo_last_skb */
+ if (!before64(seq, MPTCP_SKB_CB(msk->ooo_last_skb)->end_seq)) {
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_OFOQUEUETAIL);
+ parent = &msk->ooo_last_skb->rbnode;
+ p = &parent->rb_right;
+ goto insert;
+ }
- if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
- kfree_skb_partial(skb, fragstolen);
- atomic_add(delta, &sk->sk_rmem_alloc);
- sk_mem_charge(sk, delta);
+ /* Find place to insert this segment. Handle overlaps on the way. */
+ parent = NULL;
+ while (*p) {
+ parent = *p;
+ skb1 = rb_to_skb(parent);
+ if (before64(seq, MPTCP_SKB_CB(skb1)->map_seq)) {
+ p = &parent->rb_left;
+ continue;
+ }
+ if (before64(seq, MPTCP_SKB_CB(skb1)->end_seq)) {
+ if (!after64(end_seq, MPTCP_SKB_CB(skb1)->end_seq)) {
+ /* All the bits are present. Drop. */
+ mptcp_drop(sk, skb);
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DUPDATA);
+ return;
+ }
+ if (after64(seq, MPTCP_SKB_CB(skb1)->map_seq)) {
+ /* partial overlap:
+ * | skb |
+ * | skb1 |
+ * continue traversing
+ */
+ } else {
+ /* skb's seq == skb1's seq and skb covers skb1.
+ * Replace skb1 with skb.
+ */
+ rb_replace_node(&skb1->rbnode, &skb->rbnode,
+ &msk->out_of_order_queue);
+ mptcp_drop(sk, skb1);
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DUPDATA);
+ goto merge_right;
+ }
+ } else if (mptcp_ooo_try_coalesce(msk, skb1, skb)) {
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_OFOMERGE);
return;
}
+ p = &parent->rb_right;
}
- skb_set_owner_r(skb, sk);
- __skb_queue_tail(&sk->sk_receive_queue, skb);
- MPTCP_SKB_CB(skb)->offset = offset;
-}
+insert:
+ /* Insert segment into RB tree. */
+ rb_link_node(&skb->rbnode, parent, p);
+ rb_insert_color(&skb->rbnode, &msk->out_of_order_queue);
-static void mptcp_stop_timer(struct sock *sk)
-{
- struct inet_connection_sock *icsk = inet_csk(sk);
+merge_right:
+ /* Remove other segments covered by skb. */
+ while ((skb1 = skb_rb_next(skb)) != NULL) {
+ if (before64(end_seq, MPTCP_SKB_CB(skb1)->end_seq))
+ break;
+ rb_erase(&skb1->rbnode, &msk->out_of_order_queue);
+ mptcp_drop(sk, skb1);
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DUPDATA);
+ }
+ /* If there is no skb after us, we are the last_skb ! */
+ if (!skb1)
+ msk->ooo_last_skb = skb;
- sk_stop_timer(sk, &icsk->icsk_retransmit_timer);
- mptcp_sk(sk)->timer_ival = 0;
+end:
+ skb_condense(skb);
+ skb_set_owner_r(skb, sk);
}
-/* both sockets must be locked */
-static bool mptcp_subflow_dsn_valid(const struct mptcp_sock *msk,
- struct sock *ssk)
+static bool __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk,
+ struct sk_buff *skb, unsigned int offset,
+ size_t copy_len)
{
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
- u64 dsn = mptcp_subflow_get_mapped_dsn(subflow);
+ struct sock *sk = (struct sock *)msk;
+ struct sk_buff *tail;
- /* revalidate data sequence number.
- *
- * mptcp_subflow_data_available() is usually called
- * without msk lock. Its unlikely (but possible)
- * that msk->ack_seq has been advanced since the last
- * call found in-sequence data.
+ __skb_unlink(skb, &ssk->sk_receive_queue);
+
+ skb_ext_reset(skb);
+ skb_orphan(skb);
+
+ /* the skb map_seq accounts for the skb offset:
+ * mptcp_subflow_get_mapped_dsn() is based on the current tp->copied_seq
+ * value
*/
- if (likely(dsn == msk->ack_seq))
+ MPTCP_SKB_CB(skb)->map_seq = mptcp_subflow_get_mapped_dsn(subflow);
+ MPTCP_SKB_CB(skb)->end_seq = MPTCP_SKB_CB(skb)->map_seq + copy_len;
+ MPTCP_SKB_CB(skb)->offset = offset;
+
+ if (MPTCP_SKB_CB(skb)->map_seq == msk->ack_seq) {
+ /* in sequence */
+ msk->ack_seq += copy_len;
+ tail = skb_peek_tail(&sk->sk_receive_queue);
+ if (tail && mptcp_try_coalesce(sk, tail, skb))
+ return true;
+
+ skb_set_owner_r(skb, sk);
+ __skb_queue_tail(&sk->sk_receive_queue, skb);
return true;
+ } else if (after64(MPTCP_SKB_CB(skb)->map_seq, msk->ack_seq)) {
+ mptcp_data_queue_ofo(msk, skb);
+ return false;
+ }
- subflow->data_avail = 0;
- return mptcp_subflow_data_available(ssk);
+ /* old data, keep it simple and drop the whole pkt: the sender
+ * will retransmit it if needed.
+ */
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DUPDATA);
+ mptcp_drop(sk, skb);
+ return false;
+}
+
+static void mptcp_stop_timer(struct sock *sk)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+
+ sk_stop_timer(sk, &icsk->icsk_retransmit_timer);
+ mptcp_sk(sk)->timer_ival = 0;
}
static void mptcp_check_data_fin_ack(struct sock *sk)
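
For illustration, a standalone userspace model of the 64-bit sequence-space classification performed by __mptcp_move_skb() and mptcp_data_queue_ofo() above; before64()/after64() mirror the helpers in net/mptcp/protocol.h and all the values are invented:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    static bool before64(uint64_t seq1, uint64_t seq2)
    {
    	return (int64_t)(seq1 - seq2) < 0;
    }

    #define after64(seq2, seq1)	before64(seq1, seq2)

    int main(void)
    {
    	uint64_t ack_seq = 1000;	/* msk->ack_seq: next expected DSS */
    	uint64_t rcv_space = 4096;	/* stand-in for tcp_space(sk) */
    	uint64_t max_seq = ack_seq + rcv_space;
    	uint64_t map_seq = 1460;	/* incoming skb's mapped DSS */

    	if (after64(map_seq, max_seq))
    		puts("out of window: drop, count NoDSSInWindow");
    	else if (map_seq == ack_seq)
    		puts("in sequence: append to sk_receive_queue");
    	else if (after64(map_seq, ack_seq))
    		puts("future data: insert into out_of_order_queue");
    	else
    		puts("old data: drop, count DuplicateData");
    	return 0;
    }

With these values the segment lands in the out-of-order queue: 1460 is past the expected 1000, but inside the 1000 + 4096 window.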
@@ -313,11 +455,7 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk,
struct tcp_sock *tp;
bool done = false;
- if (!mptcp_subflow_dsn_valid(msk, ssk)) {
- *bytes = 0;
- return false;
- }
-
+ pr_debug("msk=%p ssk=%p", msk, ssk);
tp = tcp_sk(ssk);
do {
u32 map_remaining, offset;
@@ -355,9 +493,9 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk,
if (tp->urg_data)
done = true;
- __mptcp_move_skb(msk, ssk, skb, offset, len);
+ if (__mptcp_move_skb(msk, ssk, skb, offset, len))
+ moved += len;
seq += len;
- moved += len;
if (WARN_ON_ONCE(map_remaining < len))
break;
@@ -376,20 +514,56 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk,
}
} while (more_data_avail);
- *bytes = moved;
-
- /* If the moves have caught up with the DATA_FIN sequence number
- * it's time to ack the DATA_FIN and change socket state, but
- * this is not a good place to change state. Let the workqueue
- * do it.
- */
- if (mptcp_pending_data_fin(sk, NULL) &&
- schedule_work(&msk->work))
- sock_hold(sk);
+ *bytes += moved;
+ if (moved)
+ tcp_cleanup_rbuf(ssk, moved);
return done;
}
+static bool mptcp_ofo_queue(struct mptcp_sock *msk)
+{
+ struct sock *sk = (struct sock *)msk;
+ struct sk_buff *skb, *tail;
+ bool moved = false;
+ struct rb_node *p;
+ u64 end_seq;
+
+ p = rb_first(&msk->out_of_order_queue);
+ pr_debug("msk=%p empty=%d", msk, RB_EMPTY_ROOT(&msk->out_of_order_queue));
+ while (p) {
+ skb = rb_to_skb(p);
+ if (after64(MPTCP_SKB_CB(skb)->map_seq, msk->ack_seq))
+ break;
+
+ p = rb_next(p);
+ rb_erase(&skb->rbnode, &msk->out_of_order_queue);
+
+ if (unlikely(!after64(MPTCP_SKB_CB(skb)->end_seq,
+ msk->ack_seq))) {
+ mptcp_drop(sk, skb);
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DUPDATA);
+ continue;
+ }
+
+ end_seq = MPTCP_SKB_CB(skb)->end_seq;
+ tail = skb_peek_tail(&sk->sk_receive_queue);
+ if (!tail || !mptcp_ooo_try_coalesce(msk, tail, skb)) {
+ int delta = msk->ack_seq - MPTCP_SKB_CB(skb)->map_seq;
+
+ /* skip overlapping data, if any */
+ pr_debug("uncoalesced seq=%llx ack seq=%llx delta=%d",
+ MPTCP_SKB_CB(skb)->map_seq, msk->ack_seq,
+ delta);
+ MPTCP_SKB_CB(skb)->offset += delta;
+ __skb_queue_tail(&sk->sk_receive_queue, skb);
+ }
+ msk->ack_seq = end_seq;
+ moved = true;
+ }
+ return moved;
+}
+
/* In most cases we will be able to lock the mptcp socket. If it's already
* owned, we need to defer to the work queue to avoid ABBA deadlock.
*/
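
The subtle step in mptcp_ofo_queue() above is the partially overlapping head segment: when it starts below the current ack_seq but extends past it, the duplicate prefix is skipped by advancing MPTCP_SKB_CB(skb)->offset rather than dropping the whole skb. A standalone model of that arithmetic (values invented):

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
    	/* hypothetical segment at the head of the OoO queue */
    	uint64_t map_seq = 900, end_seq = 1500;
    	uint64_t ack_seq = 1000;  /* everything below this was received */
    	uint32_t offset = 0;      /* payload offset inside the skb */

    	if (end_seq <= ack_seq) {
    		puts("fully duplicate: drop, count DuplicateData");
    		return 0;
    	}
    	if (map_seq < ack_seq) {
    		/* skip the overlapping prefix, don't deliver it twice */
    		offset += ack_seq - map_seq;
    		printf("skipped %llu duplicate bytes, offset now %u\n",
    		       (unsigned long long)(ack_seq - map_seq), offset);
    	}
    	ack_seq = end_seq;	/* the segment is now in sequence */
    	printf("ack_seq advances to %llu\n", (unsigned long long)ack_seq);
    	return 0;
    }

(The kernel compares with before64()/after64(); plain comparisons suffice here because the invented values don't wrap.)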
@@ -405,8 +579,19 @@ static bool move_skbs_to_msk(struct mptcp_sock *msk, struct sock *ssk)
return false;
/* must re-check after taking the lock */
- if (!READ_ONCE(sk->sk_lock.owned))
+ if (!READ_ONCE(sk->sk_lock.owned)) {
__mptcp_move_skbs_from_subflow(msk, ssk, &moved);
+ mptcp_ofo_queue(msk);
+
+ /* If the moves have caught up with the DATA_FIN sequence number
+ * it's time to ack the DATA_FIN and change socket state, but
+ * this is not a good place to change state. Let the workqueue
+ * do it.
+ */
+ if (mptcp_pending_data_fin(sk, NULL) &&
+ schedule_work(&msk->work))
+ sock_hold(sk);
+ }
spin_unlock_bh(&sk->sk_lock.slock);
@@ -415,9 +600,17 @@ static bool move_skbs_to_msk(struct mptcp_sock *msk, struct sock *ssk)
void mptcp_data_ready(struct sock *sk, struct sock *ssk)
{
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
struct mptcp_sock *msk = mptcp_sk(sk);
+ bool wake;
- set_bit(MPTCP_DATA_READY, &msk->flags);
+ /* move_skbs_to_msk below can legitimately clear the data_avail flag,
+ * but we will need it later to properly wake the reader; cache its
+ * value
+ */
+ wake = subflow->data_avail == MPTCP_SUBFLOW_DATA_AVAIL;
+ if (wake)
+ set_bit(MPTCP_DATA_READY, &msk->flags);
if (atomic_read(&sk->sk_rmem_alloc) < READ_ONCE(sk->sk_rcvbuf) &&
move_skbs_to_msk(msk, ssk))
@@ -438,7 +631,8 @@ void mptcp_data_ready(struct sock *sk, struct sock *ssk)
move_skbs_to_msk(msk, ssk);
}
wake:
- sk->sk_data_ready(sk);
+ if (wake)
+ sk->sk_data_ready(sk);
}
static void __mptcp_flush_join_list(struct mptcp_sock *msk)
@@ -472,7 +666,7 @@ void mptcp_data_acked(struct sock *sk)
{
mptcp_reset_timer(sk);
- if ((!sk_stream_is_writeable(sk) ||
+ if ((!test_bit(MPTCP_SEND_SPACE, &mptcp_sk(sk)->flags) ||
(inet_sk_state_load(sk) != TCP_ESTABLISHED)) &&
schedule_work(&mptcp_sk(sk)->work))
sock_hold(sk);
@@ -567,6 +761,20 @@ static void dfrag_clear(struct sock *sk, struct mptcp_data_frag *dfrag)
put_page(dfrag->page);
}
+static bool mptcp_is_writeable(struct mptcp_sock *msk)
+{
+ struct mptcp_subflow_context *subflow;
+
+ if (!sk_stream_is_writeable((struct sock *)msk))
+ return false;
+
+ mptcp_for_each_subflow(msk, subflow) {
+ if (sk_stream_is_writeable(subflow->tcp_sock))
+ return true;
+ }
+ return false;
+}
+
static void mptcp_clean_una(struct sock *sk)
{
struct mptcp_sock *msk = mptcp_sk(sk);
@@ -609,8 +817,15 @@ out:
sk_mem_reclaim_partial(sk);
/* Only wake up writers if a subflow is ready */
- if (test_bit(MPTCP_SEND_SPACE, &msk->flags))
+ if (mptcp_is_writeable(msk)) {
+ set_bit(MPTCP_SEND_SPACE, &mptcp_sk(sk)->flags);
+ smp_mb__after_atomic();
+
+ /* set SEND_SPACE before sk_stream_write_space clears
+ * NOSPACE
+ */
sk_stream_write_space(sk);
+ }
}
}
@@ -801,60 +1016,128 @@ out:
return ret;
}
-static void mptcp_nospace(struct mptcp_sock *msk, struct socket *sock)
+static void mptcp_nospace(struct mptcp_sock *msk)
{
+ struct mptcp_subflow_context *subflow;
+
clear_bit(MPTCP_SEND_SPACE, &msk->flags);
smp_mb__after_atomic(); /* msk->flags is changed by write_space cb */
- /* enables sk->write_space() callbacks */
- set_bit(SOCK_NOSPACE, &sock->flags);
+ mptcp_for_each_subflow(msk, subflow) {
+ struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+ struct socket *sock = READ_ONCE(ssk->sk_socket);
+
+ /* enables ssk->write_space() callbacks */
+ if (sock)
+ set_bit(SOCK_NOSPACE, &sock->flags);
+ }
}
-static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk)
+static bool mptcp_subflow_active(struct mptcp_subflow_context *subflow)
{
+ struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+
+ /* can't send if JOIN hasn't completed yet (i.e. subflow not yet usable for mptcp) */
+ if (subflow->request_join && !subflow->fully_established)
+ return false;
+
+ /* only send if our side has not closed yet */
+ return ((1 << ssk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT));
+}
+
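+/* largest chunk a single burst may carry: one 64 KiB GSO packet minus
+ * worst-case TCP header, TCP option space and IPv6 + fragment headers
+ */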
+#define MPTCP_SEND_BURST_SIZE ((1 << 16) - \
+ sizeof(struct tcphdr) - \
+ MAX_TCP_OPTION_SPACE - \
+ sizeof(struct ipv6hdr) - \
+ sizeof(struct frag_hdr))
+
+struct subflow_send_info {
+ struct sock *ssk;
+ u64 ratio;
+};
+
+static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk,
+ u32 *sndbuf)
+{
+ struct subflow_send_info send_info[2];
struct mptcp_subflow_context *subflow;
- struct sock *backup = NULL;
+ int i, nr_active = 0;
+ struct sock *ssk;
+ u64 ratio;
+ u32 pace;
- sock_owned_by_me((const struct sock *)msk);
+ sock_owned_by_me((struct sock *)msk);
+ *sndbuf = 0;
if (!mptcp_ext_cache_refill(msk))
return NULL;
+ if (__mptcp_check_fallback(msk)) {
+ if (!msk->first)
+ return NULL;
+ *sndbuf = msk->first->sk_sndbuf;
+ return sk_stream_memory_free(msk->first) ? msk->first : NULL;
+ }
+
+ /* re-use last subflow, if the burst allows that */
+ if (msk->last_snd && msk->snd_burst > 0 &&
+ sk_stream_memory_free(msk->last_snd) &&
+ mptcp_subflow_active(mptcp_subflow_ctx(msk->last_snd))) {
+ mptcp_for_each_subflow(msk, subflow) {
+ ssk = mptcp_subflow_tcp_sock(subflow);
+ *sndbuf = max(tcp_sk(ssk)->snd_wnd, *sndbuf);
+ }
+ return msk->last_snd;
+ }
+
+ /* pick the subflow with the lowest wmem/pacing-rate ratio */
+ for (i = 0; i < 2; ++i) {
+ send_info[i].ssk = NULL;
+ send_info[i].ratio = -1;
+ }
mptcp_for_each_subflow(msk, subflow) {
- struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+ ssk = mptcp_subflow_tcp_sock(subflow);
+ if (!mptcp_subflow_active(subflow))
+ continue;
- if (!sk_stream_memory_free(ssk)) {
- struct socket *sock = ssk->sk_socket;
+ nr_active += !subflow->backup;
+ *sndbuf = max(tcp_sk(ssk)->snd_wnd, *sndbuf);
+ if (!sk_stream_memory_free(subflow->tcp_sock))
+ continue;
- if (sock)
- mptcp_nospace(msk, sock);
+ pace = READ_ONCE(ssk->sk_pacing_rate);
+ if (!pace)
+ continue;
- return NULL;
+ ratio = div_u64((u64)READ_ONCE(ssk->sk_wmem_queued) << 32,
+ pace);
+ if (ratio < send_info[subflow->backup].ratio) {
+ send_info[subflow->backup].ssk = ssk;
+ send_info[subflow->backup].ratio = ratio;
}
+ }
- if (subflow->backup) {
- if (!backup)
- backup = ssk;
+ pr_debug("msk=%p nr_active=%d ssk=%p:%lld backup=%p:%lld",
+ msk, nr_active, send_info[0].ssk, send_info[0].ratio,
+ send_info[1].ssk, send_info[1].ratio);
- continue;
- }
+ /* pick the best backup if no other subflow is active */
+ if (!nr_active)
+ send_info[0].ssk = send_info[1].ssk;
- return ssk;
+ if (send_info[0].ssk) {
+ msk->last_snd = send_info[0].ssk;
+ msk->snd_burst = min_t(int, MPTCP_SEND_BURST_SIZE,
+ sk_stream_wspace(msk->last_snd));
+ return msk->last_snd;
}
-
- return backup;
+ return NULL;
}
-static void ssk_check_wmem(struct mptcp_sock *msk, struct sock *ssk)
+static void ssk_check_wmem(struct mptcp_sock *msk)
{
- struct socket *sock;
-
- if (likely(sk_stream_is_writeable(ssk)))
- return;
-
- sock = READ_ONCE(ssk->sk_socket);
- if (sock)
- mptcp_nospace(msk, sock);
+ if (unlikely(!mptcp_is_writeable(msk)))
+ mptcp_nospace(msk);
}
static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
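
The scheduler thus picks, among the non-backup subflows with free send space, the one with the lowest queued-bytes/pacing-rate ratio, i.e. the one expected to drain its backlog soonest; the 32-bit shift keeps precision in integer math. A standalone model with invented numbers:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
    	/* two hypothetical subflows: bytes queued and pacing rate (B/s) */
    	struct { const char *name; uint64_t wmem; uint64_t pace; } sf[2] = {
    		{ "subflow A", 64 * 1024, 125000000 },	/* ~1 Gb/s pacing */
    		{ "subflow B", 16 * 1024,  12500000 },	/* ~100 Mb/s pacing */
    	};
    	uint64_t best = UINT64_MAX;
    	int i, pick = 0;

    	for (i = 0; i < 2; i++) {
    		/* same fixed-point math as mptcp_subflow_get_send() */
    		uint64_t ratio = (sf[i].wmem << 32) / sf[i].pace;

    		printf("%s: ratio=%llu\n", sf[i].name,
    		       (unsigned long long)ratio);
    		if (ratio < best) {
    			best = ratio;
    			pick = i;
    		}
    	}
    	printf("picked %s\n", sf[pick].name);
    	return 0;
    }

Here subflow A wins despite the larger backlog, because its pacing rate clears it faster. Once picked, a subflow keeps transmitting for up to MPTCP_SEND_BURST_SIZE bytes, which amortizes the selection cost.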
@@ -864,6 +1147,7 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
struct page_frag *pfrag;
size_t copied = 0;
struct sock *ssk;
+ u32 sndbuf;
bool tx_ok;
long timeo;
@@ -890,7 +1174,7 @@ restart:
}
__mptcp_flush_join_list(msk);
- ssk = mptcp_subflow_get_send(msk);
+ ssk = mptcp_subflow_get_send(msk, &sndbuf);
while (!sk_stream_memory_free(sk) ||
!ssk ||
!mptcp_page_frag_refill(ssk, pfrag)) {
@@ -907,19 +1191,25 @@ restart:
mptcp_reset_timer(sk);
}
+ mptcp_nospace(msk);
ret = sk_stream_wait_memory(sk, &timeo);
if (ret)
goto out;
mptcp_clean_una(sk);
- ssk = mptcp_subflow_get_send(msk);
+ ssk = mptcp_subflow_get_send(msk, &sndbuf);
if (list_empty(&msk->conn_list)) {
ret = -ENOTCONN;
goto out;
}
}
+ /* do auto tuning */
+ if (!(sk->sk_userlocks & SOCK_SNDBUF_LOCK) &&
+ sndbuf > READ_ONCE(sk->sk_sndbuf))
+ WRITE_ONCE(sk->sk_sndbuf, sndbuf);
+
pr_debug("conn_list->subflow=%p", ssk);
lock_sock(ssk);
@@ -936,6 +1226,10 @@ restart:
break;
}
+ /* burst can be negative, we will try to move to the next subflow
+ * at selection time, if possible.
+ */
+ msk->snd_burst -= ret;
copied += ret;
tx_ok = msg_data_left(msg);
@@ -945,7 +1239,6 @@ restart:
if (!sk_stream_memory_free(ssk) ||
!mptcp_page_frag_refill(ssk, pfrag) ||
!mptcp_ext_cache_refill(msk)) {
- set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
tcp_push(ssk, msg->msg_flags, mss_now,
tcp_sk(ssk)->nonagle, size_goal);
mptcp_set_timeout(sk, ssk);
@@ -993,9 +1286,9 @@ restart:
mptcp_reset_timer(sk);
}
- ssk_check_wmem(msk, ssk);
release_sock(ssk);
out:
+ ssk_check_wmem(msk);
release_sock(sk);
return copied ? : ret;
}
@@ -1133,10 +1426,14 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied)
*/
mptcp_for_each_subflow(msk, subflow) {
struct sock *ssk;
+ bool slow;
ssk = mptcp_subflow_tcp_sock(subflow);
+ slow = lock_sock_fast(ssk);
WRITE_ONCE(ssk->sk_rcvbuf, rcvbuf);
tcp_sk(ssk)->window_clamp = window_clamp;
+ tcp_cleanup_rbuf(ssk, 1);
+ unlock_sock_fast(ssk, slow);
}
}
}
@@ -1152,6 +1449,11 @@ static bool __mptcp_move_skbs(struct mptcp_sock *msk)
unsigned int moved = 0;
bool done;
+ /* avoid looping forever below on racing close */
+ if (((struct sock *)msk)->sk_state == TCP_CLOSE)
+ return false;
+
+ __mptcp_flush_join_list(msk);
do {
struct sock *ssk = mptcp_subflow_recv_lookup(msk);
@@ -1163,7 +1465,11 @@ static bool __mptcp_move_skbs(struct mptcp_sock *msk)
release_sock(ssk);
} while (!done);
- return moved > 0;
+ if (mptcp_ofo_queue(msk) || moved > 0) {
+ mptcp_check_data_fin((struct sock *)msk);
+ return true;
+ }
+ return false;
}
static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
@@ -1259,6 +1565,9 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
set_bit(MPTCP_DATA_READY, &msk->flags);
}
out_err:
+ pr_debug("msk=%p data_ready=%d rx queue empty=%d copied=%d",
+ msk, test_bit(MPTCP_DATA_READY, &msk->flags),
+ skb_queue_empty(&sk->sk_receive_queue), copied);
mptcp_rcv_space_adjust(msk, copied);
release_sock(sk);
@@ -1309,9 +1618,15 @@ static struct sock *mptcp_subflow_get_retrans(const struct mptcp_sock *msk)
sock_owned_by_me((const struct sock *)msk);
+ if (__mptcp_check_fallback(msk))
+ return msk->first;
+
mptcp_for_each_subflow(msk, subflow) {
struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+ if (!mptcp_subflow_active(subflow))
+ continue;
+
/* still data outstanding at TCP level? Don't retransmit. */
if (!tcp_write_queue_empty(ssk))
return NULL;
@@ -1472,6 +1787,7 @@ static int __mptcp_init_sock(struct sock *sk)
INIT_LIST_HEAD(&msk->rtx_queue);
__set_bit(MPTCP_SEND_SPACE, &msk->flags);
INIT_WORK(&msk->work, mptcp_worker);
+ msk->out_of_order_queue = RB_ROOT;
msk->first = NULL;
inet_csk(sk)->icsk_sync_mss = mptcp_sync_mss;
@@ -1505,7 +1821,7 @@ static int mptcp_init_sock(struct sock *sk)
sk_sockets_allocated_inc(sk);
sk->sk_rcvbuf = sock_net(sk)->ipv4.sysctl_tcp_rmem[1];
- sk->sk_sndbuf = sock_net(sk)->ipv4.sysctl_tcp_wmem[2];
+ sk->sk_sndbuf = sock_net(sk)->ipv4.sysctl_tcp_wmem[1];
return 0;
}
@@ -1811,6 +2127,7 @@ static void mptcp_destroy(struct sock *sk)
{
struct mptcp_sock *msk = mptcp_sk(sk);
+ skb_rbtree_purge(&msk->out_of_order_queue);
mptcp_token_destroy(msk);
if (msk->cached_ext)
__skb_ext_put(msk->cached_ext);
@@ -2286,13 +2603,13 @@ static __poll_t mptcp_poll(struct file *file, struct socket *sock,
sock_poll_wait(file, sock, wait);
state = inet_sk_state_load(sk);
+ pr_debug("msk=%p state=%d flags=%lx", msk, state, msk->flags);
if (state == TCP_LISTEN)
return mptcp_check_readable(msk);
if (state != TCP_SYN_SENT && state != TCP_SYN_RECV) {
mask |= mptcp_check_readable(msk);
- if (sk_stream_is_writeable(sk) &&
- test_bit(MPTCP_SEND_SPACE, &msk->flags))
+ if (test_bit(MPTCP_SEND_SPACE, &msk->flags))
mask |= EPOLLOUT | EPOLLWRNORM;
}
if (sk->sk_shutdown & RCV_SHUTDOWN)
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index 60b27d44c184..493bd2c13bc6 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -140,6 +140,8 @@ struct mptcp_addr_info {
sa_family_t family;
__be16 port;
u8 id;
+ u8 flags;
+ int ifindex;
union {
struct in_addr addr;
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
@@ -194,6 +196,8 @@ struct mptcp_sock {
u64 write_seq;
u64 ack_seq;
u64 rcv_data_fin_seq;
+ struct sock *last_snd;
+ int snd_burst;
atomic64_t snd_una;
unsigned long timer_ival;
u32 token;
@@ -204,6 +208,8 @@ struct mptcp_sock {
bool snd_data_fin_enable;
spinlock_t join_list_lock;
struct work_struct work;
+ struct sk_buff *ooo_last_skb;
+ struct rb_root out_of_order_queue;
struct list_head conn_list;
struct list_head rtx_queue;
struct list_head join_list;
@@ -268,6 +274,12 @@ mptcp_subflow_rsk(const struct request_sock *rsk)
return (struct mptcp_subflow_request_sock *)rsk;
}
+enum mptcp_data_avail {
+ MPTCP_SUBFLOW_NODATA,
+ MPTCP_SUBFLOW_DATA_AVAIL,
+ MPTCP_SUBFLOW_OOO_DATA
+};
+
/* MPTCP subflow context */
struct mptcp_subflow_context {
struct list_head node;/* conn_list of subflows */
@@ -292,10 +304,10 @@ struct mptcp_subflow_context {
map_valid : 1,
mpc_map : 1,
backup : 1,
- data_avail : 1,
rx_eof : 1,
use_64bit_ack : 1, /* Set when we received a 64-bit DSN */
can_ack : 1; /* only after processing the remote key */
+ enum mptcp_data_avail data_avail;
u32 remote_nonce;
u64 thmac;
u32 local_nonce;
@@ -350,8 +362,7 @@ bool mptcp_subflow_data_available(struct sock *sk);
void __init mptcp_subflow_init(void);
/* called with sk socket lock held */
-int __mptcp_subflow_connect(struct sock *sk, int ifindex,
- const struct mptcp_addr_info *loc,
+int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_addr_info *loc,
const struct mptcp_addr_info *remote);
int mptcp_subflow_create_socket(struct sock *sk, struct socket **new_sock);
@@ -464,12 +475,12 @@ static inline bool before64(__u64 seq1, __u64 seq2)
void mptcp_diag_subflow_init(struct tcp_ulp_ops *ops);
-static inline bool __mptcp_check_fallback(struct mptcp_sock *msk)
+static inline bool __mptcp_check_fallback(const struct mptcp_sock *msk)
{
return test_bit(MPTCP_FALLBACK_DONE, &msk->flags);
}
-static inline bool mptcp_check_fallback(struct sock *sk)
+static inline bool mptcp_check_fallback(const struct sock *sk)
{
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
struct mptcp_sock *msk = mptcp_sk(subflow->conn);
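
data_avail is now a tri-state rather than a single bit (see the enum added above), letting callers tell "wake the reader" apart from "data needs reordering first". A sketch of the intended interpretation (example_should_wake() is illustrative):

    #include "protocol.h"

    /* only in-sequence data makes the MPTCP socket readable;
     * MPTCP_SUBFLOW_OOO_DATA still triggers moving skbs into the msk
     * out_of_order_queue, but must not wake the reader
     */
    static bool example_should_wake(const struct mptcp_subflow_context *subflow)
    {
    	return subflow->data_avail == MPTCP_SUBFLOW_DATA_AVAIL;
    }

This is the check mptcp_data_ready() performs before setting MPTCP_DATA_READY.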
diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c
index e8cac2655c82..a2ae3087e24d 100644
--- a/net/mptcp/subflow.c
+++ b/net/mptcp/subflow.c
@@ -20,6 +20,7 @@
#include <net/ip6_route.h>
#endif
#include <net/mptcp.h>
+#include <uapi/linux/mptcp.h>
#include "protocol.h"
#include "mib.h"
@@ -434,6 +435,7 @@ static void mptcp_sock_destruct(struct sock *sk)
sock_orphan(sk);
}
+ skb_rbtree_purge(&mptcp_sk(sk)->out_of_order_queue);
mptcp_token_destroy(mptcp_sk(sk));
inet_sock_destruct(sk);
}
@@ -804,16 +806,25 @@ validate_seq:
return MAPPING_OK;
}
-static int subflow_read_actor(read_descriptor_t *desc,
- struct sk_buff *skb,
- unsigned int offset, size_t len)
+static void mptcp_subflow_discard_data(struct sock *ssk, struct sk_buff *skb,
+ unsigned int limit)
{
- size_t copy_len = min(desc->count, len);
-
- desc->count -= copy_len;
-
- pr_debug("flushed %zu bytes, %zu left", copy_len, desc->count);
- return copy_len;
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
+ bool fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN;
+ u32 incr;
+
+ incr = limit >= skb->len ? skb->len + fin : limit;
+
+ pr_debug("discarding=%d len=%d seq=%d", incr, skb->len,
+ subflow->map_subflow_seq);
+ MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DUPDATA);
+ tcp_sk(ssk)->copied_seq += incr;
+ if (!before(tcp_sk(ssk)->copied_seq, TCP_SKB_CB(skb)->end_seq))
+ sk_eat_skb(ssk, skb);
+ if (mptcp_subflow_get_map_offset(subflow) >= subflow->map_data_len)
+ subflow->map_valid = 0;
+ if (incr)
+ tcp_cleanup_rbuf(ssk, incr);
}
static bool subflow_check_data_avail(struct sock *ssk)
@@ -825,13 +836,13 @@ static bool subflow_check_data_avail(struct sock *ssk)
pr_debug("msk=%p ssk=%p data_avail=%d skb=%p", subflow->conn, ssk,
subflow->data_avail, skb_peek(&ssk->sk_receive_queue));
+ if (!skb_peek(&ssk->sk_receive_queue))
+ subflow->data_avail = 0;
if (subflow->data_avail)
return true;
msk = mptcp_sk(subflow->conn);
for (;;) {
- u32 map_remaining;
- size_t delta;
u64 ack_seq;
u64 old_ack;
@@ -849,6 +860,7 @@ static bool subflow_check_data_avail(struct sock *ssk)
subflow->map_data_len = skb->len;
subflow->map_subflow_seq = tcp_sk(ssk)->copied_seq -
subflow->ssn_offset;
+ subflow->data_avail = MPTCP_SUBFLOW_DATA_AVAIL;
return true;
}
@@ -876,42 +888,18 @@ static bool subflow_check_data_avail(struct sock *ssk)
ack_seq = mptcp_subflow_get_mapped_dsn(subflow);
pr_debug("msk ack_seq=%llx subflow ack_seq=%llx", old_ack,
ack_seq);
- if (ack_seq == old_ack)
+ if (ack_seq == old_ack) {
+ subflow->data_avail = MPTCP_SUBFLOW_DATA_AVAIL;
break;
+ } else if (after64(ack_seq, old_ack)) {
+ subflow->data_avail = MPTCP_SUBFLOW_OOO_DATA;
+ break;
+ }
/* only accept in-sequence mapping. Old values are spurious
- * retransmission; we can hit "future" values on active backup
- * subflow switch, we relay on retransmissions to get
- * in-sequence data.
- * Cuncurrent subflows support will require subflow data
- * reordering
+ * retransmission
*/
- map_remaining = subflow->map_data_len -
- mptcp_subflow_get_map_offset(subflow);
- if (before64(ack_seq, old_ack))
- delta = min_t(size_t, old_ack - ack_seq, map_remaining);
- else
- delta = min_t(size_t, ack_seq - old_ack, map_remaining);
-
- /* discard mapped data */
- pr_debug("discarding %zu bytes, current map len=%d", delta,
- map_remaining);
- if (delta) {
- read_descriptor_t desc = {
- .count = delta,
- };
- int ret;
-
- ret = tcp_read_sock(ssk, &desc, subflow_read_actor);
- if (ret < 0) {
- ssk->sk_err = -ret;
- goto fatal;
- }
- if (ret < delta)
- return false;
- if (delta == map_remaining)
- subflow->map_valid = 0;
- }
+ mptcp_subflow_discard_data(ssk, skb, old_ack - ack_seq);
}
return true;
@@ -922,13 +910,13 @@ fatal:
ssk->sk_error_report(ssk);
tcp_set_state(ssk, TCP_CLOSE);
tcp_send_active_reset(ssk, GFP_ATOMIC);
+ subflow->data_avail = 0;
return false;
}
bool mptcp_subflow_data_available(struct sock *sk)
{
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
- struct sk_buff *skb;
/* check if current mapping is still valid */
if (subflow->map_valid &&
@@ -941,15 +929,7 @@ bool mptcp_subflow_data_available(struct sock *sk)
subflow->map_data_len);
}
- if (!subflow_check_data_avail(sk)) {
- subflow->data_avail = 0;
- return false;
- }
-
- skb = skb_peek(&sk->sk_receive_queue);
- subflow->data_avail = skb &&
- before(tcp_sk(sk)->copied_seq, TCP_SKB_CB(skb)->end_seq);
- return subflow->data_avail;
+ return subflow_check_data_avail(sk);
}
/* If ssk has an mptcp parent socket, use the mptcp rcvbuf occupancy,
@@ -996,8 +976,10 @@ static void subflow_write_space(struct sock *sk)
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
struct sock *parent = subflow->conn;
- sk_stream_write_space(sk);
- if (sk_stream_is_writeable(sk)) {
+ if (!sk_stream_is_writeable(sk))
+ return;
+
+ if (sk_stream_is_writeable(parent)) {
set_bit(MPTCP_SEND_SPACE, &mptcp_sk(parent)->flags);
smp_mb__after_atomic();
/* set SEND_SPACE before sk_stream_write_space clears NOSPACE */
@@ -1056,8 +1038,7 @@ static void mptcp_info2sockaddr(const struct mptcp_addr_info *info,
#endif
}
-int __mptcp_subflow_connect(struct sock *sk, int ifindex,
- const struct mptcp_addr_info *loc,
+int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_addr_info *loc,
const struct mptcp_addr_info *remote)
{
struct mptcp_sock *msk = mptcp_sk(sk);
@@ -1101,7 +1082,7 @@ int __mptcp_subflow_connect(struct sock *sk, int ifindex,
if (loc->family == AF_INET6)
addrlen = sizeof(struct sockaddr_in6);
#endif
- ssk->sk_bound_dev_if = ifindex;
+ ssk->sk_bound_dev_if = loc->ifindex;
err = kernel_bind(sf, (struct sockaddr *)&addr, addrlen);
if (err)
goto failed;
@@ -1112,7 +1093,7 @@ int __mptcp_subflow_connect(struct sock *sk, int ifindex,
subflow->remote_token = remote_token;
subflow->local_id = local_id;
subflow->request_join = 1;
- subflow->request_bkup = 1;
+ subflow->request_bkup = !!(loc->flags & MPTCP_PM_ADDR_FLAG_BACKUP);
mptcp_info2sockaddr(remote, &addr);
err = kernel_connect(sf, (struct sockaddr *)&addr, addrlen, O_NONBLOCK);
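
mptcp_subflow_discard_data() replaces the tcp_read_sock()-based flushing: it advances copied_seq directly and eats the skb only once fully consumed, counting the FIN's sequence slot when the whole skb goes. A standalone model of the increment computation (values invented):

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
    	/* hypothetical skb at the head of the subflow receive queue */
    	uint32_t skb_len = 1460;
    	bool fin = false;
    	uint32_t limit = 500;	/* old_ack - ack_seq: the duplicate span */
    	uint32_t copied_seq = 10000;
    	uint32_t end_seq = copied_seq + skb_len;

    	/* same computation as mptcp_subflow_discard_data() */
    	uint32_t incr = limit >= skb_len ? skb_len + fin : limit;

    	copied_seq += incr;
    	printf("discarded %u bytes, copied_seq=%u (%s)\n", incr, copied_seq,
    	       copied_seq >= end_seq ? "skb fully consumed"
    				     : "skb partially consumed");
    	return 0;
    }

(The kernel uses before() for the wrap-safe comparison; a plain >= is enough with these invented values.)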
diff --git a/tools/testing/selftests/net/mptcp/Makefile b/tools/testing/selftests/net/mptcp/Makefile
index aa254aefc2c3..00bb158b4a5d 100644
--- a/tools/testing/selftests/net/mptcp/Makefile
+++ b/tools/testing/selftests/net/mptcp/Makefile
@@ -5,7 +5,8 @@ KSFT_KHDR_INSTALL := 1
CFLAGS = -Wall -Wl,--no-as-needed -O2 -g -I$(top_srcdir)/usr/include
-TEST_PROGS := mptcp_connect.sh pm_netlink.sh mptcp_join.sh diag.sh
+TEST_PROGS := mptcp_connect.sh pm_netlink.sh mptcp_join.sh diag.sh \
+ simult_flows.sh
TEST_GEN_FILES = mptcp_connect pm_nl_ctl
diff --git a/tools/testing/selftests/net/mptcp/simult_flows.sh b/tools/testing/selftests/net/mptcp/simult_flows.sh
new file mode 100755
index 000000000000..0d88225daa02
--- /dev/null
+++ b/tools/testing/selftests/net/mptcp/simult_flows.sh
@@ -0,0 +1,293 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+sec=$(date +%s)
+rndh=$(printf %x $sec)-$(mktemp -u XXXXXX)
+ns1="ns1-$rndh"
+ns2="ns2-$rndh"
+ns3="ns3-$rndh"
+capture=false
+ksft_skip=4
+timeout=30
+test_cnt=1
+ret=0
+bail=0
+
+usage() {
+ echo "Usage: $0 [ -b ] [ -c ] [ -d ]"
+ echo -e "\t-b: bail out after first error, otherwise runs al testcases"
+ echo -e "\t-c: capture packets for each test using tcpdump (default: no capture)"
+ echo -e "\t-d: debug this script"
+}
+
+cleanup()
+{
+ rm -f "$cin" "$cout"
+ rm -f "$sin" "$sout"
+ rm -f "$capout"
+
+ local netns
+ for netns in "$ns1" "$ns2" "$ns3";do
+ ip netns del $netns
+ done
+}
+
+ip -Version > /dev/null 2>&1
+if [ $? -ne 0 ];then
+ echo "SKIP: Could not run test without ip tool"
+ exit $ksft_skip
+fi
+
+# "$ns1" ns2 ns3
+# ns1eth1 ns2eth1 ns2eth3 ns3eth1
+# netem
+# ns1eth2 ns2eth2
+# netem
+
+setup()
+{
+ large=$(mktemp)
+ small=$(mktemp)
+ sout=$(mktemp)
+ cout=$(mktemp)
+ capout=$(mktemp)
+ size=$((2048 * 4096))
+ dd if=/dev/zero of=$small bs=4096 count=20 >/dev/null 2>&1
+ dd if=/dev/zero of=$large bs=4096 count=$((size / 4096)) >/dev/null 2>&1
+
+ trap cleanup EXIT
+
+ for i in "$ns1" "$ns2" "$ns3";do
+ ip netns add $i || exit $ksft_skip
+ ip -net $i link set lo up
+ done
+
+ ip link add ns1eth1 netns "$ns1" type veth peer name ns2eth1 netns "$ns2"
+ ip link add ns1eth2 netns "$ns1" type veth peer name ns2eth2 netns "$ns2"
+ ip link add ns2eth3 netns "$ns2" type veth peer name ns3eth1 netns "$ns3"
+
+ ip -net "$ns1" addr add 10.0.1.1/24 dev ns1eth1
+ ip -net "$ns1" addr add dead:beef:1::1/64 dev ns1eth1 nodad
+ ip -net "$ns1" link set ns1eth1 up mtu 1500
+ ip -net "$ns1" route add default via 10.0.1.2
+ ip -net "$ns1" route add default via dead:beef:1::2
+
+ ip -net "$ns1" addr add 10.0.2.1/24 dev ns1eth2
+ ip -net "$ns1" addr add dead:beef:2::1/64 dev ns1eth2 nodad
+ ip -net "$ns1" link set ns1eth2 up mtu 1500
+ ip -net "$ns1" route add default via 10.0.2.2 metric 101
+ ip -net "$ns1" route add default via dead:beef:2::2 metric 101
+
+ ip netns exec "$ns1" ./pm_nl_ctl limits 1 1
+ ip netns exec "$ns1" ./pm_nl_ctl add 10.0.2.1 dev ns1eth2 flags subflow
+ ip netns exec "$ns1" sysctl -q net.ipv4.conf.all.rp_filter=0
+
+ ip -net "$ns2" addr add 10.0.1.2/24 dev ns2eth1
+ ip -net "$ns2" addr add dead:beef:1::2/64 dev ns2eth1 nodad
+ ip -net "$ns2" link set ns2eth1 up mtu 1500
+
+ ip -net "$ns2" addr add 10.0.2.2/24 dev ns2eth2
+ ip -net "$ns2" addr add dead:beef:2::2/64 dev ns2eth2 nodad
+ ip -net "$ns2" link set ns2eth2 up mtu 1500
+
+ ip -net "$ns2" addr add 10.0.3.2/24 dev ns2eth3
+ ip -net "$ns2" addr add dead:beef:3::2/64 dev ns2eth3 nodad
+ ip -net "$ns2" link set ns2eth3 up mtu 1500
+ ip netns exec "$ns2" sysctl -q net.ipv4.ip_forward=1
+ ip netns exec "$ns2" sysctl -q net.ipv6.conf.all.forwarding=1
+
+ ip -net "$ns3" addr add 10.0.3.3/24 dev ns3eth1
+ ip -net "$ns3" addr add dead:beef:3::3/64 dev ns3eth1 nodad
+ ip -net "$ns3" link set ns3eth1 up mtu 1500
+ ip -net "$ns3" route add default via 10.0.3.2
+ ip -net "$ns3" route add default via dead:beef:3::2
+
+ ip netns exec "$ns3" ./pm_nl_ctl limits 1 1
+}
+
+# $1: ns, $2: port
+wait_local_port_listen()
+{
+ local listener_ns="${1}"
+ local port="${2}"
+
+ local port_hex i
+
+ port_hex="$(printf "%04X" "${port}")"
+ for i in $(seq 10); do
+ ip netns exec "${listener_ns}" cat /proc/net/tcp* | \
+ awk "BEGIN {rc=1} {if (\$2 ~ /:${port_hex}\$/ && \$4 ~ /0A/) {rc=0; exit}} END {exit rc}" &&
+ break
+ sleep 0.1
+ done
+}
+
+do_transfer()
+{
+ local cin=$1
+ local sin=$2
+ local max_time=$3
+ local port
+ port=$((10000+$test_cnt))
+ test_cnt=$((test_cnt+1))
+
+ :> "$cout"
+ :> "$sout"
+ :> "$capout"
+
+ local addr_port
+ addr_port=$(printf "%s:%d" ${connect_addr} ${port})
+
+ if $capture; then
+ local capuser
+ if [ -z $SUDO_USER ] ; then
+ capuser=""
+ else
+ capuser="-Z $SUDO_USER"
+ fi
+
+ local capfile="${rndh}-${port}"
+ local capopt="-i any -s 65535 -B 32768 ${capuser}"
+
+ ip netns exec ${ns3} tcpdump ${capopt} -w "${capfile}-listener.pcap" >> "${capout}" 2>&1 &
+ local cappid_listener=$!
+
+ ip netns exec ${ns1} tcpdump ${capopt} -w "${capfile}-connector.pcap" >> "${capout}" 2>&1 &
+ local cappid_connector=$!
+
+ sleep 1
+ fi
+
+ ip netns exec ${ns3} ./mptcp_connect -jt $timeout -l -p $port 0.0.0.0 < "$sin" > "$sout" &
+ local spid=$!
+
+ wait_local_port_listen "${ns3}" "${port}"
+
+ local start
+ start=$(date +%s%3N)
+ ip netns exec ${ns1} ./mptcp_connect -jt $timeout -p $port 10.0.3.3 < "$cin" > "$cout" &
+ local cpid=$!
+
+ wait $cpid
+ local retc=$?
+ wait $spid
+ local rets=$?
+
+ local stop
+ stop=$(date +%s%3N)
+
+ if $capture; then
+ sleep 1
+ kill ${cappid_listener}
+ kill ${cappid_connector}
+ fi
+
+ local duration
+ duration=$((stop-start))
+
+ cmp $sin $cout > /dev/null 2>&1
+ local cmps=$?
+ cmp $cin $sout > /dev/null 2>&1
+ local cmpc=$?
+
+ printf "%16s" "$duration max $max_time "
+ if [ $retc -eq 0 ] && [ $rets -eq 0 ] && \
+ [ $cmpc -eq 0 ] && [ $cmps -eq 0 ] && \
+ [ $duration -lt $max_time ]; then
+ echo "[ OK ]"
+ cat "$capout"
+ return 0
+ fi
+
+ echo " [ fail ]"
+ echo "client exit code $retc, server $rets" 1>&2
+ echo "\nnetns ${ns3} socket stat for $port:" 1>&2
+ ip netns exec ${ns3} ss -nita 1>&2 -o "sport = :$port"
+ echo "\nnetns ${ns1} socket stat for $port:" 1>&2
+ ip netns exec ${ns1} ss -nita 1>&2 -o "dport = :$port"
+ ls -l $sin $cout
+ ls -l $cin $sout
+
+ cat "$capout"
+ return 1
+}
+
+run_test()
+{
+ local rate1=$1
+ local rate2=$2
+ local delay1=$3
+ local delay2=$4
+ local lret
+ local dev
+ shift 4
+ local msg=$*
+
+ [ $delay1 -gt 0 ] && delay1="delay $delay1" || delay1=""
+ [ $delay2 -gt 0 ] && delay2="delay $delay2" || delay2=""
+
+ for dev in ns1eth1 ns1eth2; do
+ tc -n $ns1 qdisc del dev $dev root >/dev/null 2>&1
+ done
+ for dev in ns2eth1 ns2eth2; do
+ tc -n $ns2 qdisc del dev $dev root >/dev/null 2>&1
+ done
+ tc -n $ns1 qdisc add dev ns1eth1 root netem rate ${rate1}mbit $delay1
+ tc -n $ns1 qdisc add dev ns1eth2 root netem rate ${rate2}mbit $delay2
+ tc -n $ns2 qdisc add dev ns2eth1 root netem rate ${rate1}mbit $delay1
+ tc -n $ns2 qdisc add dev ns2eth2 root netem rate ${rate2}mbit $delay2
+
+ # time is measured in ms
+ local time=$((size * 8 * 1000 / ((rate1 + rate2) * 1024 * 1024)))
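+ # e.g. the default 8 MiB transfer over two 10mbit links:
+ # 8388608 * 8 * 1000 / (20 * 1024 * 1024) = 3200ms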
+
+ # mptcp_connect will do some sleeps to allow the mp_join handshake
+ # completion
+ time=$((time + 1350))
+
+ printf "%-50s" "$msg"
+ do_transfer $small $large $((time * 11 / 10))
+ lret=$?
+ if [ $lret -ne 0 ]; then
+ ret=$lret
+ [ $bail -eq 0 ] || exit $ret
+ fi
+
+ printf "%-50s" "$msg - reverse direction"
+ do_transfer $large $small $((time * 11 / 10))
+ lret=$?
+ if [ $lret -ne 0 ]; then
+ ret=$lret
+ [ $bail -eq 0 ] || exit $ret
+ fi
+}
+
+while getopts "bcdh" option;do
+ case "$option" in
+ "h")
+ usage $0
+ exit 0
+ ;;
+ "b")
+ bail=1
+ ;;
+ "c")
+ capture=true
+ ;;
+ "d")
+ set -x
+ ;;
+ "?")
+ usage $0
+ exit 1
+ ;;
+ esac
+done
+
+setup
+run_test 10 10 0 0 "balanced bwidth"
+run_test 10 10 1 50 "balanced bwidth with unbalanced delay"
+
+# we still need some additional infrastructure to pass the following test-cases
+# run_test 30 10 0 0 "unbalanced bwidth"
+# run_test 30 10 1 50 "unbalanced bwidth with unbalanced delay"
+# run_test 30 10 50 1 "unbalanced bwidth with opposed, unbalanced delay"
+exit $ret