From 0eb71a9da5796851fa87ddc1a534066c0fe54055 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 18 Jun 2018 12:52:50 +1000 Subject: rhashtable: split rhashtable.h Due to the use of rhashtables in net namespaces, rhashtable.h is included in lots of the kernel, so a small changes can required a large recompilation. This makes development painful. This patch splits out rhashtable-types.h which just includes the major type declarations, and does not include (non-trivial) inline code. rhashtable.h is no longer included by anything in the include/ directory. Common include files only include rhashtable-types.h so a large recompilation is only triggered when that changes. Acked-by: Herbert Xu Signed-off-by: NeilBrown Signed-off-by: David S. Miller --- include/net/inet_frag.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net/inet_frag.h') diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h index ed07e3786d98..f4272a29dc44 100644 --- a/include/net/inet_frag.h +++ b/include/net/inet_frag.h @@ -2,7 +2,7 @@ #ifndef __NET_FRAG_H__ #define __NET_FRAG_H__ -#include +#include struct netns_frags { /* sysctls */ -- cgit v1.2.3 From fa0f527358bd900ef92f925878ed6bfbd51305cc Mon Sep 17 00:00:00 2001 From: Peter Oskolkov Date: Thu, 2 Aug 2018 23:34:39 +0000 Subject: ip: use rb trees for IP frag queue. Similar to TCP OOO RX queue, it makes sense to use rb trees to store IP fragments, so that OOO fragments are inserted faster. Tested: - a follow-up patch contains a rather comprehensive ip defrag self-test (functional) - ran neper `udp_stream -c -H -F 100 -l 300 -T 20`: netstat --statistics Ip: 282078937 total packets received 0 forwarded 0 incoming packets discarded 946760 incoming packets delivered 18743456 requests sent out 101 fragments dropped after timeout 282077129 reassemblies required 944952 packets reassembled ok 262734239 packet reassembles failed (The numbers/stats above are somewhat better re: reassemblies vs a kernel without this patchset. More comprehensive performance testing TBD). Reported-by: Jann Horn Reported-by: Juha-Matti Tilli Suggested-by: Eric Dumazet Signed-off-by: Peter Oskolkov Signed-off-by: Eric Dumazet Cc: Florian Westphal Signed-off-by: David S. Miller --- include/linux/skbuff.h | 9 +- include/net/inet_frag.h | 3 +- net/ipv4/inet_fragment.c | 16 +-- net/ipv4/ip_fragment.c | 182 ++++++++++++++++++-------------- net/ipv6/netfilter/nf_conntrack_reasm.c | 1 + net/ipv6/reassembly.c | 1 + 6 files changed, 121 insertions(+), 91 deletions(-) (limited to 'include/net/inet_frag.h') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 47848367c816..7ebdf158a795 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -676,13 +676,16 @@ struct sk_buff { * UDP receive path is one user. */ unsigned long dev_scratch; - int ip_defrag_offset; }; }; - struct rb_node rbnode; /* used in netem & tcp stack */ + struct rb_node rbnode; /* used in netem, ip4 defrag, and tcp stack */ struct list_head list; }; - struct sock *sk; + + union { + struct sock *sk; + int ip_defrag_offset; + }; union { ktime_t tstamp; diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h index f4272a29dc44..b86d14528188 100644 --- a/include/net/inet_frag.h +++ b/include/net/inet_frag.h @@ -75,7 +75,8 @@ struct inet_frag_queue { struct timer_list timer; spinlock_t lock; refcount_t refcnt; - struct sk_buff *fragments; + struct sk_buff *fragments; /* Used in IPv6. */ + struct rb_root rb_fragments; /* Used in IPv4. */ struct sk_buff *fragments_tail; ktime_t stamp; int len; diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index ccd140e4082d..6d258a5669e7 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c @@ -137,12 +137,16 @@ void inet_frag_destroy(struct inet_frag_queue *q) fp = q->fragments; nf = q->net; f = nf->f; - while (fp) { - struct sk_buff *xp = fp->next; - - sum_truesize += fp->truesize; - kfree_skb(fp); - fp = xp; + if (fp) { + do { + struct sk_buff *xp = fp->next; + + sum_truesize += fp->truesize; + kfree_skb(fp); + fp = xp; + } while (fp); + } else { + sum_truesize = skb_rbtree_purge(&q->rb_fragments); } sum = sum_truesize + f->qsize; diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 960bf5eab59f..0e8f8de77e71 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -136,7 +136,7 @@ static void ip_expire(struct timer_list *t) { struct inet_frag_queue *frag = from_timer(frag, t, timer); const struct iphdr *iph; - struct sk_buff *head; + struct sk_buff *head = NULL; struct net *net; struct ipq *qp; int err; @@ -152,14 +152,31 @@ static void ip_expire(struct timer_list *t) ipq_kill(qp); __IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS); - - head = qp->q.fragments; - __IP_INC_STATS(net, IPSTATS_MIB_REASMTIMEOUT); - if (!(qp->q.flags & INET_FRAG_FIRST_IN) || !head) + if (!qp->q.flags & INET_FRAG_FIRST_IN) goto out; + /* sk_buff::dev and sk_buff::rbnode are unionized. So we + * pull the head out of the tree in order to be able to + * deal with head->dev. + */ + if (qp->q.fragments) { + head = qp->q.fragments; + qp->q.fragments = head->next; + } else { + head = skb_rb_first(&qp->q.rb_fragments); + if (!head) + goto out; + rb_erase(&head->rbnode, &qp->q.rb_fragments); + memset(&head->rbnode, 0, sizeof(head->rbnode)); + barrier(); + } + if (head == qp->q.fragments_tail) + qp->q.fragments_tail = NULL; + + sub_frag_mem_limit(qp->q.net, head->truesize); + head->dev = dev_get_by_index_rcu(net, qp->iif); if (!head->dev) goto out; @@ -179,16 +196,16 @@ static void ip_expire(struct timer_list *t) (skb_rtable(head)->rt_type != RTN_LOCAL)) goto out; - skb_get(head); spin_unlock(&qp->q.lock); icmp_send(head, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0); - kfree_skb(head); goto out_rcu_unlock; out: spin_unlock(&qp->q.lock); out_rcu_unlock: rcu_read_unlock(); + if (head) + kfree_skb(head); ipq_put(qp); } @@ -231,7 +248,7 @@ static int ip_frag_too_far(struct ipq *qp) end = atomic_inc_return(&peer->rid); qp->rid = end; - rc = qp->q.fragments && (end - start) > max; + rc = qp->q.fragments_tail && (end - start) > max; if (rc) { struct net *net; @@ -245,7 +262,6 @@ static int ip_frag_too_far(struct ipq *qp) static int ip_frag_reinit(struct ipq *qp) { - struct sk_buff *fp; unsigned int sum_truesize = 0; if (!mod_timer(&qp->q.timer, jiffies + qp->q.net->timeout)) { @@ -253,20 +269,14 @@ static int ip_frag_reinit(struct ipq *qp) return -ETIMEDOUT; } - fp = qp->q.fragments; - do { - struct sk_buff *xp = fp->next; - - sum_truesize += fp->truesize; - kfree_skb(fp); - fp = xp; - } while (fp); + sum_truesize = skb_rbtree_purge(&qp->q.rb_fragments); sub_frag_mem_limit(qp->q.net, sum_truesize); qp->q.flags = 0; qp->q.len = 0; qp->q.meat = 0; qp->q.fragments = NULL; + qp->q.rb_fragments = RB_ROOT; qp->q.fragments_tail = NULL; qp->iif = 0; qp->ecn = 0; @@ -278,7 +288,8 @@ static int ip_frag_reinit(struct ipq *qp) static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) { struct net *net = container_of(qp->q.net, struct net, ipv4.frags); - struct sk_buff *prev, *next; + struct rb_node **rbn, *parent; + struct sk_buff *skb1; struct net_device *dev; unsigned int fragsize; int flags, offset; @@ -341,58 +352,58 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) if (err) goto err; - /* Find out which fragments are in front and at the back of us - * in the chain of fragments so far. We must know where to put - * this fragment, right? - */ - prev = qp->q.fragments_tail; - if (!prev || prev->ip_defrag_offset < offset) { - next = NULL; - goto found; - } - prev = NULL; - for (next = qp->q.fragments; next != NULL; next = next->next) { - if (next->ip_defrag_offset >= offset) - break; /* bingo! */ - prev = next; - } + /* Note : skb->rbnode and skb->dev share the same location. */ + dev = skb->dev; + /* Makes sure compiler wont do silly aliasing games */ + barrier(); -found: /* RFC5722, Section 4, amended by Errata ID : 3089 * When reassembling an IPv6 datagram, if * one or more its constituent fragments is determined to be an * overlapping fragment, the entire datagram (and any constituent * fragments) MUST be silently discarded. * - * We do the same here for IPv4. + * We do the same here for IPv4 (and increment an snmp counter). */ - /* Is there an overlap with the previous fragment? */ - if (prev && - (prev->ip_defrag_offset + prev->len) > offset) - goto discard_qp; - - /* Is there an overlap with the next fragment? */ - if (next && next->ip_defrag_offset < end) - goto discard_qp; + /* Find out where to put this fragment. */ + skb1 = qp->q.fragments_tail; + if (!skb1) { + /* This is the first fragment we've received. */ + rb_link_node(&skb->rbnode, NULL, &qp->q.rb_fragments.rb_node); + qp->q.fragments_tail = skb; + } else if ((skb1->ip_defrag_offset + skb1->len) < end) { + /* This is the common/special case: skb goes to the end. */ + /* Detect and discard overlaps. */ + if (offset < (skb1->ip_defrag_offset + skb1->len)) + goto discard_qp; + /* Insert after skb1. */ + rb_link_node(&skb->rbnode, &skb1->rbnode, &skb1->rbnode.rb_right); + qp->q.fragments_tail = skb; + } else { + /* Binary search. Note that skb can become the first fragment, but + * not the last (covered above). */ + rbn = &qp->q.rb_fragments.rb_node; + do { + parent = *rbn; + skb1 = rb_to_skb(parent); + if (end <= skb1->ip_defrag_offset) + rbn = &parent->rb_left; + else if (offset >= skb1->ip_defrag_offset + skb1->len) + rbn = &parent->rb_right; + else /* Found an overlap with skb1. */ + goto discard_qp; + } while (*rbn); + /* Here we have parent properly set, and rbn pointing to + * one of its NULL left/right children. Insert skb. */ + rb_link_node(&skb->rbnode, parent, rbn); + } + rb_insert_color(&skb->rbnode, &qp->q.rb_fragments); - /* Note : skb->ip_defrag_offset and skb->dev share the same location */ - dev = skb->dev; if (dev) qp->iif = dev->ifindex; - /* Makes sure compiler wont do silly aliasing games */ - barrier(); skb->ip_defrag_offset = offset; - /* Insert this fragment in the chain of fragments. */ - skb->next = next; - if (!next) - qp->q.fragments_tail = skb; - if (prev) - prev->next = skb; - else - qp->q.fragments = skb; - qp->q.stamp = skb->tstamp; qp->q.meat += skb->len; qp->ecn |= ecn; @@ -414,7 +425,7 @@ found: unsigned long orefdst = skb->_skb_refdst; skb->_skb_refdst = 0UL; - err = ip_frag_reasm(qp, prev, dev); + err = ip_frag_reasm(qp, skb, dev); skb->_skb_refdst = orefdst; return err; } @@ -431,15 +442,15 @@ err: return err; } - /* Build a new IP datagram from all its fragments. */ - -static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, +static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb, struct net_device *dev) { struct net *net = container_of(qp->q.net, struct net, ipv4.frags); struct iphdr *iph; - struct sk_buff *fp, *head = qp->q.fragments; + struct sk_buff *fp, *head = skb_rb_first(&qp->q.rb_fragments); + struct sk_buff **nextp; /* To build frag_list. */ + struct rb_node *rbn; int len; int ihlen; int err; @@ -453,25 +464,20 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, goto out_fail; } /* Make the one we just received the head. */ - if (prev) { - head = prev->next; - fp = skb_clone(head, GFP_ATOMIC); + if (head != skb) { + fp = skb_clone(skb, GFP_ATOMIC); if (!fp) goto out_nomem; - - fp->next = head->next; - if (!fp->next) + rb_replace_node(&skb->rbnode, &fp->rbnode, &qp->q.rb_fragments); + if (qp->q.fragments_tail == skb) qp->q.fragments_tail = fp; - prev->next = fp; - - skb_morph(head, qp->q.fragments); - head->next = qp->q.fragments->next; - - consume_skb(qp->q.fragments); - qp->q.fragments = head; + skb_morph(skb, head); + rb_replace_node(&head->rbnode, &skb->rbnode, + &qp->q.rb_fragments); + consume_skb(head); + head = skb; } - WARN_ON(!head); WARN_ON(head->ip_defrag_offset != 0); /* Allocate a new buffer for the datagram. */ @@ -496,24 +502,35 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, clone = alloc_skb(0, GFP_ATOMIC); if (!clone) goto out_nomem; - clone->next = head->next; - head->next = clone; skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list; skb_frag_list_init(head); for (i = 0; i < skb_shinfo(head)->nr_frags; i++) plen += skb_frag_size(&skb_shinfo(head)->frags[i]); clone->len = clone->data_len = head->data_len - plen; - head->data_len -= clone->len; - head->len -= clone->len; + skb->truesize += clone->truesize; clone->csum = 0; clone->ip_summed = head->ip_summed; add_frag_mem_limit(qp->q.net, clone->truesize); + skb_shinfo(head)->frag_list = clone; + nextp = &clone->next; + } else { + nextp = &skb_shinfo(head)->frag_list; } - skb_shinfo(head)->frag_list = head->next; skb_push(head, head->data - skb_network_header(head)); - for (fp=head->next; fp; fp = fp->next) { + /* Traverse the tree in order, to build frag_list. */ + rbn = rb_next(&head->rbnode); + rb_erase(&head->rbnode, &qp->q.rb_fragments); + while (rbn) { + struct rb_node *rbnext = rb_next(rbn); + fp = rb_to_skb(rbn); + rb_erase(rbn, &qp->q.rb_fragments); + rbn = rbnext; + *nextp = fp; + nextp = &fp->next; + fp->prev = NULL; + memset(&fp->rbnode, 0, sizeof(fp->rbnode)); head->data_len += fp->len; head->len += fp->len; if (head->ip_summed != fp->ip_summed) @@ -524,7 +541,9 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, } sub_frag_mem_limit(qp->q.net, head->truesize); + *nextp = NULL; head->next = NULL; + head->prev = NULL; head->dev = dev; head->tstamp = qp->q.stamp; IPCB(head)->frag_max_size = max(qp->max_df_size, qp->q.max_size); @@ -552,6 +571,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, __IP_INC_STATS(net, IPSTATS_MIB_REASMOKS); qp->q.fragments = NULL; + qp->q.rb_fragments = RB_ROOT; qp->q.fragments_tail = NULL; return 0; diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index 0610bdab721c..38d69ef516d5 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -463,6 +463,7 @@ nf_ct_frag6_reasm(struct frag_queue *fq, struct sk_buff *prev, struct net_devic head->csum); fq->q.fragments = NULL; + fq->q.rb_fragments = RB_ROOT; fq->q.fragments_tail = NULL; return true; diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index 6edd2ac8ae4b..b4e558ab39fa 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -405,6 +405,7 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev, __IP6_INC_STATS(net, __in6_dev_get(dev), IPSTATS_MIB_REASMOKS); rcu_read_unlock(); fq->q.fragments = NULL; + fq->q.rb_fragments = RB_ROOT; fq->q.fragments_tail = NULL; return 1; -- cgit v1.2.3 From 353c9cb360874e737fb000545f783df756c06f9a Mon Sep 17 00:00:00 2001 From: Peter Oskolkov Date: Sat, 11 Aug 2018 20:27:24 +0000 Subject: ip: add helpers to process in-order fragments faster. This patch introduces several helper functions/macros that will be used in the follow-up patch. No runtime changes yet. The new logic (fully implemented in the second patch) is as follows: * Nodes in the rb-tree will now contain not single fragments, but lists of consecutive fragments ("runs"). * At each point in time, the current "active" run at the tail is maintained/tracked. Fragments that arrive in-order, adjacent to the previous tail fragment, are added to this tail run without triggering the re-balancing of the rb-tree. * If a fragment arrives out of order with the offset _before_ the tail run, it is inserted into the rb-tree as a single fragment. * If a fragment arrives after the current tail fragment (with a gap), it starts a new "tail" run, as is inserted into the rb-tree at the end as the head of the new run. skb->cb is used to store additional information needed here (suggested by Eric Dumazet). Reported-by: Willem de Bruijn Signed-off-by: Peter Oskolkov Cc: Eric Dumazet Cc: Florian Westphal Signed-off-by: David S. Miller --- include/net/inet_frag.h | 6 ++++ net/ipv4/ip_fragment.c | 73 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 79 insertions(+) (limited to 'include/net/inet_frag.h') diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h index b86d14528188..1662cbc0b46b 100644 --- a/include/net/inet_frag.h +++ b/include/net/inet_frag.h @@ -57,7 +57,9 @@ struct frag_v6_compare_key { * @lock: spinlock protecting this frag * @refcnt: reference count of the queue * @fragments: received fragments head + * @rb_fragments: received fragments rb-tree root * @fragments_tail: received fragments tail + * @last_run_head: the head of the last "run". see ip_fragment.c * @stamp: timestamp of the last received fragment * @len: total length of the original datagram * @meat: length of received fragments so far @@ -78,6 +80,7 @@ struct inet_frag_queue { struct sk_buff *fragments; /* Used in IPv6. */ struct rb_root rb_fragments; /* Used in IPv4. */ struct sk_buff *fragments_tail; + struct sk_buff *last_run_head; ktime_t stamp; int len; int meat; @@ -113,6 +116,9 @@ void inet_frag_kill(struct inet_frag_queue *q); void inet_frag_destroy(struct inet_frag_queue *q); struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, void *key); +/* Free all skbs in the queue; return the sum of their truesizes. */ +unsigned int inet_frag_rbtree_purge(struct rb_root *root); + static inline void inet_frag_put(struct inet_frag_queue *q) { if (refcount_dec_and_test(&q->refcnt)) diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 7cb7ed761d8c..26ace9d2d976 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -57,6 +57,57 @@ */ static const char ip_frag_cache_name[] = "ip4-frags"; +/* Use skb->cb to track consecutive/adjacent fragments coming at + * the end of the queue. Nodes in the rb-tree queue will + * contain "runs" of one or more adjacent fragments. + * + * Invariants: + * - next_frag is NULL at the tail of a "run"; + * - the head of a "run" has the sum of all fragment lengths in frag_run_len. + */ +struct ipfrag_skb_cb { + struct inet_skb_parm h; + struct sk_buff *next_frag; + int frag_run_len; +}; + +#define FRAG_CB(skb) ((struct ipfrag_skb_cb *)((skb)->cb)) + +static void ip4_frag_init_run(struct sk_buff *skb) +{ + BUILD_BUG_ON(sizeof(struct ipfrag_skb_cb) > sizeof(skb->cb)); + + FRAG_CB(skb)->next_frag = NULL; + FRAG_CB(skb)->frag_run_len = skb->len; +} + +/* Append skb to the last "run". */ +static void ip4_frag_append_to_last_run(struct inet_frag_queue *q, + struct sk_buff *skb) +{ + RB_CLEAR_NODE(&skb->rbnode); + FRAG_CB(skb)->next_frag = NULL; + + FRAG_CB(q->last_run_head)->frag_run_len += skb->len; + FRAG_CB(q->fragments_tail)->next_frag = skb; + q->fragments_tail = skb; +} + +/* Create a new "run" with the skb. */ +static void ip4_frag_create_run(struct inet_frag_queue *q, struct sk_buff *skb) +{ + if (q->last_run_head) + rb_link_node(&skb->rbnode, &q->last_run_head->rbnode, + &q->last_run_head->rbnode.rb_right); + else + rb_link_node(&skb->rbnode, NULL, &q->rb_fragments.rb_node); + rb_insert_color(&skb->rbnode, &q->rb_fragments); + + ip4_frag_init_run(skb); + q->fragments_tail = skb; + q->last_run_head = skb; +} + /* Describe an entry in the "incomplete datagrams" queue. */ struct ipq { struct inet_frag_queue q; @@ -654,6 +705,28 @@ struct sk_buff *ip_check_defrag(struct net *net, struct sk_buff *skb, u32 user) } EXPORT_SYMBOL(ip_check_defrag); +unsigned int inet_frag_rbtree_purge(struct rb_root *root) +{ + struct rb_node *p = rb_first(root); + unsigned int sum = 0; + + while (p) { + struct sk_buff *skb = rb_entry(p, struct sk_buff, rbnode); + + p = rb_next(p); + rb_erase(&skb->rbnode, root); + while (skb) { + struct sk_buff *next = FRAG_CB(skb)->next_frag; + + sum += skb->truesize; + kfree_skb(skb); + skb = next; + } + } + return sum; +} +EXPORT_SYMBOL(inet_frag_rbtree_purge); + #ifdef CONFIG_SYSCTL static int dist_min; -- cgit v1.2.3