diff options
Diffstat (limited to 'net/core/skbuff.c')
-rw-r--r-- | net/core/skbuff.c | 172 |
1 files changed, 127 insertions, 45 deletions
diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 1e3e0087245b..f86bf69cfb8d 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -72,7 +72,7 @@ #include <net/ip6_checksum.h> #include <net/xfrm.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <trace/events/skb.h> #include <linux/highmem.h> #include <linux/capability.h> @@ -271,7 +271,6 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, atomic_set(&fclones->fclone_ref, 1); fclones->skb2.fclone = SKB_FCLONE_CLONE; - fclones->skb2.pfmemalloc = pfmemalloc; } out: return skb; @@ -354,7 +353,7 @@ EXPORT_SYMBOL(build_skb); struct napi_alloc_cache { struct page_frag_cache page; - size_t skb_count; + unsigned int skb_count; void *skb_cache[NAPI_SKB_CACHE_SIZE]; }; @@ -369,7 +368,7 @@ static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask) local_irq_save(flags); nc = this_cpu_ptr(&netdev_alloc_cache); - data = __alloc_page_frag(nc, fragsz, gfp_mask); + data = page_frag_alloc(nc, fragsz, gfp_mask); local_irq_restore(flags); return data; } @@ -391,7 +390,7 @@ static void *__napi_alloc_frag(unsigned int fragsz, gfp_t gfp_mask) { struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); - return __alloc_page_frag(&nc->page, fragsz, gfp_mask); + return page_frag_alloc(&nc->page, fragsz, gfp_mask); } void *napi_alloc_frag(unsigned int fragsz) @@ -441,7 +440,7 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len, local_irq_save(flags); nc = this_cpu_ptr(&netdev_alloc_cache); - data = __alloc_page_frag(nc, len, gfp_mask); + data = page_frag_alloc(nc, len, gfp_mask); pfmemalloc = nc->pfmemalloc; local_irq_restore(flags); @@ -505,7 +504,7 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len, if (sk_memalloc_socks()) gfp_mask |= __GFP_MEMALLOC; - data = __alloc_page_frag(&nc->page, len, gfp_mask); + data = page_frag_alloc(&nc->page, len, gfp_mask); if (unlikely(!data)) return NULL; @@ -655,7 +654,7 @@ static void skb_release_head_state(struct sk_buff *skb) skb->destructor(skb); } #if IS_ENABLED(CONFIG_NF_CONNTRACK) - nf_conntrack_put(skb->nfct); + nf_conntrack_put(skb_nfct(skb)); #endif #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) nf_bridge_put(skb->nf_bridge); @@ -878,9 +877,6 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old) #endif #ifdef CONFIG_NET_SCHED CHECK_SKB_FIELD(tc_index); -#ifdef CONFIG_NET_CLS_ACT - CHECK_SKB_FIELD(tc_verd); -#endif #endif } @@ -1195,10 +1191,10 @@ EXPORT_SYMBOL(__pskb_copy_fclone); int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, gfp_t gfp_mask) { - int i; - u8 *data; - int size = nhead + skb_end_offset(skb) + ntail; + int i, osize = skb_end_offset(skb); + int size = osize + nhead + ntail; long off; + u8 *data; BUG_ON(nhead < 0); @@ -1260,6 +1256,14 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, skb->hdr_len = 0; skb->nohdr = 0; atomic_set(&skb_shinfo(skb)->dataref, 1); + + /* It is not generally safe to change skb->truesize. + * For the moment, we really care of rx path, or + * when skb is orphaned (not attached to a socket). + */ + if (!skb->sk || skb->destructor == sock_edemux) + skb->truesize += size - osize; + return 0; nofrags: @@ -2656,7 +2660,9 @@ int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen) struct skb_frag_struct *fragfrom, *fragto; BUG_ON(shiftlen > skb->len); - BUG_ON(skb_headlen(skb)); /* Would corrupt stream */ + + if (skb_headlen(skb)) + return 0; todo = shiftlen; from = 0; @@ -3076,22 +3082,32 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb, if (sg && csum && (mss != GSO_BY_FRAGS)) { if (!(features & NETIF_F_GSO_PARTIAL)) { struct sk_buff *iter; + unsigned int frag_len; if (!list_skb || !net_gso_ok(features, skb_shinfo(head_skb)->gso_type)) goto normal; - /* Split the buffer at the frag_list pointer. - * This is based on the assumption that all - * buffers in the chain excluding the last - * containing the same amount of data. + /* If we get here then all the required + * GSO features except frag_list are supported. + * Try to split the SKB to multiple GSO SKBs + * with no frag_list. + * Currently we can do that only when the buffers don't + * have a linear part and all the buffers except + * the last are of the same length. */ + frag_len = list_skb->len; skb_walk_frags(head_skb, iter) { + if (frag_len != iter->len && iter->next) + goto normal; if (skb_headlen(iter)) goto normal; len -= iter->len; } + + if (len != frag_len) + goto normal; } /* GSO partial only requires that we trim off any excess that @@ -3688,6 +3704,15 @@ static void sock_rmem_free(struct sk_buff *skb) atomic_sub(skb->truesize, &sk->sk_rmem_alloc); } +static void skb_set_err_queue(struct sk_buff *skb) +{ + /* pkt_type of skbs received on local sockets is never PACKET_OUTGOING. + * So, it is safe to (mis)use it to mark skbs on the error queue. + */ + skb->pkt_type = PACKET_OUTGOING; + BUILD_BUG_ON(PACKET_OUTGOING == 0); +} + /* * Note: We dont mem charge error packets (no sk_forward_alloc changes) */ @@ -3701,6 +3726,7 @@ int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb) skb->sk = sk; skb->destructor = sock_rmem_free; atomic_add(skb->truesize, &sk->sk_rmem_alloc); + skb_set_err_queue(skb); /* before exiting rcu section, make sure dst is refcounted */ skb_dst_force(skb); @@ -3712,21 +3738,29 @@ int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb) } EXPORT_SYMBOL(sock_queue_err_skb); +static bool is_icmp_err_skb(const struct sk_buff *skb) +{ + return skb && (SKB_EXT_ERR(skb)->ee.ee_origin == SO_EE_ORIGIN_ICMP || + SKB_EXT_ERR(skb)->ee.ee_origin == SO_EE_ORIGIN_ICMP6); +} + struct sk_buff *sock_dequeue_err_skb(struct sock *sk) { struct sk_buff_head *q = &sk->sk_error_queue; - struct sk_buff *skb, *skb_next; + struct sk_buff *skb, *skb_next = NULL; + bool icmp_next = false; unsigned long flags; - int err = 0; spin_lock_irqsave(&q->lock, flags); skb = __skb_dequeue(q); if (skb && (skb_next = skb_peek(q))) - err = SKB_EXT_ERR(skb_next)->ee.ee_errno; + icmp_next = is_icmp_err_skb(skb_next); spin_unlock_irqrestore(&q->lock, flags); - sk->sk_err = err; - if (err) + if (is_icmp_err_skb(skb) && !icmp_next) + sk->sk_err = 0; + + if (skb_next) sk->sk_error_report(sk); return skb; @@ -3769,16 +3803,21 @@ EXPORT_SYMBOL(skb_clone_sk); static void __skb_complete_tx_timestamp(struct sk_buff *skb, struct sock *sk, - int tstype) + int tstype, + bool opt_stats) { struct sock_exterr_skb *serr; int err; + BUILD_BUG_ON(sizeof(struct sock_exterr_skb) > sizeof(skb->cb)); + serr = SKB_EXT_ERR(skb); memset(serr, 0, sizeof(*serr)); serr->ee.ee_errno = ENOMSG; serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING; serr->ee.ee_info = tstype; + serr->opt_stats = opt_stats; + serr->header.h4.iif = skb->dev ? skb->dev->ifindex : 0; if (sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID) { serr->ee.ee_data = skb_shinfo(skb)->tskey; if (sk->sk_protocol == IPPROTO_TCP && @@ -3814,13 +3853,14 @@ void skb_complete_tx_timestamp(struct sk_buff *skb, if (!skb_may_tx_timestamp(sk, false)) return; - /* take a reference to prevent skb_orphan() from freeing the socket */ - sock_hold(sk); - - *skb_hwtstamps(skb) = *hwtstamps; - __skb_complete_tx_timestamp(skb, sk, SCM_TSTAMP_SND); - - sock_put(sk); + /* Take a reference to prevent skb_orphan() from freeing the socket, + * but only if the socket refcount is not zero. + */ + if (likely(atomic_inc_not_zero(&sk->sk_refcnt))) { + *skb_hwtstamps(skb) = *hwtstamps; + __skb_complete_tx_timestamp(skb, sk, SCM_TSTAMP_SND, false); + sock_put(sk); + } } EXPORT_SYMBOL_GPL(skb_complete_tx_timestamp); @@ -3829,7 +3869,7 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb, struct sock *sk, int tstype) { struct sk_buff *skb; - bool tsonly; + bool tsonly, opt_stats = false; if (!sk) return; @@ -3838,10 +3878,19 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb, if (!skb_may_tx_timestamp(sk, tsonly)) return; - if (tsonly) - skb = alloc_skb(0, GFP_ATOMIC); - else + if (tsonly) { +#ifdef CONFIG_INET + if ((sk->sk_tsflags & SOF_TIMESTAMPING_OPT_STATS) && + sk->sk_protocol == IPPROTO_TCP && + sk->sk_type == SOCK_STREAM) { + skb = tcp_get_timestamping_opt_stats(sk); + opt_stats = true; + } else +#endif + skb = alloc_skb(0, GFP_ATOMIC); + } else { skb = skb_clone(orig_skb, GFP_ATOMIC); + } if (!skb) return; @@ -3855,7 +3904,7 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb, else skb->tstamp = ktime_get_real(); - __skb_complete_tx_timestamp(skb, sk, tstype); + __skb_complete_tx_timestamp(skb, sk, tstype, opt_stats); } EXPORT_SYMBOL_GPL(__skb_tstamp_tx); @@ -3871,7 +3920,7 @@ void skb_complete_wifi_ack(struct sk_buff *skb, bool acked) { struct sock *sk = skb->sk; struct sock_exterr_skb *serr; - int err; + int err = 1; skb->wifi_acked_valid = 1; skb->wifi_acked = acked; @@ -3881,14 +3930,15 @@ void skb_complete_wifi_ack(struct sk_buff *skb, bool acked) serr->ee.ee_errno = ENOMSG; serr->ee.ee_origin = SO_EE_ORIGIN_TXSTATUS; - /* take a reference to prevent skb_orphan() from freeing the socket */ - sock_hold(sk); - - err = sock_queue_err_skb(sk, skb); + /* Take a reference to prevent skb_orphan() from freeing the socket, + * but only if the socket refcount is not zero. + */ + if (likely(atomic_inc_not_zero(&sk->sk_refcnt))) { + err = sock_queue_err_skb(sk, skb); + sock_put(sk); + } if (err) kfree_skb(skb); - - sock_put(sk); } EXPORT_SYMBOL_GPL(skb_complete_wifi_ack); @@ -4350,7 +4400,7 @@ EXPORT_SYMBOL(skb_try_coalesce); */ void skb_scrub_packet(struct sk_buff *skb, bool xnet) { - skb->tstamp.tv64 = 0; + skb->tstamp = 0; skb->pkt_type = PACKET_HOST; skb->skb_iif = 0; skb->ignore_df = 0; @@ -4913,3 +4963,35 @@ struct sk_buff *pskb_extract(struct sk_buff *skb, int off, return clone; } EXPORT_SYMBOL(pskb_extract); + +/** + * skb_condense - try to get rid of fragments/frag_list if possible + * @skb: buffer + * + * Can be used to save memory before skb is added to a busy queue. + * If packet has bytes in frags and enough tail room in skb->head, + * pull all of them, so that we can free the frags right now and adjust + * truesize. + * Notes: + * We do not reallocate skb->head thus can not fail. + * Caller must re-evaluate skb->truesize if needed. + */ +void skb_condense(struct sk_buff *skb) +{ + if (skb->data_len) { + if (skb->data_len > skb->end - skb->tail || + skb_cloned(skb)) + return; + + /* Nice, we can free page frag(s) right now */ + __pskb_pull_tail(skb, skb->data_len); + } + /* At this point, skb->truesize might be over estimated, + * because skb had a fragment, and fragments do not tell + * their truesize. + * When we pulled its content into skb->head, fragment + * was freed, but __pskb_pull_tail() could not possibly + * adjust skb->truesize, not knowing the frag truesize. + */ + skb->truesize = SKB_TRUESIZE(skb_end_offset(skb)); +} |