diff options
Diffstat (limited to 'include/linux/skbuff.h')
-rw-r--r-- | include/linux/skbuff.h | 392 |
1 files changed, 308 insertions, 84 deletions
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index dbe29b6c9bd6..bc486ef23f20 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -15,13 +15,13 @@ #define _LINUX_SKBUFF_H #include <linux/kernel.h> -#include <linux/kmemcheck.h> #include <linux/compiler.h> #include <linux/time.h> #include <linux/bug.h> #include <linux/cache.h> #include <linux/rbtree.h> #include <linux/socket.h> +#include <linux/refcount.h> #include <linux/atomic.h> #include <asm/types.h> @@ -345,6 +345,42 @@ static inline void skb_frag_size_sub(skb_frag_t *frag, int delta) frag->size -= delta; } +static inline bool skb_frag_must_loop(struct page *p) +{ +#if defined(CONFIG_HIGHMEM) + if (PageHighMem(p)) + return true; +#endif + return false; +} + +/** + * skb_frag_foreach_page - loop over pages in a fragment + * + * @f: skb frag to operate on + * @f_off: offset from start of f->page.p + * @f_len: length from f_off to loop over + * @p: (temp var) current page + * @p_off: (temp var) offset from start of current page, + * non-zero only on first page. + * @p_len: (temp var) length in current page, + * < PAGE_SIZE only on first and last page. + * @copied: (temp var) length so far, excluding current p_len. + * + * A fragment can hold a compound page, in which case per-page + * operations, notably kmap_atomic, must be called for each + * regular page. + */ +#define skb_frag_foreach_page(f, f_off, f_len, p, p_off, p_len, copied) \ + for (p = skb_frag_page(f) + ((f_off) >> PAGE_SHIFT), \ + p_off = (f_off) & (PAGE_SIZE - 1), \ + p_len = skb_frag_must_loop(p) ? \ + min_t(u32, f_len, PAGE_SIZE - p_off) : f_len, \ + copied = 0; \ + copied < f_len; \ + copied += p_len, p++, p_off = 0, \ + p_len = min_t(u32, f_len - copied, PAGE_SIZE)) \ + #define HAVE_HW_TIME_STAMP /** @@ -393,6 +429,7 @@ enum { SKBTX_SCHED_TSTAMP = 1 << 6, }; +#define SKBTX_ZEROCOPY_FRAG (SKBTX_DEV_ZEROCOPY | SKBTX_SHARED_FRAG) #define SKBTX_ANY_SW_TSTAMP (SKBTX_SW_TSTAMP | \ SKBTX_SCHED_TSTAMP) #define SKBTX_ANY_TSTAMP (SKBTX_HW_TSTAMP | SKBTX_ANY_SW_TSTAMP) @@ -407,16 +444,53 @@ enum { */ struct ubuf_info { void (*callback)(struct ubuf_info *, bool zerocopy_success); - void *ctx; - unsigned long desc; + union { + struct { + unsigned long desc; + void *ctx; + }; + struct { + u32 id; + u16 len; + u16 zerocopy:1; + u32 bytelen; + }; + }; + refcount_t refcnt; + + struct mmpin { + struct user_struct *user; + unsigned int num_pg; + } mmp; }; +#define skb_uarg(SKB) ((struct ubuf_info *)(skb_shinfo(SKB)->destructor_arg)) + +struct ubuf_info *sock_zerocopy_alloc(struct sock *sk, size_t size); +struct ubuf_info *sock_zerocopy_realloc(struct sock *sk, size_t size, + struct ubuf_info *uarg); + +static inline void sock_zerocopy_get(struct ubuf_info *uarg) +{ + refcount_inc(&uarg->refcnt); +} + +void sock_zerocopy_put(struct ubuf_info *uarg); +void sock_zerocopy_put_abort(struct ubuf_info *uarg); + +void sock_zerocopy_callback(struct ubuf_info *uarg, bool success); + +int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb, + struct msghdr *msg, int len, + struct ubuf_info *uarg); + /* This data is invariant across clones and lives at * the end of the header data, ie. at skb->end. */ struct skb_shared_info { - unsigned short _unused; - unsigned char nr_frags; + __u8 __unused; + __u8 meta_len; + __u8 nr_frags; __u8 tx_flags; unsigned short gso_size; /* Warning: this field is not always filled in (UFO)! */ @@ -425,7 +499,6 @@ struct skb_shared_info { struct skb_shared_hwtstamps hwtstamps; unsigned int gso_type; u32 tskey; - __be32 ip6_frag_id; /* * Warning : all fields before dataref are cleared in __alloc_skb() @@ -463,39 +536,40 @@ enum { enum { SKB_GSO_TCPV4 = 1 << 0, - SKB_GSO_UDP = 1 << 1, /* This indicates the skb is from an untrusted source. */ - SKB_GSO_DODGY = 1 << 2, + SKB_GSO_DODGY = 1 << 1, /* This indicates the tcp segment has CWR set. */ - SKB_GSO_TCP_ECN = 1 << 3, + SKB_GSO_TCP_ECN = 1 << 2, + + SKB_GSO_TCP_FIXEDID = 1 << 3, - SKB_GSO_TCP_FIXEDID = 1 << 4, + SKB_GSO_TCPV6 = 1 << 4, - SKB_GSO_TCPV6 = 1 << 5, + SKB_GSO_FCOE = 1 << 5, - SKB_GSO_FCOE = 1 << 6, + SKB_GSO_GRE = 1 << 6, - SKB_GSO_GRE = 1 << 7, + SKB_GSO_GRE_CSUM = 1 << 7, - SKB_GSO_GRE_CSUM = 1 << 8, + SKB_GSO_IPXIP4 = 1 << 8, - SKB_GSO_IPXIP4 = 1 << 9, + SKB_GSO_IPXIP6 = 1 << 9, - SKB_GSO_IPXIP6 = 1 << 10, + SKB_GSO_UDP_TUNNEL = 1 << 10, - SKB_GSO_UDP_TUNNEL = 1 << 11, + SKB_GSO_UDP_TUNNEL_CSUM = 1 << 11, - SKB_GSO_UDP_TUNNEL_CSUM = 1 << 12, + SKB_GSO_PARTIAL = 1 << 12, - SKB_GSO_PARTIAL = 1 << 13, + SKB_GSO_TUNNEL_REMCSUM = 1 << 13, - SKB_GSO_TUNNEL_REMCSUM = 1 << 14, + SKB_GSO_SCTP = 1 << 14, - SKB_GSO_SCTP = 1 << 15, + SKB_GSO_ESP = 1 << 15, - SKB_GSO_ESP = 1 << 16, + SKB_GSO_UDP = 1 << 16, }; #if BITS_PER_LONG > 32 @@ -543,6 +617,7 @@ typedef unsigned char *sk_buff_data_t; * @nf_trace: netfilter packet trace flag * @protocol: Packet protocol from driver * @destructor: Destruct function + * @tcp_tsorted_anchor: list structure for TCP (tp->tsorted_sent_queue) * @_nfct: Associated connection, if any (with nfctinfo bits) * @nf_bridge: Saved data about a bridged frame - see br_netfilter.c * @skb_iif: ifindex of device we arrived on @@ -588,8 +663,12 @@ struct sk_buff { struct sk_buff *prev; union { - ktime_t tstamp; - u64 skb_mstamp; + struct net_device *dev; + /* Some protocols might use this space to store information, + * while device pointer would be NULL. + * UDP receive path is one user. + */ + unsigned long dev_scratch; }; }; struct rb_node rbnode; /* used in netem & tcp stack */ @@ -597,12 +676,8 @@ struct sk_buff { struct sock *sk; union { - struct net_device *dev; - /* Some protocols might use this space to store information, - * while device pointer would be NULL. - * UDP receive path is one user. - */ - unsigned long dev_scratch; + ktime_t tstamp; + u64 skb_mstamp; }; /* * This is the control buffer. It is free to use for every @@ -612,8 +687,14 @@ struct sk_buff { */ char cb[48] __aligned(8); - unsigned long _skb_refdst; - void (*destructor)(struct sk_buff *skb); + union { + struct { + unsigned long _skb_refdst; + void (*destructor)(struct sk_buff *skb); + }; + struct list_head tcp_tsorted_anchor; + }; + #ifdef CONFIG_XFRM struct sec_path *sp; #endif @@ -631,7 +712,6 @@ struct sk_buff { /* Following fields are _not_ copied in __copy_skb_header() * Note that queue_mapping is here mostly to fill a hole. */ - kmemcheck_bitfield_begin(flags1); __u16 queue_mapping; /* if you move cloned around you also must adapt those constants */ @@ -650,7 +730,6 @@ struct sk_buff { head_frag:1, xmit_more:1, __unused:1; /* one bit hole */ - kmemcheck_bitfield_end(flags1); /* fields enclosed in headers_start/headers_end are copied * using a single memcpy() in __copy_skb_header() @@ -698,6 +777,7 @@ struct sk_buff { __u8 remcsum_offload:1; #ifdef CONFIG_NET_SWITCHDEV __u8 offload_fwd_mark:1; + __u8 offload_mr_fwd_mark:1; #endif #ifdef CONFIG_NET_CLS_ACT __u8 tc_skip_classify:1; @@ -885,7 +965,7 @@ void kfree_skb(struct sk_buff *skb); void kfree_skb_list(struct sk_buff *segs); void skb_tx_error(struct sk_buff *skb); void consume_skb(struct sk_buff *skb); -void consume_stateless_skb(struct sk_buff *skb); +void __consume_stateless_skb(struct sk_buff *skb); void __kfree_skb(struct sk_buff *skb); extern struct kmem_cache *skbuff_head_cache; @@ -945,12 +1025,6 @@ static inline struct sk_buff *alloc_skb_fclone(unsigned int size, return __alloc_skb(size, priority, SKB_ALLOC_FCLONE, NUMA_NO_NODE); } -struct sk_buff *__alloc_skb_head(gfp_t priority, int node); -static inline struct sk_buff *alloc_skb_head(gfp_t priority) -{ - return __alloc_skb_head(priority, -1); -} - struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src); int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask); struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t priority); @@ -973,7 +1047,23 @@ int __must_check skb_to_sgvec_nomark(struct sk_buff *skb, struct scatterlist *sg int __must_check skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len); int skb_cow_data(struct sk_buff *skb, int tailbits, struct sk_buff **trailer); -int skb_pad(struct sk_buff *skb, int pad); +int __skb_pad(struct sk_buff *skb, int pad, bool free_on_error); + +/** + * skb_pad - zero pad the tail of an skb + * @skb: buffer to pad + * @pad: space to pad + * + * Ensure that a buffer is followed by a padding area that is zero + * filled. Used by network drivers which may DMA or transfer data + * beyond the buffer end onto the wire. + * + * May return error in out of memory cases. The skb is freed on error. + */ +static inline int skb_pad(struct sk_buff *skb, int pad) +{ + return __skb_pad(skb, pad, true); +} #define dev_kfree_skb(a) consume_skb(a) int skb_append_datato_frags(struct sock *sk, struct sk_buff *skb, @@ -1129,8 +1219,6 @@ static inline __u32 skb_get_hash(struct sk_buff *skb) return skb->hash; } -__u32 __skb_get_hash_flowi6(struct sk_buff *skb, const struct flowi6 *fl6); - static inline __u32 skb_get_hash_flowi6(struct sk_buff *skb, const struct flowi6 *fl6) { if (!skb->l4_hash && !skb->sw_hash) { @@ -1143,20 +1231,6 @@ static inline __u32 skb_get_hash_flowi6(struct sk_buff *skb, const struct flowi6 return skb->hash; } -__u32 __skb_get_hash_flowi4(struct sk_buff *skb, const struct flowi4 *fl); - -static inline __u32 skb_get_hash_flowi4(struct sk_buff *skb, const struct flowi4 *fl4) -{ - if (!skb->l4_hash && !skb->sw_hash) { - struct flow_keys keys; - __u32 hash = __get_hash_from_flowi4(fl4, &keys); - - __skb_set_sw_hash(skb, hash, flow_keys_have_l4(&keys)); - } - - return skb->hash; -} - __u32 skb_get_hash_perturb(const struct sk_buff *skb, u32 perturb); static inline __u32 skb_get_hash_raw(const struct sk_buff *skb) @@ -1201,6 +1275,50 @@ static inline struct skb_shared_hwtstamps *skb_hwtstamps(struct sk_buff *skb) return &skb_shinfo(skb)->hwtstamps; } +static inline struct ubuf_info *skb_zcopy(struct sk_buff *skb) +{ + bool is_zcopy = skb && skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY; + + return is_zcopy ? skb_uarg(skb) : NULL; +} + +static inline void skb_zcopy_set(struct sk_buff *skb, struct ubuf_info *uarg) +{ + if (skb && uarg && !skb_zcopy(skb)) { + sock_zerocopy_get(uarg); + skb_shinfo(skb)->destructor_arg = uarg; + skb_shinfo(skb)->tx_flags |= SKBTX_ZEROCOPY_FRAG; + } +} + +/* Release a reference on a zerocopy structure */ +static inline void skb_zcopy_clear(struct sk_buff *skb, bool zerocopy) +{ + struct ubuf_info *uarg = skb_zcopy(skb); + + if (uarg) { + if (uarg->callback == sock_zerocopy_callback) { + uarg->zerocopy = uarg->zerocopy && zerocopy; + sock_zerocopy_put(uarg); + } else { + uarg->callback(uarg, zerocopy); + } + + skb_shinfo(skb)->tx_flags &= ~SKBTX_ZEROCOPY_FRAG; + } +} + +/* Abort a zerocopy operation and revert zckey on error in send syscall */ +static inline void skb_zcopy_abort(struct sk_buff *skb) +{ + struct ubuf_info *uarg = skb_zcopy(skb); + + if (uarg) { + sock_zerocopy_put_abort(uarg); + skb_shinfo(skb)->tx_flags &= ~SKBTX_ZEROCOPY_FRAG; + } +} + /** * skb_queue_empty - check if a queue is empty * @list: queue head @@ -1346,27 +1464,8 @@ static inline int skb_header_unclone(struct sk_buff *skb, gfp_t pri) } /** - * skb_header_release - release reference to header - * @skb: buffer to operate on - * - * Drop a reference to the header part of the buffer. This is done - * by acquiring a payload reference. You must not read from the header - * part of skb->data after this. - * Note : Check if you can use __skb_header_release() instead. - */ -static inline void skb_header_release(struct sk_buff *skb) -{ - BUG_ON(skb->nohdr); - skb->nohdr = 1; - atomic_add(1 << SKB_DATAREF_SHIFT, &skb_shinfo(skb)->dataref); -} - -/** * __skb_header_release - release reference to header * @skb: buffer to operate on - * - * Variant of skb_header_release() assuming skb is private to caller. - * We can avoid one atomic operation. */ static inline void __skb_header_release(struct sk_buff *skb) { @@ -1783,13 +1882,18 @@ static inline unsigned int skb_headlen(const struct sk_buff *skb) return skb->len - skb->data_len; } -static inline unsigned int skb_pagelen(const struct sk_buff *skb) +static inline unsigned int __skb_pagelen(const struct sk_buff *skb) { unsigned int i, len = 0; for (i = skb_shinfo(skb)->nr_frags - 1; (int)i >= 0; i--) len += skb_frag_size(&skb_shinfo(skb)->frags[i]); - return len + skb_headlen(skb); + return len; +} + +static inline unsigned int skb_pagelen(const struct sk_buff *skb) +{ + return skb_headlen(skb) + __skb_pagelen(skb); } /** @@ -2434,7 +2538,17 @@ static inline void skb_orphan(struct sk_buff *skb) */ static inline int skb_orphan_frags(struct sk_buff *skb, gfp_t gfp_mask) { - if (likely(!(skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY))) + if (likely(!skb_zcopy(skb))) + return 0; + if (skb_uarg(skb)->callback == sock_zerocopy_callback) + return 0; + return skb_copy_ubufs(skb, gfp_mask); +} + +/* Frags must be orphaned, even if refcounted, if skb might loop to rx path */ +static inline int skb_orphan_frags_rx(struct sk_buff *skb, gfp_t gfp_mask) +{ + if (likely(!skb_zcopy(skb))) return 0; return skb_copy_ubufs(skb, gfp_mask); } @@ -2549,7 +2663,7 @@ static inline struct page *__dev_alloc_pages(gfp_t gfp_mask, * 4. __GFP_MEMALLOC is ignored if __GFP_NOMEMALLOC is set due to * code in gfp_to_alloc_flags that should be enforcing this. */ - gfp_mask |= __GFP_COLD | __GFP_COMP | __GFP_MEMALLOC; + gfp_mask |= __GFP_COMP | __GFP_MEMALLOC; return alloc_pages_node(NUMA_NO_NODE, gfp_mask, order); } @@ -2825,25 +2939,42 @@ static inline int skb_padto(struct sk_buff *skb, unsigned int len) * skb_put_padto - increase size and pad an skbuff up to a minimal size * @skb: buffer to pad * @len: minimal length + * @free_on_error: free buffer on error * * Pads up a buffer to ensure the trailing bytes exist and are * blanked. If the buffer already contains sufficient data it * is untouched. Otherwise it is extended. Returns zero on - * success. The skb is freed on error. + * success. The skb is freed on error if @free_on_error is true. */ -static inline int skb_put_padto(struct sk_buff *skb, unsigned int len) +static inline int __skb_put_padto(struct sk_buff *skb, unsigned int len, + bool free_on_error) { unsigned int size = skb->len; if (unlikely(size < len)) { len -= size; - if (skb_pad(skb, len)) + if (__skb_pad(skb, len, free_on_error)) return -ENOMEM; __skb_put(skb, len); } return 0; } +/** + * skb_put_padto - increase size and pad an skbuff up to a minimal size + * @skb: buffer to pad + * @len: minimal length + * + * Pads up a buffer to ensure the trailing bytes exist and are + * blanked. If the buffer already contains sufficient data it + * is untouched. Otherwise it is extended. Returns zero on + * success. The skb is freed on error. + */ +static inline int skb_put_padto(struct sk_buff *skb, unsigned int len) +{ + return __skb_put_padto(skb, len, true); +} + static inline int skb_add_data(struct sk_buff *skb, struct iov_iter *from, int copy) { @@ -2866,6 +2997,8 @@ static inline int skb_add_data(struct sk_buff *skb, static inline bool skb_can_coalesce(struct sk_buff *skb, int i, const struct page *page, int off) { + if (skb_zcopy(skb)) + return false; if (i) { const struct skb_frag_struct *frag = &skb_shinfo(skb)->frags[i - 1]; @@ -3023,6 +3156,12 @@ static inline int __skb_grow_rcsum(struct sk_buff *skb, unsigned int len) return __skb_grow(skb, len); } +#define rb_to_skb(rb) rb_entry_safe(rb, struct sk_buff, rbnode) +#define skb_rb_first(root) rb_to_skb(rb_first(root)) +#define skb_rb_last(root) rb_to_skb(rb_last(root)) +#define skb_rb_next(skb) rb_to_skb(rb_next(&(skb)->rbnode)) +#define skb_rb_prev(skb) rb_to_skb(rb_prev(&(skb)->rbnode)) + #define skb_queue_walk(queue, skb) \ for (skb = (queue)->next; \ skb != (struct sk_buff *)(queue); \ @@ -3037,6 +3176,18 @@ static inline int __skb_grow_rcsum(struct sk_buff *skb, unsigned int len) for (; skb != (struct sk_buff *)(queue); \ skb = skb->next) +#define skb_rbtree_walk(skb, root) \ + for (skb = skb_rb_first(root); skb != NULL; \ + skb = skb_rb_next(skb)) + +#define skb_rbtree_walk_from(skb) \ + for (; skb != NULL; \ + skb = skb_rb_next(skb)) + +#define skb_rbtree_walk_from_safe(skb, tmp) \ + for (; tmp = skb ? skb_rb_next(skb) : NULL, (skb != NULL); \ + skb = tmp) + #define skb_queue_walk_from_safe(queue, skb, tmp) \ for (tmp = skb->next; \ skb != (struct sk_buff *)(queue); \ @@ -3120,6 +3271,9 @@ __wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset, u8 *to, int skb_splice_bits(struct sk_buff *skb, struct sock *sk, unsigned int offset, struct pipe_inode_info *pipe, unsigned int len, unsigned int flags); +int skb_send_sock_locked(struct sock *sk, struct sk_buff *skb, int offset, + int len); +int skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset, int len); void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to); unsigned int skb_zerocopy_headlen(const struct sk_buff *from); int skb_zerocopy(struct sk_buff *to, struct sk_buff *from, @@ -3271,6 +3425,69 @@ static inline ktime_t net_invalid_timestamp(void) return 0; } +static inline u8 skb_metadata_len(const struct sk_buff *skb) +{ + return skb_shinfo(skb)->meta_len; +} + +static inline void *skb_metadata_end(const struct sk_buff *skb) +{ + return skb_mac_header(skb); +} + +static inline bool __skb_metadata_differs(const struct sk_buff *skb_a, + const struct sk_buff *skb_b, + u8 meta_len) +{ + const void *a = skb_metadata_end(skb_a); + const void *b = skb_metadata_end(skb_b); + /* Using more efficient varaiant than plain call to memcmp(). */ +#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 + u64 diffs = 0; + + switch (meta_len) { +#define __it(x, op) (x -= sizeof(u##op)) +#define __it_diff(a, b, op) (*(u##op *)__it(a, op)) ^ (*(u##op *)__it(b, op)) + case 32: diffs |= __it_diff(a, b, 64); + case 24: diffs |= __it_diff(a, b, 64); + case 16: diffs |= __it_diff(a, b, 64); + case 8: diffs |= __it_diff(a, b, 64); + break; + case 28: diffs |= __it_diff(a, b, 64); + case 20: diffs |= __it_diff(a, b, 64); + case 12: diffs |= __it_diff(a, b, 64); + case 4: diffs |= __it_diff(a, b, 32); + break; + } + return diffs; +#else + return memcmp(a - meta_len, b - meta_len, meta_len); +#endif +} + +static inline bool skb_metadata_differs(const struct sk_buff *skb_a, + const struct sk_buff *skb_b) +{ + u8 len_a = skb_metadata_len(skb_a); + u8 len_b = skb_metadata_len(skb_b); + + if (!(len_a | len_b)) + return false; + + return len_a != len_b ? + true : __skb_metadata_differs(skb_a, skb_b, len_a); +} + +static inline void skb_metadata_set(struct sk_buff *skb, u8 meta_len) +{ + skb_shinfo(skb)->meta_len = meta_len; +} + +static inline void skb_metadata_clear(struct sk_buff *skb) +{ + skb_metadata_set(skb, 0); +} + struct sk_buff *skb_clone_sk(struct sk_buff *skb); #ifdef CONFIG_NETWORK_PHY_TIMESTAMPING @@ -3622,6 +3839,13 @@ static inline void nf_reset_trace(struct sk_buff *skb) #endif } +static inline void ipvs_reset(struct sk_buff *skb) +{ +#if IS_ENABLED(CONFIG_IP_VS) + skb->ipvs_property = 0; +#endif +} + /* Note: This doesn't put any conntrack and bridge info in dst. */ static inline void __nf_copy(struct sk_buff *dst, const struct sk_buff *src, bool copy) |