summaryrefslogtreecommitdiff
path: root/net/core/skbuff.c
diff options
context:
space:
mode:
Diffstat (limited to 'net/core/skbuff.c')
-rw-r--r--net/core/skbuff.c604
1 files changed, 419 insertions, 185 deletions
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index e59840010d45..fe00d1208167 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -36,6 +36,8 @@
* The functions in this file will not compile correctly with gcc 2.4.x
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
@@ -67,10 +69,9 @@
#include <asm/uaccess.h>
#include <trace/events/skb.h>
+#include <linux/highmem.h>
-#include "kmap_skb.h"
-
-static struct kmem_cache *skbuff_head_cache __read_mostly;
+struct kmem_cache *skbuff_head_cache __read_mostly;
static struct kmem_cache *skbuff_fclone_cache __read_mostly;
static void sock_pipe_buf_release(struct pipe_inode_info *pipe,
@@ -119,11 +120,10 @@ static const struct pipe_buf_operations sock_pipe_buf_ops = {
*/
static void skb_over_panic(struct sk_buff *skb, int sz, void *here)
{
- printk(KERN_EMERG "skb_over_panic: text:%p len:%d put:%d head:%p "
- "data:%p tail:%#lx end:%#lx dev:%s\n",
- here, skb->len, sz, skb->head, skb->data,
- (unsigned long)skb->tail, (unsigned long)skb->end,
- skb->dev ? skb->dev->name : "<NULL>");
+ pr_emerg("%s: text:%p len:%d put:%d head:%p data:%p tail:%#lx end:%#lx dev:%s\n",
+ __func__, here, skb->len, sz, skb->head, skb->data,
+ (unsigned long)skb->tail, (unsigned long)skb->end,
+ skb->dev ? skb->dev->name : "<NULL>");
BUG();
}
@@ -138,14 +138,50 @@ static void skb_over_panic(struct sk_buff *skb, int sz, void *here)
static void skb_under_panic(struct sk_buff *skb, int sz, void *here)
{
- printk(KERN_EMERG "skb_under_panic: text:%p len:%d put:%d head:%p "
- "data:%p tail:%#lx end:%#lx dev:%s\n",
- here, skb->len, sz, skb->head, skb->data,
- (unsigned long)skb->tail, (unsigned long)skb->end,
- skb->dev ? skb->dev->name : "<NULL>");
+ pr_emerg("%s: text:%p len:%d put:%d head:%p data:%p tail:%#lx end:%#lx dev:%s\n",
+ __func__, here, skb->len, sz, skb->head, skb->data,
+ (unsigned long)skb->tail, (unsigned long)skb->end,
+ skb->dev ? skb->dev->name : "<NULL>");
BUG();
}
+
+/*
+ * kmalloc_reserve is a wrapper around kmalloc_node_track_caller that tells
+ * the caller if emergency pfmemalloc reserves are being used. If it is and
+ * the socket is later found to be SOCK_MEMALLOC then PFMEMALLOC reserves
+ * may be used. Otherwise, the packet data may be discarded until enough
+ * memory is free
+ */
+#define kmalloc_reserve(size, gfp, node, pfmemalloc) \
+ __kmalloc_reserve(size, gfp, node, _RET_IP_, pfmemalloc)
+void *__kmalloc_reserve(size_t size, gfp_t flags, int node, unsigned long ip,
+ bool *pfmemalloc)
+{
+ void *obj;
+ bool ret_pfmemalloc = false;
+
+ /*
+ * Try a regular allocation, when that fails and we're not entitled
+ * to the reserves, fail.
+ */
+ obj = kmalloc_node_track_caller(size,
+ flags | __GFP_NOMEMALLOC | __GFP_NOWARN,
+ node);
+ if (obj || !(gfp_pfmemalloc_allowed(flags)))
+ goto out;
+
+ /* Try again but now we are using pfmemalloc reserves */
+ ret_pfmemalloc = true;
+ obj = kmalloc_node_track_caller(size, flags, node);
+
+out:
+ if (pfmemalloc)
+ *pfmemalloc = ret_pfmemalloc;
+
+ return obj;
+}
+
/* Allocate a new skbuff. We do this ourselves so we can fill in a few
* 'private' fields and also do memory statistics to find all the
* [BEEP] leaks.
@@ -156,26 +192,33 @@ static void skb_under_panic(struct sk_buff *skb, int sz, void *here)
* __alloc_skb - allocate a network buffer
* @size: size to allocate
* @gfp_mask: allocation mask
- * @fclone: allocate from fclone cache instead of head cache
- * and allocate a cloned (child) skb
+ * @flags: If SKB_ALLOC_FCLONE is set, allocate from fclone cache
+ * instead of head cache and allocate a cloned (child) skb.
+ * If SKB_ALLOC_RX is set, __GFP_MEMALLOC will be used for
+ * allocations in case the data is required for writeback
* @node: numa node to allocate memory on
*
* Allocate a new &sk_buff. The returned buffer has no headroom and a
- * tail room of size bytes. The object has a reference count of one.
- * The return is the buffer. On a failure the return is %NULL.
+ * tail room of at least size bytes. The object has a reference count
+ * of one. The return is the buffer. On a failure the return is %NULL.
*
* Buffers may only be allocated from interrupts using a @gfp_mask of
* %GFP_ATOMIC.
*/
struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
- int fclone, int node)
+ int flags, int node)
{
struct kmem_cache *cache;
struct skb_shared_info *shinfo;
struct sk_buff *skb;
u8 *data;
+ bool pfmemalloc;
+
+ cache = (flags & SKB_ALLOC_FCLONE)
+ ? skbuff_fclone_cache : skbuff_head_cache;
- cache = fclone ? skbuff_fclone_cache : skbuff_head_cache;
+ if (sk_memalloc_socks() && (flags & SKB_ALLOC_RX))
+ gfp_mask |= __GFP_MEMALLOC;
/* Get the HEAD */
skb = kmem_cache_alloc_node(cache, gfp_mask & ~__GFP_DMA, node);
@@ -190,7 +233,7 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
*/
size = SKB_DATA_ALIGN(size);
size += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
- data = kmalloc_node_track_caller(size, gfp_mask, node);
+ data = kmalloc_reserve(size, gfp_mask, node, &pfmemalloc);
if (!data)
goto nodata;
/* kmalloc(size) might give us more room than requested.
@@ -208,6 +251,7 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
memset(skb, 0, offsetof(struct sk_buff, tail));
/* Account for allocated memory : skb + skb->head */
skb->truesize = SKB_TRUESIZE(size);
+ skb->pfmemalloc = pfmemalloc;
atomic_set(&skb->users, 1);
skb->head = data;
skb->data = data;
@@ -223,7 +267,7 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
atomic_set(&shinfo->dataref, 1);
kmemcheck_annotate_variable(shinfo->destructor_arg);
- if (fclone) {
+ if (flags & SKB_ALLOC_FCLONE) {
struct sk_buff *child = skb + 1;
atomic_t *fclone_ref = (atomic_t *) (child + 1);
@@ -233,6 +277,7 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
atomic_set(fclone_ref, 1);
child->fclone = SKB_FCLONE_UNAVAILABLE;
+ child->pfmemalloc = pfmemalloc;
}
out:
return skb;
@@ -246,6 +291,7 @@ EXPORT_SYMBOL(__alloc_skb);
/**
* build_skb - build a network buffer
* @data: data buffer provided by caller
+ * @frag_size: size of fragment, or 0 if head was kmalloced
*
* Allocate a new &sk_buff. Caller provides space holding head and
* skb_shared_info. @data must have been allocated by kmalloc()
@@ -259,20 +305,21 @@ EXPORT_SYMBOL(__alloc_skb);
* before giving packet to stack.
* RX rings only contains data buffers, not full skbs.
*/
-struct sk_buff *build_skb(void *data)
+struct sk_buff *build_skb(void *data, unsigned int frag_size)
{
struct skb_shared_info *shinfo;
struct sk_buff *skb;
- unsigned int size;
+ unsigned int size = frag_size ? : ksize(data);
skb = kmem_cache_alloc(skbuff_head_cache, GFP_ATOMIC);
if (!skb)
return NULL;
- size = ksize(data) - SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+ size -= SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
memset(skb, 0, offsetof(struct sk_buff, tail));
skb->truesize = SKB_TRUESIZE(size);
+ skb->head_frag = frag_size != 0;
atomic_set(&skb->users, 1);
skb->head = data;
skb->data = data;
@@ -292,6 +339,63 @@ struct sk_buff *build_skb(void *data)
}
EXPORT_SYMBOL(build_skb);
+struct netdev_alloc_cache {
+ struct page *page;
+ unsigned int offset;
+ unsigned int pagecnt_bias;
+};
+static DEFINE_PER_CPU(struct netdev_alloc_cache, netdev_alloc_cache);
+
+#define NETDEV_PAGECNT_BIAS (PAGE_SIZE / SMP_CACHE_BYTES)
+
+static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
+{
+ struct netdev_alloc_cache *nc;
+ void *data = NULL;
+ unsigned long flags;
+
+ local_irq_save(flags);
+ nc = &__get_cpu_var(netdev_alloc_cache);
+ if (unlikely(!nc->page)) {
+refill:
+ nc->page = alloc_page(gfp_mask);
+ if (unlikely(!nc->page))
+ goto end;
+recycle:
+ atomic_set(&nc->page->_count, NETDEV_PAGECNT_BIAS);
+ nc->pagecnt_bias = NETDEV_PAGECNT_BIAS;
+ nc->offset = 0;
+ }
+
+ if (nc->offset + fragsz > PAGE_SIZE) {
+ /* avoid unnecessary locked operations if possible */
+ if ((atomic_read(&nc->page->_count) == nc->pagecnt_bias) ||
+ atomic_sub_and_test(nc->pagecnt_bias, &nc->page->_count))
+ goto recycle;
+ goto refill;
+ }
+
+ data = page_address(nc->page) + nc->offset;
+ nc->offset += fragsz;
+ nc->pagecnt_bias--;
+end:
+ local_irq_restore(flags);
+ return data;
+}
+
+/**
+ * netdev_alloc_frag - allocate a page fragment
+ * @fragsz: fragment size
+ *
+ * Allocates a frag from a page for receive buffer.
+ * Uses GFP_ATOMIC allocations.
+ */
+void *netdev_alloc_frag(unsigned int fragsz)
+{
+ return __netdev_alloc_frag(fragsz, GFP_ATOMIC | __GFP_COLD);
+}
+EXPORT_SYMBOL(netdev_alloc_frag);
+
/**
* __netdev_alloc_skb - allocate an skbuff for rx on a specific device
* @dev: network device to receive on
@@ -306,11 +410,29 @@ EXPORT_SYMBOL(build_skb);
* %NULL is returned if there is no free memory.
*/
struct sk_buff *__netdev_alloc_skb(struct net_device *dev,
- unsigned int length, gfp_t gfp_mask)
+ unsigned int length, gfp_t gfp_mask)
{
- struct sk_buff *skb;
+ struct sk_buff *skb = NULL;
+ unsigned int fragsz = SKB_DATA_ALIGN(length + NET_SKB_PAD) +
+ SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+
+ if (fragsz <= PAGE_SIZE && !(gfp_mask & (__GFP_WAIT | GFP_DMA))) {
+ void *data;
+
+ if (sk_memalloc_socks())
+ gfp_mask |= __GFP_MEMALLOC;
+
+ data = __netdev_alloc_frag(fragsz, gfp_mask);
- skb = __alloc_skb(length + NET_SKB_PAD, gfp_mask, 0, NUMA_NO_NODE);
+ if (likely(data)) {
+ skb = build_skb(data, fragsz);
+ if (unlikely(!skb))
+ put_page(virt_to_head_page(data));
+ }
+ } else {
+ skb = __alloc_skb(length + NET_SKB_PAD, gfp_mask,
+ SKB_ALLOC_RX, NUMA_NO_NODE);
+ }
if (likely(skb)) {
skb_reserve(skb, NET_SKB_PAD);
skb->dev = dev;
@@ -329,28 +451,6 @@ void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off,
}
EXPORT_SYMBOL(skb_add_rx_frag);
-/**
- * dev_alloc_skb - allocate an skbuff for receiving
- * @length: length to allocate
- *
- * Allocate a new &sk_buff and assign it a usage count of one. The
- * buffer has unspecified headroom built in. Users should allocate
- * the headroom they think they need without accounting for the
- * built in space. The built in space is used for optimisations.
- *
- * %NULL is returned if there is no free memory. Although this function
- * allocates memory it can be called from an interrupt.
- */
-struct sk_buff *dev_alloc_skb(unsigned int length)
-{
- /*
- * There is more code here than it seems:
- * __dev_alloc_skb is an inline
- */
- return __dev_alloc_skb(length, GFP_ATOMIC);
-}
-EXPORT_SYMBOL(dev_alloc_skb);
-
static void skb_drop_list(struct sk_buff **listp)
{
struct sk_buff *list = *listp;
@@ -377,6 +477,14 @@ static void skb_clone_fraglist(struct sk_buff *skb)
skb_get(list);
}
+static void skb_free_head(struct sk_buff *skb)
+{
+ if (skb->head_frag)
+ put_page(virt_to_head_page(skb->head));
+ else
+ kfree(skb->head);
+}
+
static void skb_release_data(struct sk_buff *skb)
{
if (!skb->cloned ||
@@ -403,7 +511,7 @@ static void skb_release_data(struct sk_buff *skb)
if (skb_has_frag_list(skb))
skb_drop_fraglist(skb);
- kfree(skb->head);
+ skb_free_head(skb);
}
}
@@ -605,6 +713,7 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
#if IS_ENABLED(CONFIG_IP_VS)
new->ipvs_property = old->ipvs_property;
#endif
+ new->pfmemalloc = old->pfmemalloc;
new->protocol = old->protocol;
new->mark = old->mark;
new->skb_iif = old->skb_iif;
@@ -645,6 +754,7 @@ static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb)
C(tail);
C(end);
C(head);
+ C(head_frag);
C(data);
C(truesize);
atomic_set(&n->users, 1);
@@ -673,7 +783,8 @@ struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src)
}
EXPORT_SYMBOL_GPL(skb_morph);
-/* skb_copy_ubufs - copy userspace skb frags buffers to kernel
+/**
+ * skb_copy_ubufs - copy userspace skb frags buffers to kernel
* @skb: the skb to modify
* @gfp_mask: allocation priority
*
@@ -698,7 +809,7 @@ int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask)
u8 *vaddr;
skb_frag_t *f = &skb_shinfo(skb)->frags[i];
- page = alloc_page(GFP_ATOMIC);
+ page = alloc_page(gfp_mask);
if (!page) {
while (head) {
struct page *next = (struct page *)head->private;
@@ -707,31 +818,31 @@ int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask)
}
return -ENOMEM;
}
- vaddr = kmap_skb_frag(&skb_shinfo(skb)->frags[i]);
+ vaddr = kmap_atomic(skb_frag_page(f));
memcpy(page_address(page),
vaddr + f->page_offset, skb_frag_size(f));
- kunmap_skb_frag(vaddr);
+ kunmap_atomic(vaddr);
page->private = (unsigned long)head;
head = page;
}
/* skb frags release userspace buffers */
- for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
+ for (i = 0; i < num_frags; i++)
skb_frag_unref(skb, i);
uarg->callback(uarg);
/* skb frags point to kernel buffers */
- for (i = skb_shinfo(skb)->nr_frags; i > 0; i--) {
- __skb_fill_page_desc(skb, i-1, head, 0,
- skb_shinfo(skb)->frags[i - 1].size);
+ for (i = num_frags - 1; i >= 0; i--) {
+ __skb_fill_page_desc(skb, i, head, 0,
+ skb_shinfo(skb)->frags[i].size);
head = (struct page *)head->private;
}
skb_shinfo(skb)->tx_flags &= ~SKBTX_DEV_ZEROCOPY;
return 0;
}
-
+EXPORT_SYMBOL_GPL(skb_copy_ubufs);
/**
* skb_clone - duplicate an sk_buff
@@ -751,10 +862,8 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask)
{
struct sk_buff *n;
- if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
- if (skb_copy_ubufs(skb, gfp_mask))
- return NULL;
- }
+ if (skb_orphan_frags(skb, gfp_mask))
+ return NULL;
n = skb + 1;
if (skb->fclone == SKB_FCLONE_ORIG &&
@@ -763,6 +872,9 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask)
n->fclone = SKB_FCLONE_CLONE;
atomic_inc(fclone_ref);
} else {
+ if (skb_pfmemalloc(skb))
+ gfp_mask |= __GFP_MEMALLOC;
+
n = kmem_cache_alloc(skbuff_head_cache, gfp_mask);
if (!n)
return NULL;
@@ -799,6 +911,13 @@ static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
skb_shinfo(new)->gso_type = skb_shinfo(old)->gso_type;
}
+static inline int skb_alloc_rx_flag(const struct sk_buff *skb)
+{
+ if (skb_pfmemalloc(skb))
+ return SKB_ALLOC_RX;
+ return 0;
+}
+
/**
* skb_copy - create private copy of an sk_buff
* @skb: buffer to copy
@@ -819,8 +938,9 @@ static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask)
{
int headerlen = skb_headroom(skb);
- unsigned int size = (skb_end_pointer(skb) - skb->head) + skb->data_len;
- struct sk_buff *n = alloc_skb(size, gfp_mask);
+ unsigned int size = skb_end_offset(skb) + skb->data_len;
+ struct sk_buff *n = __alloc_skb(size, gfp_mask,
+ skb_alloc_rx_flag(skb), NUMA_NO_NODE);
if (!n)
return NULL;
@@ -855,7 +975,8 @@ EXPORT_SYMBOL(skb_copy);
struct sk_buff *__pskb_copy(struct sk_buff *skb, int headroom, gfp_t gfp_mask)
{
unsigned int size = skb_headlen(skb) + headroom;
- struct sk_buff *n = alloc_skb(size, gfp_mask);
+ struct sk_buff *n = __alloc_skb(size, gfp_mask,
+ skb_alloc_rx_flag(skb), NUMA_NO_NODE);
if (!n)
goto out;
@@ -874,12 +995,10 @@ struct sk_buff *__pskb_copy(struct sk_buff *skb, int headroom, gfp_t gfp_mask)
if (skb_shinfo(skb)->nr_frags) {
int i;
- if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
- if (skb_copy_ubufs(skb, gfp_mask)) {
- kfree_skb(n);
- n = NULL;
- goto out;
- }
+ if (skb_orphan_frags(skb, gfp_mask)) {
+ kfree_skb(n);
+ n = NULL;
+ goto out;
}
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
skb_shinfo(n)->frags[i] = skb_shinfo(skb)->frags[i];
@@ -920,9 +1039,8 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
{
int i;
u8 *data;
- int size = nhead + (skb_end_pointer(skb) - skb->head) + ntail;
+ int size = nhead + skb_end_offset(skb) + ntail;
long off;
- bool fastpath;
BUG_ON(nhead < 0);
@@ -931,29 +1049,10 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
size = SKB_DATA_ALIGN(size);
- /* Check if we can avoid taking references on fragments if we own
- * the last reference on skb->head. (see skb_release_data())
- */
- if (!skb->cloned)
- fastpath = true;
- else {
- int delta = skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1;
- fastpath = atomic_read(&skb_shinfo(skb)->dataref) == delta;
- }
-
- if (fastpath &&
- size + sizeof(struct skb_shared_info) <= ksize(skb->head)) {
- memmove(skb->head + size, skb_shinfo(skb),
- offsetof(struct skb_shared_info,
- frags[skb_shinfo(skb)->nr_frags]));
- memmove(skb->head + nhead, skb->head,
- skb_tail_pointer(skb) - skb->head);
- off = nhead;
- goto adjust_others;
- }
-
- data = kmalloc(size + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)),
- gfp_mask);
+ if (skb_pfmemalloc(skb))
+ gfp_mask |= __GFP_MEMALLOC;
+ data = kmalloc_reserve(size + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)),
+ gfp_mask, NUMA_NO_NODE, NULL);
if (!data)
goto nodata;
size = SKB_WITH_OVERHEAD(ksize(data));
@@ -967,14 +1066,15 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
skb_shinfo(skb),
offsetof(struct skb_shared_info, frags[skb_shinfo(skb)->nr_frags]));
- if (fastpath) {
- kfree(skb->head);
- } else {
+ /*
+ * if shinfo is shared we must drop the old head gracefully, but if it
+ * is not we can just drop the old head and let the existing refcount
+ * be since all we did is relocate the values
+ */
+ if (skb_cloned(skb)) {
/* copy this zero copy skb frags */
- if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
- if (skb_copy_ubufs(skb, gfp_mask))
- goto nofrags;
- }
+ if (skb_orphan_frags(skb, gfp_mask))
+ goto nofrags;
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
skb_frag_ref(skb, i);
@@ -982,11 +1082,13 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
skb_clone_fraglist(skb);
skb_release_data(skb);
+ } else {
+ skb_free_head(skb);
}
off = (data + nhead) - skb->head;
skb->head = data;
-adjust_others:
+ skb->head_frag = 0;
skb->data += off;
#ifdef NET_SKBUFF_DATA_USES_OFFSET
skb->end = size;
@@ -1062,8 +1164,9 @@ struct sk_buff *skb_copy_expand(const struct sk_buff *skb,
/*
* Allocate the copy buffer
*/
- struct sk_buff *n = alloc_skb(newheadroom + skb->len + newtailroom,
- gfp_mask);
+ struct sk_buff *n = __alloc_skb(newheadroom + skb->len + newtailroom,
+ gfp_mask, skb_alloc_rx_flag(skb),
+ NUMA_NO_NODE);
int oldheadroom = skb_headroom(skb);
int head_copy_len, head_copy_off;
int off;
@@ -1275,7 +1378,7 @@ drop_pages:
return -ENOMEM;
nfrag->next = frag->next;
- kfree_skb(frag);
+ consume_skb(frag);
frag = nfrag;
*fragp = frag;
}
@@ -1487,21 +1590,22 @@ int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len)
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
int end;
+ skb_frag_t *f = &skb_shinfo(skb)->frags[i];
WARN_ON(start > offset + len);
- end = start + skb_frag_size(&skb_shinfo(skb)->frags[i]);
+ end = start + skb_frag_size(f);
if ((copy = end - offset) > 0) {
u8 *vaddr;
if (copy > len)
copy = len;
- vaddr = kmap_skb_frag(&skb_shinfo(skb)->frags[i]);
+ vaddr = kmap_atomic(skb_frag_page(f));
memcpy(to,
- vaddr + skb_shinfo(skb)->frags[i].page_offset+
- offset - start, copy);
- kunmap_skb_frag(vaddr);
+ vaddr + f->page_offset + offset - start,
+ copy);
+ kunmap_atomic(vaddr);
if ((len -= copy) == 0)
return 0;
@@ -1547,9 +1651,9 @@ static void sock_spd_release(struct splice_pipe_desc *spd, unsigned int i)
put_page(spd->pages[i]);
}
-static inline struct page *linear_to_page(struct page *page, unsigned int *len,
- unsigned int *offset,
- struct sk_buff *skb, struct sock *sk)
+static struct page *linear_to_page(struct page *page, unsigned int *len,
+ unsigned int *offset,
+ struct sk_buff *skb, struct sock *sk)
{
struct page *p = sk->sk_sndmsg_page;
unsigned int off;
@@ -1565,6 +1669,9 @@ new_page:
} else {
unsigned int mlen;
+ /* If we are the only user of the page, we can reset offset */
+ if (page_count(p) == 1)
+ sk->sk_sndmsg_off = 0;
off = sk->sk_sndmsg_off;
mlen = PAGE_SIZE - off;
if (mlen < 64 && mlen < *len) {
@@ -1578,36 +1685,48 @@ new_page:
memcpy(page_address(p) + off, page_address(page) + *offset, *len);
sk->sk_sndmsg_off += *len;
*offset = off;
- get_page(p);
return p;
}
+static bool spd_can_coalesce(const struct splice_pipe_desc *spd,
+ struct page *page,
+ unsigned int offset)
+{
+ return spd->nr_pages &&
+ spd->pages[spd->nr_pages - 1] == page &&
+ (spd->partial[spd->nr_pages - 1].offset +
+ spd->partial[spd->nr_pages - 1].len == offset);
+}
+
/*
* Fill page/offset/length into spd, if it can hold more pages.
*/
-static inline int spd_fill_page(struct splice_pipe_desc *spd,
- struct pipe_inode_info *pipe, struct page *page,
- unsigned int *len, unsigned int offset,
- struct sk_buff *skb, int linear,
- struct sock *sk)
+static bool spd_fill_page(struct splice_pipe_desc *spd,
+ struct pipe_inode_info *pipe, struct page *page,
+ unsigned int *len, unsigned int offset,
+ struct sk_buff *skb, bool linear,
+ struct sock *sk)
{
- if (unlikely(spd->nr_pages == pipe->buffers))
- return 1;
+ if (unlikely(spd->nr_pages == MAX_SKB_FRAGS))
+ return true;
if (linear) {
page = linear_to_page(page, len, &offset, skb, sk);
if (!page)
- return 1;
- } else
- get_page(page);
-
+ return true;
+ }
+ if (spd_can_coalesce(spd, page, offset)) {
+ spd->partial[spd->nr_pages - 1].len += *len;
+ return false;
+ }
+ get_page(page);
spd->pages[spd->nr_pages] = page;
spd->partial[spd->nr_pages].len = *len;
spd->partial[spd->nr_pages].offset = offset;
spd->nr_pages++;
- return 0;
+ return false;
}
static inline void __segment_seek(struct page **page, unsigned int *poff,
@@ -1624,20 +1743,20 @@ static inline void __segment_seek(struct page **page, unsigned int *poff,
*plen -= off;
}
-static inline int __splice_segment(struct page *page, unsigned int poff,
- unsigned int plen, unsigned int *off,
- unsigned int *len, struct sk_buff *skb,
- struct splice_pipe_desc *spd, int linear,
- struct sock *sk,
- struct pipe_inode_info *pipe)
+static bool __splice_segment(struct page *page, unsigned int poff,
+ unsigned int plen, unsigned int *off,
+ unsigned int *len, struct sk_buff *skb,
+ struct splice_pipe_desc *spd, bool linear,
+ struct sock *sk,
+ struct pipe_inode_info *pipe)
{
if (!*len)
- return 1;
+ return true;
/* skip this segment if already processed */
if (*off >= plen) {
*off -= plen;
- return 0;
+ return false;
}
/* ignore any bits we already processed */
@@ -1653,34 +1772,38 @@ static inline int __splice_segment(struct page *page, unsigned int poff,
flen = min_t(unsigned int, flen, PAGE_SIZE - poff);
if (spd_fill_page(spd, pipe, page, &flen, poff, skb, linear, sk))
- return 1;
+ return true;
__segment_seek(&page, &poff, &plen, flen);
*len -= flen;
} while (*len && plen);
- return 0;
+ return false;
}
/*
- * Map linear and fragment data from the skb to spd. It reports failure if the
+ * Map linear and fragment data from the skb to spd. It reports true if the
* pipe is full or if we already spliced the requested length.
*/
-static int __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe,
- unsigned int *offset, unsigned int *len,
- struct splice_pipe_desc *spd, struct sock *sk)
+static bool __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe,
+ unsigned int *offset, unsigned int *len,
+ struct splice_pipe_desc *spd, struct sock *sk)
{
int seg;
- /*
- * map the linear part
+ /* map the linear part :
+ * If skb->head_frag is set, this 'linear' part is backed by a
+ * fragment, and if the head is not shared with any clones then
+ * we can avoid a copy since we own the head portion of this page.
*/
if (__splice_segment(virt_to_page(skb->data),
(unsigned long) skb->data & (PAGE_SIZE - 1),
skb_headlen(skb),
- offset, len, skb, spd, 1, sk, pipe))
- return 1;
+ offset, len, skb, spd,
+ skb_head_is_locked(skb),
+ sk, pipe))
+ return true;
/*
* then map the fragments
@@ -1690,11 +1813,11 @@ static int __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe,
if (__splice_segment(skb_frag_page(f),
f->page_offset, skb_frag_size(f),
- offset, len, skb, spd, 0, sk, pipe))
- return 1;
+ offset, len, skb, spd, false, sk, pipe))
+ return true;
}
- return 0;
+ return false;
}
/*
@@ -1707,11 +1830,12 @@ int skb_splice_bits(struct sk_buff *skb, unsigned int offset,
struct pipe_inode_info *pipe, unsigned int tlen,
unsigned int flags)
{
- struct partial_page partial[PIPE_DEF_BUFFERS];
- struct page *pages[PIPE_DEF_BUFFERS];
+ struct partial_page partial[MAX_SKB_FRAGS];
+ struct page *pages[MAX_SKB_FRAGS];
struct splice_pipe_desc spd = {
.pages = pages,
.partial = partial,
+ .nr_pages_max = MAX_SKB_FRAGS,
.flags = flags,
.ops = &sock_pipe_buf_ops,
.spd_release = sock_spd_release,
@@ -1720,9 +1844,6 @@ int skb_splice_bits(struct sk_buff *skb, unsigned int offset,
struct sock *sk = skb->sk;
int ret = 0;
- if (splice_grow_spd(pipe, &spd))
- return -ENOMEM;
-
/*
* __skb_splice_bits() only fails if the output has no room left,
* so no point in going over the frag_list for the error case.
@@ -1758,7 +1879,6 @@ done:
lock_sock(sk);
}
- splice_shrink_spd(pipe, &spd);
return ret;
}
@@ -1806,10 +1926,10 @@ int skb_store_bits(struct sk_buff *skb, int offset, const void *from, int len)
if (copy > len)
copy = len;
- vaddr = kmap_skb_frag(frag);
+ vaddr = kmap_atomic(skb_frag_page(frag));
memcpy(vaddr + frag->page_offset + offset - start,
from, copy);
- kunmap_skb_frag(vaddr);
+ kunmap_atomic(vaddr);
if ((len -= copy) == 0)
return 0;
@@ -1869,21 +1989,21 @@ __wsum skb_checksum(const struct sk_buff *skb, int offset,
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
int end;
+ skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
WARN_ON(start > offset + len);
- end = start + skb_frag_size(&skb_shinfo(skb)->frags[i]);
+ end = start + skb_frag_size(frag);
if ((copy = end - offset) > 0) {
__wsum csum2;
u8 *vaddr;
- skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
if (copy > len)
copy = len;
- vaddr = kmap_skb_frag(frag);
+ vaddr = kmap_atomic(skb_frag_page(frag));
csum2 = csum_partial(vaddr + frag->page_offset +
offset - start, copy, 0);
- kunmap_skb_frag(vaddr);
+ kunmap_atomic(vaddr);
csum = csum_block_add(csum, csum2, pos);
if (!(len -= copy))
return csum;
@@ -1955,12 +2075,12 @@ __wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset,
if (copy > len)
copy = len;
- vaddr = kmap_skb_frag(frag);
+ vaddr = kmap_atomic(skb_frag_page(frag));
csum2 = csum_partial_copy_nocheck(vaddr +
frag->page_offset +
offset - start, to,
copy, 0);
- kunmap_skb_frag(vaddr);
+ kunmap_atomic(vaddr);
csum = csum_block_add(csum, csum2, pos);
if (!(len -= copy))
return csum;
@@ -2480,7 +2600,7 @@ next_skb:
if (abs_offset < block_limit) {
if (!st->frag_data)
- st->frag_data = kmap_skb_frag(frag);
+ st->frag_data = kmap_atomic(skb_frag_page(frag));
*data = (u8 *) st->frag_data + frag->page_offset +
(abs_offset - st->stepped_offset);
@@ -2489,7 +2609,7 @@ next_skb:
}
if (st->frag_data) {
- kunmap_skb_frag(st->frag_data);
+ kunmap_atomic(st->frag_data);
st->frag_data = NULL;
}
@@ -2498,7 +2618,7 @@ next_skb:
}
if (st->frag_data) {
- kunmap_skb_frag(st->frag_data);
+ kunmap_atomic(st->frag_data);
st->frag_data = NULL;
}
@@ -2526,7 +2646,7 @@ EXPORT_SYMBOL(skb_seq_read);
void skb_abort_seq_read(struct skb_seq_state *st)
{
if (st->frag_data)
- kunmap_skb_frag(st->frag_data);
+ kunmap_atomic(st->frag_data);
}
EXPORT_SYMBOL(skb_abort_seq_read);
@@ -2574,7 +2694,7 @@ unsigned int skb_find_text(struct sk_buff *skb, unsigned int from,
EXPORT_SYMBOL(skb_find_text);
/**
- * skb_append_datato_frags: - append the user data to a skb
+ * skb_append_datato_frags - append the user data to a skb
* @sk: sock structure
* @skb: skb structure to be appened with user data.
* @getfrag: call back function to be used for getting the user data
@@ -2718,19 +2838,19 @@ struct sk_buff *skb_segment(struct sk_buff *skb, netdev_features_t features)
if (unlikely(!nskb))
goto err;
- hsize = skb_end_pointer(nskb) - nskb->head;
+ hsize = skb_end_offset(nskb);
if (skb_cow_head(nskb, doffset + headroom)) {
kfree_skb(nskb);
goto err;
}
- nskb->truesize += skb_end_pointer(nskb) - nskb->head -
- hsize;
+ nskb->truesize += skb_end_offset(nskb) - hsize;
skb_release_head_state(nskb);
__skb_push(nskb, doffset);
} else {
- nskb = alloc_skb(hsize + doffset + headroom,
- GFP_ATOMIC);
+ nskb = __alloc_skb(hsize + doffset + headroom,
+ GFP_ATOMIC, skb_alloc_rx_flag(skb),
+ NUMA_NO_NODE);
if (unlikely(!nskb))
goto err;
@@ -2843,6 +2963,7 @@ int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb)
unsigned int len = skb_gro_len(skb);
unsigned int offset = skb_gro_offset(skb);
unsigned int headlen = skb_headlen(skb);
+ unsigned int delta_truesize;
if (p->len + len >= 65536)
return -E2BIG;
@@ -2872,11 +2993,41 @@ int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb)
frag->page_offset += offset;
skb_frag_size_sub(frag, offset);
+ /* all fragments truesize : remove (head size + sk_buff) */
+ delta_truesize = skb->truesize -
+ SKB_TRUESIZE(skb_end_offset(skb));
+
skb->truesize -= skb->data_len;
skb->len -= skb->data_len;
skb->data_len = 0;
- NAPI_GRO_CB(skb)->free = 1;
+ NAPI_GRO_CB(skb)->free = NAPI_GRO_FREE;
+ goto done;
+ } else if (skb->head_frag) {
+ int nr_frags = pinfo->nr_frags;
+ skb_frag_t *frag = pinfo->frags + nr_frags;
+ struct page *page = virt_to_head_page(skb->head);
+ unsigned int first_size = headlen - offset;
+ unsigned int first_offset;
+
+ if (nr_frags + 1 + skbinfo->nr_frags > MAX_SKB_FRAGS)
+ return -E2BIG;
+
+ first_offset = skb->data -
+ (unsigned char *)page_address(page) +
+ offset;
+
+ pinfo->nr_frags = nr_frags + 1 + skbinfo->nr_frags;
+
+ frag->page.p = page;
+ frag->page_offset = first_offset;
+ skb_frag_size_set(frag, first_size);
+
+ memcpy(frag + 1, skbinfo->frags, sizeof(*frag) * skbinfo->nr_frags);
+ /* We dont need to clear skbinfo->nr_frags here */
+
+ delta_truesize = skb->truesize - SKB_DATA_ALIGN(sizeof(struct sk_buff));
+ NAPI_GRO_CB(skb)->free = NAPI_GRO_FREE_STOLEN_HEAD;
goto done;
} else if (skb_gro_len(p) != pinfo->gso_size)
return -E2BIG;
@@ -2918,7 +3069,7 @@ int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb)
p = nskb;
merge:
- p->truesize += skb->truesize - len;
+ delta_truesize = skb->truesize;
if (offset > headlen) {
unsigned int eat = offset - headlen;
@@ -2938,7 +3089,7 @@ merge:
done:
NAPI_GRO_CB(p)->count++;
p->data_len += len;
- p->truesize += len;
+ p->truesize += delta_truesize;
p->len += len;
NAPI_GRO_CB(skb)->same_flow = 1;
@@ -3166,7 +3317,7 @@ int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb)
int len = skb->len;
if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
- (unsigned)sk->sk_rcvbuf)
+ (unsigned int)sk->sk_rcvbuf)
return -ENOMEM;
skb_orphan(skb);
@@ -3260,10 +3411,8 @@ bool skb_partial_csum_set(struct sk_buff *skb, u16 start, u16 off)
{
if (unlikely(start > skb_headlen(skb)) ||
unlikely((int)start + off > skb_headlen(skb) - 2)) {
- if (net_ratelimit())
- printk(KERN_WARNING
- "bad partial csum: csum=%u/%u len=%u\n",
- start, off, skb_headlen(skb));
+ net_warn_ratelimited("bad partial csum: csum=%u/%u len=%u\n",
+ start, off, skb_headlen(skb));
return false;
}
skb->ip_summed = CHECKSUM_PARTIAL;
@@ -3275,8 +3424,93 @@ EXPORT_SYMBOL_GPL(skb_partial_csum_set);
void __skb_warn_lro_forwarding(const struct sk_buff *skb)
{
- if (net_ratelimit())
- pr_warning("%s: received packets cannot be forwarded"
- " while LRO is enabled\n", skb->dev->name);
+ net_warn_ratelimited("%s: received packets cannot be forwarded while LRO is enabled\n",
+ skb->dev->name);
}
EXPORT_SYMBOL(__skb_warn_lro_forwarding);
+
+void kfree_skb_partial(struct sk_buff *skb, bool head_stolen)
+{
+ if (head_stolen)
+ kmem_cache_free(skbuff_head_cache, skb);
+ else
+ __kfree_skb(skb);
+}
+EXPORT_SYMBOL(kfree_skb_partial);
+
+/**
+ * skb_try_coalesce - try to merge skb to prior one
+ * @to: prior buffer
+ * @from: buffer to add
+ * @fragstolen: pointer to boolean
+ * @delta_truesize: how much more was allocated than was requested
+ */
+bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from,
+ bool *fragstolen, int *delta_truesize)
+{
+ int i, delta, len = from->len;
+
+ *fragstolen = false;
+
+ if (skb_cloned(to))
+ return false;
+
+ if (len <= skb_tailroom(to)) {
+ BUG_ON(skb_copy_bits(from, 0, skb_put(to, len), len));
+ *delta_truesize = 0;
+ return true;
+ }
+
+ if (skb_has_frag_list(to) || skb_has_frag_list(from))
+ return false;
+
+ if (skb_headlen(from) != 0) {
+ struct page *page;
+ unsigned int offset;
+
+ if (skb_shinfo(to)->nr_frags +
+ skb_shinfo(from)->nr_frags >= MAX_SKB_FRAGS)
+ return false;
+
+ if (skb_head_is_locked(from))
+ return false;
+
+ delta = from->truesize - SKB_DATA_ALIGN(sizeof(struct sk_buff));
+
+ page = virt_to_head_page(from->head);
+ offset = from->data - (unsigned char *)page_address(page);
+
+ skb_fill_page_desc(to, skb_shinfo(to)->nr_frags,
+ page, offset, skb_headlen(from));
+ *fragstolen = true;
+ } else {
+ if (skb_shinfo(to)->nr_frags +
+ skb_shinfo(from)->nr_frags > MAX_SKB_FRAGS)
+ return false;
+
+ delta = from->truesize -
+ SKB_TRUESIZE(skb_end_pointer(from) - from->head);
+ }
+
+ WARN_ON_ONCE(delta < len);
+
+ memcpy(skb_shinfo(to)->frags + skb_shinfo(to)->nr_frags,
+ skb_shinfo(from)->frags,
+ skb_shinfo(from)->nr_frags * sizeof(skb_frag_t));
+ skb_shinfo(to)->nr_frags += skb_shinfo(from)->nr_frags;
+
+ if (!skb_cloned(from))
+ skb_shinfo(from)->nr_frags = 0;
+
+ /* if the skb is cloned this does nothing since we set nr_frags to 0 */
+ for (i = 0; i < skb_shinfo(from)->nr_frags; i++)
+ skb_frag_ref(from, i);
+
+ to->truesize += delta;
+ to->len += len;
+ to->data_len += len;
+
+ *delta_truesize = delta;
+ return true;
+}
+EXPORT_SYMBOL(skb_try_coalesce);