From f557206800801410c30e53ce7a27219b2c4cf0ba Mon Sep 17 00:00:00 2001
From: Herbert Xu
Date: Wed, 14 Jan 2009 20:40:03 -0800
Subject: gro: Fix page ref count for skbs freed normally

When an skb with page frags is merged into an existing one, we
cannibalise its reference count.  This is OK when the skb is reused
because we set nr_frags to zero in that case.  However, for the case
where the skb is freed through kfree_skb, we didn't clear nr_frags
which causes the page to be freed prematurely.

This is fixed by moving the skb resetting into skb_gro_receive.

Reported-by: Jeff Kirsher
Signed-off-by: Herbert Xu
Signed-off-by: David S. Miller
---
 net/core/skbuff.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'net/core/skbuff.c')

diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 5110b359c758..65eac7739033 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -2602,6 +2602,12 @@ int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb)
 		       skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
 
 		skb_shinfo(p)->nr_frags += skb_shinfo(skb)->nr_frags;
+		skb_shinfo(skb)->nr_frags = 0;
+
+		skb->truesize -= skb->data_len;
+		skb->len -= skb->data_len;
+		skb->data_len = 0;
+
 		NAPI_GRO_CB(skb)->free = 1;
 		goto done;
 	}
--
cgit v1.2.3
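The mechanics of this bug are easy to lose in the diff: the merged-from skb
donates its frag pages to the merged-to skb, so exactly one of the two may
drop the page references on free.  Below is a minimal user-space C sketch of
that hand-off; struct mock_skb, mock_kfree_skb() and the refcount field are
simplified stand-ins invented for illustration, not the kernel API.

#include <stdio.h>

struct page { int refcount; };

struct mock_skb {
        int nr_frags;
        struct page *frags[4];
};

static void mock_kfree_skb(struct mock_skb *skb)
{
        for (int i = 0; i < skb->nr_frags; i++)
                skb->frags[i]->refcount--;      /* put_page() analogue */
}

int main(void)
{
        struct page pg = { .refcount = 1 };
        struct mock_skb a = { 0 };
        struct mock_skb b = { .nr_frags = 1, .frags = { &pg } };

        /* Merge: transfer b's frag (and its page reference) into a. */
        a.frags[a.nr_frags++] = b.frags[0];
        b.nr_frags = 0;         /* the fix: b no longer owns the ref */

        mock_kfree_skb(&b);     /* without the fix, refcount hits 0 here */
        printf("page refcount while a still uses it: %d\n", pg.refcount);
        mock_kfree_skb(&a);
        return 0;
}

Commenting out the "b.nr_frags = 0;" line reproduces the premature free the
patch describes: the page refcount reaches zero while a still points at it.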
From 8b9d3728977760f6bd1317c4420890f73695354e Mon Sep 17 00:00:00 2001
From: Jarek Poplawski
Date: Mon, 19 Jan 2009 17:03:56 -0800
Subject: net: Fix data corruption when splicing from sockets.

The trick in socket splicing where we try to convert the skb->data
into a page based reference using virt_to_page() does not work so
well.

The idea is to pass the virt_to_page() reference via the pipe
buffer, and refcount the buffer using a SKB reference.  But if we
are splicing from a socket to a socket (via sendpage) this doesn't
work.

The from side processing will grab the page (and SKB) references.
The sendpage() calls will grab page references only, return, and
then the from side processing completes and drops the SKB ref.

The page based reference to skb->data is not enough to keep the
kmalloc() buffer backing it from being reused.  Yet, that is all
that the socket send side has at this point.

This leads to data corruption if the skb->data buffer is reused by
SLAB before the send side socket actually gets the TX packet out to
the device.

The fix employed here is to simply allocate a page and copy the
skb->data bytes into that page.

This will hurt performance, but there is no clear way to fix this
properly without a copy at the present time, and it is important to
get rid of the data corruption.

With fixes from Herbert Xu.

Tested-by: Willy Tarreau
Foreseen-by: Changli Gao
Diagnosed-by: Willy Tarreau
Reported-by: Willy Tarreau
Fixed-by: Jens Axboe
Signed-off-by: Jarek Poplawski
Signed-off-by: David S. Miller
---
 net/core/skbuff.c | 61 ++++++++++++++++++++++++++-----------------------------
 1 file changed, 29 insertions(+), 32 deletions(-)

(limited to 'net/core/skbuff.c')

diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 65eac7739033..56272ac6dfd8 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -73,17 +73,13 @@ static struct kmem_cache *skbuff_fclone_cache __read_mostly;
 static void sock_pipe_buf_release(struct pipe_inode_info *pipe,
 				  struct pipe_buffer *buf)
 {
-	struct sk_buff *skb = (struct sk_buff *) buf->private;
-
-	kfree_skb(skb);
+	put_page(buf->page);
 }
 
 static void sock_pipe_buf_get(struct pipe_inode_info *pipe,
 				struct pipe_buffer *buf)
 {
-	struct sk_buff *skb = (struct sk_buff *) buf->private;
-
-	skb_get(skb);
+	get_page(buf->page);
 }
 
 static int sock_pipe_buf_steal(struct pipe_inode_info *pipe,
@@ -1334,9 +1330,19 @@ fault:
  */
 static void sock_spd_release(struct splice_pipe_desc *spd, unsigned int i)
 {
-	struct sk_buff *skb = (struct sk_buff *) spd->partial[i].private;
+	put_page(spd->pages[i]);
+}
 
-	kfree_skb(skb);
+static inline struct page *linear_to_page(struct page *page, unsigned int len,
+					  unsigned int offset)
+{
+	struct page *p = alloc_pages(GFP_KERNEL, 0);
+
+	if (!p)
+		return NULL;
+	memcpy(page_address(p) + offset, page_address(page) + offset, len);
+
+	return p;
 }
 
 /*
@@ -1344,16 +1350,23 @@ static void sock_spd_release(struct splice_pipe_desc *spd, unsigned int i)
  */
 static inline int spd_fill_page(struct splice_pipe_desc *spd, struct page *page,
 				unsigned int len, unsigned int offset,
-				struct sk_buff *skb)
+				struct sk_buff *skb, int linear)
 {
 	if (unlikely(spd->nr_pages == PIPE_BUFFERS))
 		return 1;
 
+	if (linear) {
+		page = linear_to_page(page, len, offset);
+		if (!page)
+			return 1;
+	} else
+		get_page(page);
+
 	spd->pages[spd->nr_pages] = page;
 	spd->partial[spd->nr_pages].len = len;
 	spd->partial[spd->nr_pages].offset = offset;
-	spd->partial[spd->nr_pages].private = (unsigned long) skb_get(skb);
 	spd->nr_pages++;
+
 	return 0;
 }

@@ -1369,7 +1382,7 @@ static inline void __segment_seek(struct page **page, unsigned int *poff,
 static inline int __splice_segment(struct page *page, unsigned int poff,
 				   unsigned int plen, unsigned int *off,
 				   unsigned int *len, struct sk_buff *skb,
-				   struct splice_pipe_desc *spd)
+				   struct splice_pipe_desc *spd, int linear)
 {
 	if (!*len)
 		return 1;
@@ -1392,7 +1405,7 @@ static inline int __splice_segment(struct page *page, unsigned int poff,
 		/* the linear region may spread across several pages  */
 		flen = min_t(unsigned int, flen, PAGE_SIZE - poff);
 
-		if (spd_fill_page(spd, page, flen, poff, skb))
+		if (spd_fill_page(spd, page, flen, poff, skb, linear))
 			return 1;
 
 		__segment_seek(&page, &poff, &plen, flen);
@@ -1419,7 +1432,7 @@ static int __skb_splice_bits(struct sk_buff *skb, unsigned int *offset,
 	if (__splice_segment(virt_to_page(skb->data),
 			     (unsigned long) skb->data & (PAGE_SIZE - 1),
 			     skb_headlen(skb),
-			     offset, len, skb, spd))
+			     offset, len, skb, spd, 1))
 		return 1;
 
 	/*
@@ -1429,7 +1442,7 @@ static int __skb_splice_bits(struct sk_buff *skb, unsigned int *offset,
 		const skb_frag_t *f = &skb_shinfo(skb)->frags[seg];
 
 		if (__splice_segment(f->page, f->page_offset, f->size,
-				     offset, len, skb, spd))
+				     offset, len, skb, spd, 0))
 			return 1;
 	}

@@ -1442,7 +1455,7 @@ static int __skb_splice_bits(struct sk_buff *skb, unsigned int *offset,
 * the frag list, if such a thing exists. We'd probably need to recurse to
 * handle that cleanly.
 */
-int skb_splice_bits(struct sk_buff *__skb, unsigned int offset,
+int skb_splice_bits(struct sk_buff *skb, unsigned int offset,
		    struct pipe_inode_info *pipe, unsigned int tlen,
		    unsigned int flags)
 {
@@ -1455,16 +1468,6 @@ int skb_splice_bits(struct sk_buff *__skb, unsigned int offset,
 		.ops = &sock_pipe_buf_ops,
 		.spd_release = sock_spd_release,
 	};
-	struct sk_buff *skb;
-
-	/*
-	 * I'd love to avoid the clone here, but tcp_read_sock()
-	 * ignores reference counts and unconditonally kills the sk_buff
-	 * on return from the actor.
-	 */
-	skb = skb_clone(__skb, GFP_KERNEL);
-	if (unlikely(!skb))
-		return -ENOMEM;
 
 	/*
 	 * __skb_splice_bits() only fails if the output has no room left,
@@ -1488,15 +1491,9 @@ int skb_splice_bits(struct sk_buff *__skb, unsigned int offset,
 	}
 
 done:
-	/*
-	 * drop our reference to the clone, the pipe consumption will
-	 * drop the rest.
-	 */
-	kfree_skb(skb);
-
 	if (spd.nr_pages) {
+		struct sock *sk = skb->sk;
 		int ret;
-		struct sock *sk = __skb->sk;
 
 		/*
 		 * Drop the socket lock, otherwise we have reverse
--
cgit v1.2.3
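The core of the fix is linear_to_page(): rather than letting the pipe hold a
virt_to_page() alias of the kmalloc()ed linear area, it copies the bytes into
a freshly allocated page whose lifetime the pipe controls via get_page() and
put_page().  Here is a user-space sketch of the same strategy, with malloc()
and memcpy() standing in for alloc_pages() and page_address(); every name in
it is illustrative, not a kernel call.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define PAGE_SIZE 4096

/* Analogue of linear_to_page(): give the consumer a private copy so
 * the producer may recycle the original buffer at any time. */
static void *linear_to_private_copy(const char *linear, size_t len,
                                    size_t offset)
{
        char *p = malloc(PAGE_SIZE);    /* stands in for alloc_pages() */

        if (!p)
                return NULL;
        /* Preserve the in-page offset so the consumer's
         * (page, offset, len) view is unchanged. */
        memcpy(p + offset, linear + offset, len);
        return p;
}

int main(void)
{
        char skb_data[32] = "payload";
        char *copy = linear_to_private_copy(skb_data, 7, 0);

        memset(skb_data, 0, sizeof(skb_data));  /* "SLAB reuses the buffer" */
        printf("%s\n", copy);                   /* the copy is still intact */
        free(copy);
        return 0;
}

The design trade-off is exactly the one the commit message states: one extra
copy per linear segment buys correct lifetimes without reworking the pipe
buffer ownership model.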
From 37fe4732b978eb02e5433387a40f2b61706cebe3 Mon Sep 17 00:00:00 2001
From: Herbert Xu
Date: Sat, 17 Jan 2009 19:48:13 +0000
Subject: gro: Fix merging of paged packets

The previous fix to paged packets broke the merging because it
reset the skb->len before we added it to the merged packet.  This
wasn't detected because it simply resulted in the truncation of
the packet while the missing bit is subsequently retransmitted.

The fix is to store skb->len before we clobber it.

Signed-off-by: Herbert Xu
Signed-off-by: David S. Miller
---
 net/core/skbuff.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

(limited to 'net/core/skbuff.c')

diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 56272ac6dfd8..2e5f2ca3bdcd 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -2585,8 +2585,9 @@ int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb)
 	struct sk_buff *nskb;
 	unsigned int headroom;
 	unsigned int hlen = p->data - skb_mac_header(p);
+	unsigned int len = skb->len;
 
-	if (hlen + p->len + skb->len >= 65536)
+	if (hlen + p->len + len >= 65536)
 		return -E2BIG;
 
 	if (skb_shinfo(p)->frag_list)
@@ -2648,9 +2649,9 @@ merge:
 
 done:
 	NAPI_GRO_CB(p)->count++;
-	p->data_len += skb->len;
-	p->truesize += skb->len;
-	p->len += skb->len;
+	p->data_len += len;
+	p->truesize += len;
+	p->len += len;
 
 	NAPI_GRO_CB(skb)->same_flow = 1;
 	return 0;
--
cgit v1.2.3
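The one-line nature of this fix hides the ordering hazard: after the previous
patch, the merge path shrinks skb->len before control reaches the done:
label, so the totals added there must come from a snapshot taken on entry.
A compact user-space C sketch of that discipline follows; mock_skb and
merge() are invented stand-ins, not skb_gro_receive() itself.

#include <assert.h>

struct mock_skb { unsigned int len, data_len; };

/* Donate skb's bytes to p, the way the GRO merge path does. */
static void merge(struct mock_skb *p, struct mock_skb *skb)
{
        unsigned int len = skb->len;    /* the fix: snapshot on entry */

        /* The frag hand-off clobbers the donor's byte counts... */
        skb->len -= skb->data_len;
        skb->data_len = 0;

        /* ...so the merged totals must use the snapshot, not skb->len. */
        p->data_len += len;
        p->len += len;
}

int main(void)
{
        struct mock_skb p = { .len = 100, .data_len = 0 };
        struct mock_skb skb = { .len = 60, .data_len = 40 };

        merge(&p, &skb);
        assert(p.len == 160);   /* with the bug: 100 + (60 - 40) = 120 */
        return 0;
}

As the commit message notes, the buggy version merely truncated the packet,
which TCP quietly repaired by retransmission, which is why it escaped notice.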
From 95e3b24cfb4ec0479d2c42f7a1780d68063a542a Mon Sep 17 00:00:00 2001
From: Herbert Xu
Date: Thu, 29 Jan 2009 16:07:52 -0800
Subject: net: Fix frag_list handling in skb_seq_read

The frag_list handling was broken in skb_seq_read:

1) We didn't add the stepped offset when looking at the head
   area of fragments other than the first.

2) We didn't take the stepped offset away when setting the data
   pointer in the head area.

3) The frag index wasn't reset.

This patch fixes these issues.

Signed-off-by: Herbert Xu
Signed-off-by: David S. Miller
---
 net/core/skbuff.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'net/core/skbuff.c')

diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 2e5f2ca3bdcd..f23fd43539ed 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -2212,10 +2212,10 @@ unsigned int skb_seq_read(unsigned int consumed, const u8 **data,
 		return 0;
 
 next_skb:
-	block_limit = skb_headlen(st->cur_skb);
+	block_limit = skb_headlen(st->cur_skb) + st->stepped_offset;
 
 	if (abs_offset < block_limit) {
-		*data = st->cur_skb->data + abs_offset;
+		*data = st->cur_skb->data + (abs_offset - st->stepped_offset);
 		return block_limit - abs_offset;
 	}
@@ -2257,6 +2257,7 @@ next_skb:
 	} else if (st->root_skb == st->cur_skb &&
 		   skb_shinfo(st->root_skb)->frag_list) {
 		st->cur_skb = skb_shinfo(st->root_skb)->frag_list;
+		st->frag_idx = 0;
 		goto next_skb;
 	}
--
cgit v1.2.3

From 71b3346d182355f19509fadb8fe45114a35cc499 Mon Sep 17 00:00:00 2001
From: Shyam Iyer
Date: Thu, 29 Jan 2009 16:12:42 -0800
Subject: net: Fix OOPS in skb_seq_read().

It oopsed for me in skb_seq_read.  addr2line said it was
linux-2.6/net/core/skbuff.c:2228, which is this line:

	while (st->frag_idx < skb_shinfo(st->cur_skb)->nr_frags) {

I added some printks in there and it looks like we hit this:

	} else if (st->root_skb == st->cur_skb &&
		   skb_shinfo(st->root_skb)->frag_list) {
		st->cur_skb = skb_shinfo(st->root_skb)->frag_list;
		st->frag_idx = 0;
		goto next_skb;
	}

I did some testing with a few printks and found that
st->cur_skb->data was 0, so the pointer used by iscsi_tcp was NULL.
This caused the kernel panic.

	if (abs_offset < block_limit) {
	-	*data = st->cur_skb->data + abs_offset;
	+	*data = st->cur_skb->data + (abs_offset - st->stepped_offset);

With debug_tcp enabled, a few more printks showed that the code
never reached the next_skb label; instead it hit this if condition:

	if (st->cur_skb->next) {
		st->cur_skb = st->cur_skb->next;
		st->frag_idx = 0;
		goto next_skb;

So the st pointer was shifted to the next skb, whereas it should
have hit the second else if first, since the data is in the
frag_list:

	else if (st->root_skb == st->cur_skb &&
		 skb_shinfo(st->root_skb)->frag_list) {
		st->cur_skb = skb_shinfo(st->root_skb)->frag_list;
		goto next_skb;
	}

Reversing the two conditions, the attached patch fixes the issue
for me on top of Herbert's patches.

Signed-off-by: Shyam Iyer
Signed-off-by: Herbert Xu
Signed-off-by: David S. Miller
---
 net/core/skbuff.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'net/core/skbuff.c')

diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index f23fd43539ed..da74b844f4ea 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -2250,13 +2250,13 @@ next_skb:
 		st->frag_data = NULL;
 	}
 
-	if (st->cur_skb->next) {
-		st->cur_skb = st->cur_skb->next;
+	if (st->root_skb == st->cur_skb &&
+	    skb_shinfo(st->root_skb)->frag_list) {
+		st->cur_skb = skb_shinfo(st->root_skb)->frag_list;
 		st->frag_idx = 0;
 		goto next_skb;
-	} else if (st->root_skb == st->cur_skb &&
-		   skb_shinfo(st->root_skb)->frag_list) {
-		st->cur_skb = skb_shinfo(st->root_skb)->frag_list;
+	} else if (st->cur_skb->next) {
+		st->cur_skb = st->cur_skb->next;
 		st->frag_idx = 0;
 		goto next_skb;
 	}
--
cgit v1.2.3
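Taken together, these two skb_seq_read() fixes pin down the traversal rules:
offsets are absolute across the chain of buffers, so the head-area test must
add the bytes already stepped over (and the data pointer must subtract them),
and the walk must descend into the root skb's frag_list before following
->next.  The sketch below walks mock buffers under those rules; it is a
simplified user-space model (page frags and frag unmapping omitted), not the
kernel function.

#include <stdio.h>

struct mock_skb {
        const char *data;
        unsigned int headlen;
        struct mock_skb *frag_list;     /* only meaningful on the root */
        struct mock_skb *next;
};

/* Return a pointer to the byte at abs_offset across root + frag_list,
 * mirroring the fixed logic: block_limit includes the bytes already
 * stepped over, and the data pointer subtracts them. */
static const char *seq_read(struct mock_skb *root, unsigned int abs_offset)
{
        struct mock_skb *cur = root;
        unsigned int stepped = 0;

        for (;;) {
                unsigned int block_limit = cur->headlen + stepped;

                if (abs_offset < block_limit)
                        return cur->data + (abs_offset - stepped);
                stepped = block_limit;

                /* Fixed order: the root's frag_list first, then ->next. */
                if (cur == root && root->frag_list)
                        cur = root->frag_list;
                else if (cur->next)
                        cur = cur->next;
                else
                        return NULL;
        }
}

int main(void)
{
        struct mock_skb second = { .data = "world", .headlen = 5 };
        struct mock_skb root = { .data = "hello", .headlen = 5,
                                 .frag_list = &second };

        printf("%c\n", *seq_read(&root, 6));    /* 'o' from "world" */
        return 0;
}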
From 92a0acce186cde8ead56c6915d9479773673ea1a Mon Sep 17 00:00:00 2001
From: "David S. Miller"
Date: Tue, 17 Feb 2009 21:24:05 -0800
Subject: net: Kill skb_truesize_check(), it only catches false-positives.

A long time ago we had bugs, primarily in TCP, where we would modify
skb->truesize (for TSO queue collapsing) in ways which would corrupt
the socket memory accounting.

skb_truesize_check() was added in order to try and catch this error
more systematically.

However this debugging check has morphed into a Frankenstein of sorts
and these days it does nothing other than catch false-positives.

Signed-off-by: David S. Miller
---
 include/linux/skbuff.h | 9 ---------
 include/net/sock.h     | 1 -
 net/core/skbuff.c      | 8 --------
 net/core/sock.c        | 1 -
 4 files changed, 19 deletions(-)

(limited to 'net/core/skbuff.c')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index cf2cb50f77d1..9dcf956ad18a 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -416,15 +416,6 @@ extern void skb_over_panic(struct sk_buff *skb, int len,
 			   void *here);
 extern void skb_under_panic(struct sk_buff *skb, int len,
 			    void *here);
-extern void skb_truesize_bug(struct sk_buff *skb);
-
-static inline void skb_truesize_check(struct sk_buff *skb)
-{
-	int len = sizeof(struct sk_buff) + skb->len;
-
-	if (unlikely((int)skb->truesize < len))
-		skb_truesize_bug(skb);
-}
 
 extern int skb_append_datato_frags(struct sock *sk, struct sk_buff *skb,
 				   int getfrag(void *from, char *to, int offset,
diff --git a/include/net/sock.h b/include/net/sock.h
index ce3b5b622683..eefeeaf7fc46 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -860,7 +860,6 @@ static inline void sk_mem_uncharge(struct sock *sk, int size)
 
 static inline void sk_wmem_free_skb(struct sock *sk, struct sk_buff *skb)
 {
-	skb_truesize_check(skb);
 	sock_set_flag(sk, SOCK_QUEUE_SHRUNK);
 	sk->sk_wmem_queued -= skb->truesize;
 	sk_mem_uncharge(sk, skb->truesize);
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index da74b844f4ea..c6a6b166f8d6 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -143,14 +143,6 @@ void skb_under_panic(struct sk_buff *skb, int sz, void *here)
 	BUG();
 }
 
-void skb_truesize_bug(struct sk_buff *skb)
-{
-	WARN(net_ratelimit(), KERN_ERR "SKB BUG: Invalid truesize (%u) "
-	     "len=%u, sizeof(sk_buff)=%Zd\n",
-	     skb->truesize, skb->len, sizeof(struct sk_buff));
-}
-EXPORT_SYMBOL(skb_truesize_bug);
-
 /* Allocate a new skbuff. We do this ourselves so we can fill in a few
  * 'private' fields and also do memory statistics to find all the
  * [BEEP] leaks.
diff --git a/net/core/sock.c b/net/core/sock.c
index 6f2e1337975d..6e4f14d1ef81 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1137,7 +1137,6 @@ void sock_rfree(struct sk_buff *skb)
 {
 	struct sock *sk = skb->sk;
 
-	skb_truesize_check(skb);
 	atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
 	sk_mem_uncharge(skb->sk, skb->truesize);
 }
--
cgit v1.2.3
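For context, the invariant the removed check was guarding: whatever value of
skb->truesize a socket charges when queueing a buffer must be the value it
subtracts when the buffer is freed, as in sk_wmem_free_skb() and sock_rfree()
above.  A user-space sketch of that pairing, with mock_sock and mock_skb as
illustrative stand-ins for the kernel structures:

#include <assert.h>

struct mock_sock { int wmem_queued; };
struct mock_skb { int truesize; };

static void charge(struct mock_sock *sk, struct mock_skb *skb)
{
        sk->wmem_queued += skb->truesize;
}

static void uncharge(struct mock_sock *sk, struct mock_skb *skb)
{
        /* Balanced only if truesize is unchanged since charge time;
         * the historical TCP bugs modified it in between. */
        sk->wmem_queued -= skb->truesize;
}

int main(void)
{
        struct mock_sock sk = { 0 };
        struct mock_skb skb = { .truesize = 2048 };

        charge(&sk, &skb);
        uncharge(&sk, &skb);
        assert(sk.wmem_queued == 0);
        return 0;
}

The deleted skb_truesize_check() instead compared truesize against
sizeof(struct sk_buff) + skb->len, a relation that, per the commit message,
no longer held for legitimate buffers, hence the false positives.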