From f982f91516fa4cfd9d20518833cd04ad714585be Mon Sep 17 00:00:00 2001 From: Clemens Ladisch Date: Tue, 21 Jun 2011 22:09:50 +0200 Subject: mm: fix wrong vmap address calculations with odd NR_CPUS values Commit db64fe02258f ("mm: rewrite vmap layer") introduced code that does address calculations under the assumption that VMAP_BLOCK_SIZE is a power of two. However, this might not be true if CONFIG_NR_CPUS is not set to a power of two. Wrong vmap_block index/offset values could lead to memory corruption. However, this has never been observed in practice (or never been diagnosed correctly); what caught this was the BUG_ON in vb_alloc() that checks for inconsistent vmap_block indices. To fix this, ensure that VMAP_BLOCK_SIZE always is a power of two. BugLink: https://bugzilla.kernel.org/show_bug.cgi?id=31572 Reported-by: Pavel Kysilka Reported-by: Matias A. Fonzo Signed-off-by: Clemens Ladisch Signed-off-by: Stefan Richter Cc: Nick Piggin Cc: Jeremy Fitzhardinge Cc: Krzysztof Helt Cc: Andrew Morton Cc: 2.6.28+ Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'mm/vmalloc.c') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 464621d18eb2..7ef0903058ee 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -725,9 +725,10 @@ static void free_unmap_vmap_area_addr(unsigned long addr) #define VMAP_BBMAP_BITS_MIN (VMAP_MAX_ALLOC*2) #define VMAP_MIN(x, y) ((x) < (y) ? (x) : (y)) /* can't use min() */ #define VMAP_MAX(x, y) ((x) > (y) ? (x) : (y)) /* can't use max() */ -#define VMAP_BBMAP_BITS VMAP_MIN(VMAP_BBMAP_BITS_MAX, \ - VMAP_MAX(VMAP_BBMAP_BITS_MIN, \ - VMALLOC_PAGES / NR_CPUS / 16)) +#define VMAP_BBMAP_BITS \ + VMAP_MIN(VMAP_BBMAP_BITS_MAX, \ + VMAP_MAX(VMAP_BBMAP_BITS_MIN, \ + VMALLOC_PAGES / roundup_pow_of_two(NR_CPUS) / 16)) #define VMAP_BLOCK_SIZE (VMAP_BBMAP_BITS * PAGE_SIZE) -- cgit v1.2.3 From 461ae488ecb125b140d7ea29ceeedbcce9327003 Mon Sep 17 00:00:00 2001 From: David Vrabel Date: Wed, 14 Sep 2011 16:22:02 -0700 Subject: mm: sync vmalloc address space page tables in alloc_vm_area() Xen backend drivers (e.g., blkback and netback) would sometimes fail to map grant pages into the vmalloc address space allocated with alloc_vm_area(). The GNTTABOP_map_grant_ref would fail because Xen could not find the page (in the L2 table) containing the PTEs it needed to update. (XEN) mm.c:3846:d0 Could not find L1 PTE for address fbb42000 netback and blkback were making the hypercall from a kernel thread where task->active_mm != &init_mm and alloc_vm_area() was only updating the page tables for init_mm. The usual method of deferring the update to the page tables of other processes (i.e., after taking a fault) doesn't work as a fault cannot occur during the hypercall. This would work on some systems depending on what else was using vmalloc. Fix this by reverting ef691947d8a3 ("vmalloc: remove vmalloc_sync_all() from alloc_vm_area()") and add a comment to explain why it's needed. Signed-off-by: David Vrabel Cc: Jeremy Fitzhardinge Cc: Konrad Rzeszutek Wilk Cc: Ian Campbell Cc: Keir Fraser Cc: [3.0.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'mm/vmalloc.c') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 7ef0903058ee..5016f19e1661 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2140,6 +2140,14 @@ struct vm_struct *alloc_vm_area(size_t size) return NULL; } + /* + * If the allocated address space is passed to a hypercall + * before being used then we cannot rely on a page fault to + * trigger an update of the page tables. So sync all the page + * tables here. + */ + vmalloc_sync_all(); + return area; } EXPORT_SYMBOL_GPL(alloc_vm_area); -- cgit v1.2.3 From f5252e009d5b87071a919221e4f6624184005368 Mon Sep 17 00:00:00 2001 From: Mitsuo Hayasaka Date: Mon, 31 Oct 2011 17:08:13 -0700 Subject: mm: avoid null pointer access in vm_struct via /proc/vmallocinfo The /proc/vmallocinfo shows information about vmalloc allocations in vmlist that is a linklist of vm_struct. It, however, may access pages field of vm_struct where a page was not allocated. This results in a null pointer access and leads to a kernel panic. Why this happens: In __vmalloc_node_range() called from vmalloc(), newly allocated vm_struct is added to vmlist at __get_vm_area_node() and then, some fields of vm_struct such as nr_pages and pages are set at __vmalloc_area_node(). In other words, it is added to vmlist before it is fully initialized. At the same time, when the /proc/vmallocinfo is read, it accesses the pages field of vm_struct according to the nr_pages field at show_numa_info(). Thus, a null pointer access happens. The patch adds the newly allocated vm_struct to the vmlist *after* it is fully initialized. So, it can avoid accessing the pages field with unallocated page when show_numa_info() is called. Signed-off-by: Mitsuo Hayasaka Cc: Andrew Morton Cc: David Rientjes Cc: Namhyung Kim Cc: "Paul E. McKenney" Cc: Jeremy Fitzhardinge Cc: Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vmalloc.h | 1 + mm/vmalloc.c | 65 ++++++++++++++++++++++++++++++++++++------------- 2 files changed, 49 insertions(+), 17 deletions(-) (limited to 'mm/vmalloc.c') diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 9332e52ea8c2..687fb11e2010 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -13,6 +13,7 @@ struct vm_area_struct; /* vma defining user mapping in mm_types.h */ #define VM_MAP 0x00000004 /* vmap()ed pages */ #define VM_USERMAP 0x00000008 /* suitable for remap_vmalloc_range */ #define VM_VPAGES 0x00000010 /* buffer for pages was vmalloc'ed */ +#define VM_UNLIST 0x00000020 /* vm_struct is not listed in vmlist */ /* bits [20..32] reserved for arch specific ioremap internals */ /* diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 5016f19e1661..56faf3163ee2 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1253,18 +1253,22 @@ EXPORT_SYMBOL_GPL(map_vm_area); DEFINE_RWLOCK(vmlist_lock); struct vm_struct *vmlist; -static void insert_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, +static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, unsigned long flags, void *caller) { - struct vm_struct *tmp, **p; - vm->flags = flags; vm->addr = (void *)va->va_start; vm->size = va->va_end - va->va_start; vm->caller = caller; va->private = vm; va->flags |= VM_VM_AREA; +} + +static void insert_vmalloc_vmlist(struct vm_struct *vm) +{ + struct vm_struct *tmp, **p; + vm->flags &= ~VM_UNLIST; write_lock(&vmlist_lock); for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) { if (tmp->addr >= vm->addr) @@ -1275,6 +1279,13 @@ static void insert_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, write_unlock(&vmlist_lock); } +static void insert_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, + unsigned long flags, void *caller) +{ + setup_vmalloc_vm(vm, va, flags, caller); + insert_vmalloc_vmlist(vm); +} + static struct vm_struct *__get_vm_area_node(unsigned long size, unsigned long align, unsigned long flags, unsigned long start, unsigned long end, int node, gfp_t gfp_mask, void *caller) @@ -1313,7 +1324,18 @@ static struct vm_struct *__get_vm_area_node(unsigned long size, return NULL; } - insert_vmalloc_vm(area, va, flags, caller); + /* + * When this function is called from __vmalloc_node_range, + * we do not add vm_struct to vmlist here to avoid + * accessing uninitialized members of vm_struct such as + * pages and nr_pages fields. They will be set later. + * To distinguish it from others, we use a VM_UNLIST flag. + */ + if (flags & VM_UNLIST) + setup_vmalloc_vm(area, va, flags, caller); + else + insert_vmalloc_vm(area, va, flags, caller); + return area; } @@ -1381,17 +1403,20 @@ struct vm_struct *remove_vm_area(const void *addr) va = find_vmap_area((unsigned long)addr); if (va && va->flags & VM_VM_AREA) { struct vm_struct *vm = va->private; - struct vm_struct *tmp, **p; - /* - * remove from list and disallow access to this vm_struct - * before unmap. (address range confliction is maintained by - * vmap.) - */ - write_lock(&vmlist_lock); - for (p = &vmlist; (tmp = *p) != vm; p = &tmp->next) - ; - *p = tmp->next; - write_unlock(&vmlist_lock); + + if (!(vm->flags & VM_UNLIST)) { + struct vm_struct *tmp, **p; + /* + * remove from list and disallow access to + * this vm_struct before unmap. (address range + * confliction is maintained by vmap.) + */ + write_lock(&vmlist_lock); + for (p = &vmlist; (tmp = *p) != vm; p = &tmp->next) + ; + *p = tmp->next; + write_unlock(&vmlist_lock); + } vmap_debug_free_range(va->va_start, va->va_end); free_unmap_vmap_area(va); @@ -1602,14 +1627,20 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, if (!size || (size >> PAGE_SHIFT) > totalram_pages) return NULL; - area = __get_vm_area_node(size, align, VM_ALLOC, start, end, node, - gfp_mask, caller); + area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNLIST, + start, end, node, gfp_mask, caller); if (!area) return NULL; addr = __vmalloc_area_node(area, gfp_mask, prot, node, caller); + /* + * In this function, newly allocated vm_struct is not added + * to vmlist at __get_vm_area_node(). so, it is added here. + */ + insert_vmalloc_vmlist(area); + /* * A ref_count = 3 is needed because the vm_struct and vmap_area * structures allocated in the __get_vm_area_node() function contain -- cgit v1.2.3 From 3ee9a4f086716d792219c021e8509f91165a4128 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Mon, 31 Oct 2011 17:08:35 -0700 Subject: mm: neaten warn_alloc_failed Add __attribute__((format (printf...) to the function to validate format and arguments. Use vsprintf extension %pV to avoid any possible message interleaving. Coalesce format string. Convert printks/pr_warning to pr_warn. [akpm@linux-foundation.org: use the __printf() macro] Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 3 ++- mm/page_alloc.c | 16 +++++++++++----- mm/vmalloc.c | 4 ++-- 3 files changed, 15 insertions(+), 8 deletions(-) (limited to 'mm/vmalloc.c') diff --git a/include/linux/mm.h b/include/linux/mm.h index 7438071b44aa..3b3e3b8bb706 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1334,7 +1334,8 @@ extern void si_meminfo(struct sysinfo * val); extern void si_meminfo_node(struct sysinfo *val, int nid); extern int after_bootmem; -extern void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...); +extern __printf(3, 4) +void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...); extern void setup_per_cpu_pageset(void); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 83a02052bce4..9dd443d89d8b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1754,7 +1754,6 @@ static DEFINE_RATELIMIT_STATE(nopage_rs, void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...) { - va_list args; unsigned int filter = SHOW_MEM_FILTER_NODES; if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs)) @@ -1773,14 +1772,21 @@ void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...) filter &= ~SHOW_MEM_FILTER_NODES; if (fmt) { - printk(KERN_WARNING); + struct va_format vaf; + va_list args; + va_start(args, fmt); - vprintk(fmt, args); + + vaf.fmt = fmt; + vaf.va = &args; + + pr_warn("%pV", &vaf); + va_end(args); } - pr_warning("%s: page allocation failure: order:%d, mode:0x%x\n", - current->comm, order, gfp_mask); + pr_warn("%s: page allocation failure: order:%d, mode:0x%x\n", + current->comm, order, gfp_mask); dump_stack(); if (!should_suppress_show_mem()) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 56faf3163ee2..08ab0aa1406c 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1593,8 +1593,8 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, return area->addr; fail: - warn_alloc_failed(gfp_mask, order, "vmalloc: allocation failure, " - "allocated %ld of %ld bytes\n", + warn_alloc_failed(gfp_mask, order, + "vmalloc: allocation failure, allocated %ld of %ld bytes\n", (area->nr_pages*PAGE_SIZE), area->size); vfree(area->addr); return NULL; -- cgit v1.2.3 From de7d2b567d040e3b67fe7121945982f14343213d Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Mon, 31 Oct 2011 17:08:48 -0700 Subject: mm/vmalloc.c: report more vmalloc failures Some vmalloc failure paths do not report OOM conditions. Add warn_alloc_failed, which also does a dump_stack, to those failure paths. This allows more site specific vmalloc failure logging message printks to be removed. Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'mm/vmalloc.c') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 08ab0aa1406c..b669aa6f6caf 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1625,13 +1625,12 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, size = PAGE_ALIGN(size); if (!size || (size >> PAGE_SHIFT) > totalram_pages) - return NULL; + goto fail; area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNLIST, start, end, node, gfp_mask, caller); - if (!area) - return NULL; + goto fail; addr = __vmalloc_area_node(area, gfp_mask, prot, node, caller); @@ -1649,6 +1648,12 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, kmemleak_alloc(addr, real_size, 3, gfp_mask); return addr; + +fail: + warn_alloc_failed(gfp_mask, 0, + "vmalloc: allocation failure: %lu bytes\n", + real_size); + return NULL; } /** -- cgit v1.2.3