From 6ca27b795b43a5dfef98c970da9ca7b12a71ba72 Mon Sep 17 00:00:00 2001 From: Samuel Thibault Date: Fri, 30 Jan 2015 13:11:14 +1100 Subject: input: route kbd LEDs through the generic LEDs layer This permits to reassign keyboard LEDs to something else than keyboard "leds" state, by adding keyboard led and modifier triggers connected to a series of VT input LEDs, themselves connected to VT input triggers, which per-input device LEDs use by default. Userland can thus easily change the LED behavior of (a priori) all input devices, or of particular input devices. This also permits to fix #7063 from userland by using a modifier to implement proper CapsLock behavior and have the keyboard caps lock led show that modifier state. [ebroder@mokafive.com: Rebased to 3.2-rc1 or so, cleaned up some includes, and fixed some constants] [blogic@openwrt.org: CONFIG_INPUT_LEDS stubs should be static inline] [akpm@linux-foundation.org: remove unneeded `extern', fix comment layout] Signed-off-by: Samuel Thibault Signed-off-by: Evan Broder Signed-off-by: John Crispin Reviewed-by: David Herrmann Tested-by: Pavel Machek Acked-by: Peter Korsgaard Cc: Pavel Machek Cc: Dmitry Torokhov Cc: Bryan Wu Cc: Arnaud Patard Cc: Sascha Hauer Cc: Matt Sealey Cc: Rob Clark Cc: Niels de Vos Cc: Steev Klimaszewski Signed-off-by: Andrew Morton --- include/linux/input.h | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) (limited to 'include') diff --git a/include/linux/input.h b/include/linux/input.h index 82ce323b9986..3b4c32f7312a 100644 --- a/include/linux/input.h +++ b/include/linux/input.h @@ -79,6 +79,7 @@ struct input_value { * @led: reflects current state of device's LEDs * @snd: reflects current state of sound effects * @sw: reflects current state of device's switches + * @leds: leds objects for the device's LEDs * @open: this method is called when the very first user calls * input_open_device(). The driver must prepare the device * to start generating events (start polling thread, @@ -164,6 +165,8 @@ struct input_dev { unsigned long snd[BITS_TO_LONGS(SND_CNT)]; unsigned long sw[BITS_TO_LONGS(SW_CNT)]; + struct led_classdev *leds; + int (*open)(struct input_dev *dev); void (*close)(struct input_dev *dev); int (*flush)(struct input_dev *dev, struct file *file); @@ -531,4 +534,29 @@ int input_ff_erase(struct input_dev *dev, int effect_id, struct file *file); int input_ff_create_memless(struct input_dev *dev, void *data, int (*play_effect)(struct input_dev *, void *, struct ff_effect *)); +#ifdef CONFIG_INPUT_LEDS + +void input_led_init(void); +void input_led_exit(void); + +int input_led_connect(struct input_dev *dev); +void input_led_disconnect(struct input_dev *dev); + +#else + +static inline void input_led_init(void) { } + +static inline void input_led_exit(void) { } + +static inline int input_led_connect(struct input_dev *dev) +{ + return 0; +} + +static inline void input_led_disconnect(struct input_dev *dev) +{ +} + +#endif + #endif -- cgit v1.2.3 From 08e904dd6043dd95f3642b6f8da04f35efd94e3c Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Fri, 30 Jan 2015 13:11:20 +1100 Subject: mm: don't use compound_head() in virt_to_head_page() compound_head() is implemented with assumption that there would be race condition when checking tail flag. This assumption is only true when we try to access arbitrary positioned struct page. The situation that virt_to_head_page() is called is different case. We call virt_to_head_page() only in the range of allocated pages, so there is no race condition on tail flag. 
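For reference, a minimal sketch of the two variants side by side, paraphrased from the hunks below; the comments are mine, and the smp_rmb()-based race handling is assumed to live inside compound_head_by_tail(), as the follow-up comment patch in this series states:

	/* Safe against an asynchronous THP split or an arbitrary struct page:
	 * compound_head_by_tail() issues smp_rmb() and re-checks the tail flag. */
	static inline struct page *compound_head(struct page *page)
	{
		if (unlikely(PageTail(page)))
			return compound_head_by_tail(page);
		return page;
	}

	/* No barrier: only valid when the caller already pins the page (e.g. it
	 * owns the allocation), so the tail flag cannot change underneath us. */
	static inline struct page *compound_head_fast(struct page *page)
	{
		if (unlikely(PageTail(page)))
			return page->first_page;
		return page;
	}
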
In this case, we don't need to handle race condition and we can reduce overhead slightly. This patch implements compound_head_fast() which is similar with compound_head() except tail flag race handling. And then, virt_to_head_page() uses this optimized function to improve performance. I saw 1.8% win in a fast-path loop over kmem_cache_alloc/free, (14.063 ns -> 13.810 ns) if target object is on tail page. Signed-off-by: Joonsoo Kim Acked-by: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Jesper Dangaard Brouer Signed-off-by: Andrew Morton --- include/linux/mm.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index dd5ea3016fc4..ecc7e377c347 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -453,6 +453,13 @@ static inline struct page *compound_head(struct page *page) return page; } +static inline struct page *compound_head_fast(struct page *page) +{ + if (unlikely(PageTail(page))) + return page->first_page; + return page; +} + /* * The atomic page->_mapcount, starts from -1: so that transitions * both from it and to it can be tracked, using atomic_inc_and_test @@ -531,7 +538,8 @@ static inline void get_page(struct page *page) static inline struct page *virt_to_head_page(const void *x) { struct page *page = virt_to_page(x); - return compound_head(page); + + return compound_head_fast(page); } /* -- cgit v1.2.3 From 0dd911986cb4c6a41278f9e29b6ed6638c2c79d5 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Fri, 30 Jan 2015 13:11:21 +1100 Subject: mm: don't use compound_head() in virt_to_head_page() Change from v2: Add some code comments Signed-off-by: Joonsoo Kim Acked-by: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Jesper Dangaard Brouer Signed-off-by: Andrew Morton --- include/linux/mm.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index ecc7e377c347..2c6fd3c5424a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -446,6 +446,12 @@ static inline struct page *compound_head_by_tail(struct page *tail) return tail; } +/* + * Since either compound page could be dismantled asynchronously in THP + * or we access asynchronously arbitrary positioned struct page, there + * would be tail flag race. To handle this race, we should call + * smp_rmb() before checking tail flag. compound_head_by_tail() did it. + */ static inline struct page *compound_head(struct page *page) { if (unlikely(PageTail(page))) @@ -453,6 +459,11 @@ static inline struct page *compound_head(struct page *page) return page; } +/* + * If we access compound page synchronously such as access to + * allocated page, there is no need to handle tail flag race, so we can + * check tail flag directly without any synchronization primitive. + */ static inline struct page *compound_head_fast(struct page *page) { if (unlikely(PageTail(page))) @@ -539,6 +550,12 @@ static inline struct page *virt_to_head_page(const void *x) { struct page *page = virt_to_page(x); + /* + * We don't need to worry about synchronization of tail flag + * when we call virt_to_head_page() since it is only called for + * already allocated page and this page won't be freed until + * this virt_to_head_page() is finished. So use _fast variant. + */ return compound_head_fast(page); } -- cgit v1.2.3 From 2f448ab6b87cf1bd34efe26d5930085ae19414e6 Mon Sep 17 00:00:00 2001 From: "Kirill A. 
Shutemov" Date: Fri, 30 Jan 2015 13:11:22 +1100 Subject: mm: replace remap_file_pages() syscall with emulation remap_file_pages(2) was invented to be able efficiently map parts of huge file into limited 32-bit virtual address space such as in database workloads. Nonlinear mappings are pain to support and it seems there's no legitimate use-cases nowadays since 64-bit systems are widely available. Let's drop it and get rid of all these special-cased code. The patch replaces the syscall with emulation which creates new VMA on each remap_file_pages(), unless they it can be merged with an adjacent one. I didn't find *any* real code that uses remap_file_pages(2) to test emulation impact on. I've checked Debian code search and source of all packages in ALT Linux. No real users: libc wrappers, mentions in strace, gdb, valgrind and this kind of stuff. There are few basic tests in LTP for the syscall. They work just fine with emulation. To test performance impact, I've written small test case which demonstrate pretty much worst case scenario: map 4G shmfs file, write to begin of every page pgoff of the page, remap pages in reverse order, read every page. The test creates 1 million of VMAs if emulation is in use, so I had to set vm.max_map_count to 1100000 to avoid -ENOMEM. Before: 23.3 ( +- 4.31% ) seconds After: 43.9 ( +- 0.85% ) seconds Slowdown: 1.88x I believe we can live with that. Test case: #define _GNU_SOURCE #include #include #include #include #define MB (1024UL * 1024) #define SIZE (4096 * MB) int main(int argc, char **argv) { unsigned long *p; long i, pass; for (pass = 0; pass < 10; pass++) { p = mmap(NULL, SIZE, PROT_READ|PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0); if (p == MAP_FAILED) { perror("mmap"); return -1; } for (i = 0; i < SIZE / 4096; i++) p[i * 4096 / sizeof(*p)] = i; for (i = 0; i < SIZE / 4096; i++) { if (remap_file_pages(p + i * 4096 / sizeof(*p), 4096, 0, (SIZE - 4096 * (i + 1)) >> 12, 0)) { perror("remap_file_pages"); return -1; } } for (i = SIZE / 4096 - 1; i >= 0; i--) assert(p[i * 4096 / sizeof(*p)] == SIZE / 4096 - i - 1); munmap(p, SIZE); } return 0; } [akpm@linux-foundation.org: fix spello] [sasha.levin@oracle.com: initialize populate before usage] [sasha.levin@oracle.com: grab file ref to prevent race while mmaping] Signed-off-by: "Kirill A. Shutemov" Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Dave Jones Cc: Linus Torvalds Cc: Armin Rigo Signed-off-by: Sasha Levin Cc: Hugh Dickins Signed-off-by: Andrew Morton --- Documentation/vm/remap_file_pages.txt | 7 +- include/linux/fs.h | 8 +- mm/Makefile | 2 +- mm/fremap.c | 283 ---------------------------------- mm/mmap.c | 69 +++++++++ mm/nommu.c | 8 - 6 files changed, 79 insertions(+), 298 deletions(-) delete mode 100644 mm/fremap.c (limited to 'include') diff --git a/Documentation/vm/remap_file_pages.txt b/Documentation/vm/remap_file_pages.txt index 560e4363a55d..f609142f406a 100644 --- a/Documentation/vm/remap_file_pages.txt +++ b/Documentation/vm/remap_file_pages.txt @@ -18,10 +18,9 @@ on 32-bit systems to map files bigger than can linearly fit into 32-bit virtual address space. This use-case is not critical anymore since 64-bit systems are widely available. -The plan is to deprecate the syscall and replace it with an emulation. -The emulation will create new VMAs instead of nonlinear mappings. It's -going to work slower for rare users of remap_file_pages() but ABI is -preserved. +The syscall is deprecated and replaced it with an emulation now. The +emulation creates new VMAs instead of nonlinear mappings. 
It's going to +work slower for rare users of remap_file_pages() but ABI is preserved. One side effect of emulation (apart from performance) is that user can hit vm.max_map_count limit more easily due to additional VMAs. See comment for diff --git a/include/linux/fs.h b/include/linux/fs.h index 42efe13077b6..60c4996df7f3 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2481,8 +2481,12 @@ extern int sb_min_blocksize(struct super_block *, int); extern int generic_file_mmap(struct file *, struct vm_area_struct *); extern int generic_file_readonly_mmap(struct file *, struct vm_area_struct *); -extern int generic_file_remap_pages(struct vm_area_struct *, unsigned long addr, - unsigned long size, pgoff_t pgoff); +static inline int generic_file_remap_pages(struct vm_area_struct *vma, + unsigned long addr, unsigned long size, pgoff_t pgoff) +{ + BUG(); + return 0; +} int generic_write_checks(struct file *file, loff_t *pos, size_t *count, int isblk); extern ssize_t generic_file_read_iter(struct kiocb *, struct iov_iter *); extern ssize_t __generic_file_write_iter(struct kiocb *, struct iov_iter *); diff --git a/mm/Makefile b/mm/Makefile index 4bf586e66378..3548460ab7b6 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -3,7 +3,7 @@ # mmu-y := nommu.o -mmu-$(CONFIG_MMU) := fremap.o gup.o highmem.o memory.o mincore.o \ +mmu-$(CONFIG_MMU) := gup.o highmem.o memory.o mincore.o \ mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \ vmalloc.o pagewalk.o pgtable-generic.o diff --git a/mm/fremap.c b/mm/fremap.c deleted file mode 100644 index 2805d71cf476..000000000000 --- a/mm/fremap.c +++ /dev/null @@ -1,283 +0,0 @@ -/* - * linux/mm/fremap.c - * - * Explicit pagetable population and nonlinear (random) mappings support. - * - * started by Ingo Molnar, Copyright (C) 2002, 2003 - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#include "internal.h" - -static int mm_counter(struct page *page) -{ - return PageAnon(page) ? MM_ANONPAGES : MM_FILEPAGES; -} - -static void zap_pte(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long addr, pte_t *ptep) -{ - pte_t pte = *ptep; - struct page *page; - swp_entry_t entry; - - if (pte_present(pte)) { - flush_cache_page(vma, addr, pte_pfn(pte)); - pte = ptep_clear_flush_notify(vma, addr, ptep); - page = vm_normal_page(vma, addr, pte); - if (page) { - if (pte_dirty(pte)) - set_page_dirty(page); - update_hiwater_rss(mm); - dec_mm_counter(mm, mm_counter(page)); - page_remove_rmap(page); - page_cache_release(page); - } - } else { /* zap_pte() is not called when pte_none() */ - if (!pte_file(pte)) { - update_hiwater_rss(mm); - entry = pte_to_swp_entry(pte); - if (non_swap_entry(entry)) { - if (is_migration_entry(entry)) { - page = migration_entry_to_page(entry); - dec_mm_counter(mm, mm_counter(page)); - } - } else { - free_swap_and_cache(entry); - dec_mm_counter(mm, MM_SWAPENTS); - } - } - pte_clear_not_present_full(mm, addr, ptep, 0); - } -} - -/* - * Install a file pte to a given virtual memory address, release any - * previously existing mapping. 
- */ -static int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long addr, unsigned long pgoff, pgprot_t prot) -{ - int err = -ENOMEM; - pte_t *pte, ptfile; - spinlock_t *ptl; - - pte = get_locked_pte(mm, addr, &ptl); - if (!pte) - goto out; - - ptfile = pgoff_to_pte(pgoff); - - if (!pte_none(*pte)) - zap_pte(mm, vma, addr, pte); - - set_pte_at(mm, addr, pte, pte_file_mksoft_dirty(ptfile)); - /* - * We don't need to run update_mmu_cache() here because the "file pte" - * being installed by install_file_pte() is not a real pte - it's a - * non-present entry (like a swap entry), noting what file offset should - * be mapped there when there's a fault (in a non-linear vma where - * that's not obvious). - */ - pte_unmap_unlock(pte, ptl); - err = 0; -out: - return err; -} - -int generic_file_remap_pages(struct vm_area_struct *vma, unsigned long addr, - unsigned long size, pgoff_t pgoff) -{ - struct mm_struct *mm = vma->vm_mm; - int err; - - do { - err = install_file_pte(mm, vma, addr, pgoff, vma->vm_page_prot); - if (err) - return err; - - size -= PAGE_SIZE; - addr += PAGE_SIZE; - pgoff++; - } while (size); - - return 0; -} -EXPORT_SYMBOL(generic_file_remap_pages); - -/** - * sys_remap_file_pages - remap arbitrary pages of an existing VM_SHARED vma - * @start: start of the remapped virtual memory range - * @size: size of the remapped virtual memory range - * @prot: new protection bits of the range (see NOTE) - * @pgoff: to-be-mapped page of the backing store file - * @flags: 0 or MAP_NONBLOCKED - the later will cause no IO. - * - * sys_remap_file_pages remaps arbitrary pages of an existing VM_SHARED vma - * (shared backing store file). - * - * This syscall works purely via pagetables, so it's the most efficient - * way to map the same (large) file into a given virtual window. Unlike - * mmap()/mremap() it does not create any new vmas. The new mappings are - * also safe across swapout. - * - * NOTE: the @prot parameter right now is ignored (but must be zero), - * and the vma's default protection is used. Arbitrary protections - * might be implemented in the future. - */ -SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, - unsigned long, prot, unsigned long, pgoff, unsigned long, flags) -{ - struct mm_struct *mm = current->mm; - struct address_space *mapping; - struct vm_area_struct *vma; - int err = -EINVAL; - int has_write_lock = 0; - vm_flags_t vm_flags = 0; - - pr_warn_once("%s (%d) uses deprecated remap_file_pages() syscall. " - "See Documentation/vm/remap_file_pages.txt.\n", - current->comm, current->pid); - - if (prot) - return err; - /* - * Sanitize the syscall parameters: - */ - start = start & PAGE_MASK; - size = size & PAGE_MASK; - - /* Does the address range wrap, or is the span zero-sized? */ - if (start + size <= start) - return err; - - /* Does pgoff wrap? */ - if (pgoff + (size >> PAGE_SHIFT) < pgoff) - return err; - - /* Can we represent this offset inside this architecture's pte's? */ -#if PTE_FILE_MAX_BITS < BITS_PER_LONG - if (pgoff + (size >> PAGE_SHIFT) >= (1UL << PTE_FILE_MAX_BITS)) - return err; -#endif - - /* We need down_write() to change vma->vm_flags. */ - down_read(&mm->mmap_sem); - retry: - vma = find_vma(mm, start); - - /* - * Make sure the vma is shared, that it supports prefaulting, - * and that the remapped range is valid and fully within - * the single existing vma. 
- */ - if (!vma || !(vma->vm_flags & VM_SHARED)) - goto out; - - if (!vma->vm_ops || !vma->vm_ops->remap_pages) - goto out; - - if (start < vma->vm_start || start + size > vma->vm_end) - goto out; - - /* Must set VM_NONLINEAR before any pages are populated. */ - if (!(vma->vm_flags & VM_NONLINEAR)) { - /* - * vm_private_data is used as a swapout cursor - * in a VM_NONLINEAR vma. - */ - if (vma->vm_private_data) - goto out; - - /* Don't need a nonlinear mapping, exit success */ - if (pgoff == linear_page_index(vma, start)) { - err = 0; - goto out; - } - - if (!has_write_lock) { -get_write_lock: - up_read(&mm->mmap_sem); - down_write(&mm->mmap_sem); - has_write_lock = 1; - goto retry; - } - mapping = vma->vm_file->f_mapping; - /* - * page_mkclean doesn't work on nonlinear vmas, so if - * dirty pages need to be accounted, emulate with linear - * vmas. - */ - if (mapping_cap_account_dirty(mapping)) { - unsigned long addr; - struct file *file = get_file(vma->vm_file); - /* mmap_region may free vma; grab the info now */ - vm_flags = vma->vm_flags; - - addr = mmap_region(file, start, size, vm_flags, pgoff); - fput(file); - if (IS_ERR_VALUE(addr)) { - err = addr; - } else { - BUG_ON(addr != start); - err = 0; - } - goto out_freed; - } - i_mmap_lock_write(mapping); - flush_dcache_mmap_lock(mapping); - vma->vm_flags |= VM_NONLINEAR; - vma_interval_tree_remove(vma, &mapping->i_mmap); - vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear); - flush_dcache_mmap_unlock(mapping); - i_mmap_unlock_write(mapping); - } - - if (vma->vm_flags & VM_LOCKED) { - /* - * drop PG_Mlocked flag for over-mapped range - */ - if (!has_write_lock) - goto get_write_lock; - vm_flags = vma->vm_flags; - munlock_vma_pages_range(vma, start, start + size); - vma->vm_flags = vm_flags; - } - - mmu_notifier_invalidate_range_start(mm, start, start + size); - err = vma->vm_ops->remap_pages(vma, start, size, pgoff); - mmu_notifier_invalidate_range_end(mm, start, start + size); - - /* - * We can't clear VM_NONLINEAR because we'd have to do - * it after ->populate completes, and that would prevent - * downgrading the lock. (Locks can't be upgraded). - */ - -out: - if (vma) - vm_flags = vma->vm_flags; -out_freed: - if (likely(!has_write_lock)) - up_read(&mm->mmap_sem); - else - up_write(&mm->mmap_sem); - if (!err && ((vm_flags & VM_LOCKED) || !(flags & MAP_NONBLOCK))) - mm_populate(start, size); - - return err; -} diff --git a/mm/mmap.c b/mm/mmap.c index 7f684d5a8087..e023dc5e59a8 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2634,6 +2634,75 @@ SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len) return vm_munmap(addr, len); } + +/* + * Emulation of deprecated remap_file_pages() syscall. + */ +SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, + unsigned long, prot, unsigned long, pgoff, unsigned long, flags) +{ + + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; + unsigned long populate = 0; + unsigned long ret = -EINVAL; + struct file *file; + + pr_warn_once("%s (%d) uses deprecated remap_file_pages() syscall. " + "See Documentation/vm/remap_file_pages.txt.\n", + current->comm, current->pid); + + if (prot) + return ret; + start = start & PAGE_MASK; + size = size & PAGE_MASK; + + if (start + size <= start) + return ret; + + /* Does pgoff wrap? 
*/ + if (pgoff + (size >> PAGE_SHIFT) < pgoff) + return ret; + + down_write(&mm->mmap_sem); + vma = find_vma(mm, start); + + if (!vma || !(vma->vm_flags & VM_SHARED)) + goto out; + + if (start < vma->vm_start || start + size > vma->vm_end) + goto out; + + if (pgoff == linear_page_index(vma, start)) { + ret = 0; + goto out; + } + + prot |= vma->vm_flags & VM_READ ? PROT_READ : 0; + prot |= vma->vm_flags & VM_WRITE ? PROT_WRITE : 0; + prot |= vma->vm_flags & VM_EXEC ? PROT_EXEC : 0; + + flags &= MAP_NONBLOCK; + flags |= MAP_SHARED | MAP_FIXED | MAP_POPULATE; + if (vma->vm_flags & VM_LOCKED) { + flags |= MAP_LOCKED; + /* drop PG_Mlocked flag for over-mapped range */ + munlock_vma_pages_range(vma, start, start + size); + } + + file = get_file(vma->vm_file); + ret = do_mmap_pgoff(vma->vm_file, start, size, + prot, flags, pgoff, &populate); + fput(file); +out: + up_write(&mm->mmap_sem); + if (populate) + mm_populate(ret, populate); + if (!IS_ERR_VALUE(ret)) + ret = 0; + return ret; +} + static inline void verify_mm_writelocked(struct mm_struct *mm) { #ifdef CONFIG_DEBUG_VM diff --git a/mm/nommu.c b/mm/nommu.c index 28bd8c4dff6f..541bed64e348 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1984,14 +1984,6 @@ void filemap_map_pages(struct vm_area_struct *vma, struct vm_fault *vmf) } EXPORT_SYMBOL(filemap_map_pages); -int generic_file_remap_pages(struct vm_area_struct *vma, unsigned long addr, - unsigned long size, pgoff_t pgoff) -{ - BUG(); - return 0; -} -EXPORT_SYMBOL(generic_file_remap_pages); - static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, unsigned long addr, void *buf, int len, int write) { -- cgit v1.2.3 From c5d3438201af6913a5c2475fd0685530e583566a Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 30 Jan 2015 13:11:22 +1100 Subject: mm: drop support of non-linear mapping from unmap/zap codepath We have remap_file_pages(2) emulation in -mm tree for few release cycles and we plan to have it mainline in v3.20. This patchset removes rest of VM_NONLINEAR infrastructure. Patches 1-8 take care about generic code. They are pretty straight-forward and can be applied without other of patches. Rest patches removes pte_file()-related stuff from architecture-specific code. It usually frees up one bit in non-present pte. I've tried to reuse that bit for swap offset, where I was able to figure out how to do that. For obvious reason I cannot test all that arch-specific code and would like to see acks from maintainers. In total, remap_file_pages(2) required about 1.4K lines of not-so-trivial kernel code. That's too much for functionality nobody uses. Tested-by: Felipe Balbi This patch (of 38): We don't create non-linear mappings anymore. Let's drop code which handles them on unmap/zap. Signed-off-by: Kirill A. Shutemov Signed-off-by: Andrew Morton --- include/linux/mm.h | 1 - mm/madvise.c | 9 +----- mm/memory.c | 82 ++++++++++++++---------------------------------------- 3 files changed, 22 insertions(+), 70 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 2c6fd3c5424a..600ef5ed4698 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1146,7 +1146,6 @@ extern void user_shm_unlock(size_t, struct user_struct *); * Parameter block passed down to zap_pte_range in exceptional cases. 
*/ struct zap_details { - struct vm_area_struct *nonlinear_vma; /* Check page->index if set */ struct address_space *check_mapping; /* Check page->mapping if set */ pgoff_t first_index; /* Lowest page->index to unmap */ pgoff_t last_index; /* Highest page->index to unmap */ diff --git a/mm/madvise.c b/mm/madvise.c index a271adc93289..917754d26c17 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -278,14 +278,7 @@ static long madvise_dontneed(struct vm_area_struct *vma, if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP)) return -EINVAL; - if (unlikely(vma->vm_flags & VM_NONLINEAR)) { - struct zap_details details = { - .nonlinear_vma = vma, - .last_index = ULONG_MAX, - }; - zap_page_range(vma, start, end - start, &details); - } else - zap_page_range(vma, start, end - start, NULL); + zap_page_range(vma, start, end - start, NULL); return 0; } diff --git a/mm/memory.c b/mm/memory.c index 2c3536cc6c63..9a3e73b69dad 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1082,6 +1082,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, spinlock_t *ptl; pte_t *start_pte; pte_t *pte; + swp_entry_t entry; again: init_rss_vec(rss); @@ -1107,28 +1108,12 @@ again: if (details->check_mapping && details->check_mapping != page->mapping) continue; - /* - * Each page->index must be checked when - * invalidating or truncating nonlinear. - */ - if (details->nonlinear_vma && - (page->index < details->first_index || - page->index > details->last_index)) - continue; } ptent = ptep_get_and_clear_full(mm, addr, pte, tlb->fullmm); tlb_remove_tlb_entry(tlb, pte, addr); if (unlikely(!page)) continue; - if (unlikely(details) && details->nonlinear_vma - && linear_page_index(details->nonlinear_vma, - addr) != page->index) { - pte_t ptfile = pgoff_to_pte(page->index); - if (pte_soft_dirty(ptent)) - ptfile = pte_file_mksoft_dirty(ptfile); - set_pte_at(mm, addr, pte, ptfile); - } if (PageAnon(page)) rss[MM_ANONPAGES]--; else { @@ -1151,33 +1136,25 @@ again: } continue; } - /* - * If details->check_mapping, we leave swap entries; - * if details->nonlinear_vma, we leave file entries. - */ + /* If details->check_mapping, we leave swap entries. 
*/ if (unlikely(details)) continue; - if (pte_file(ptent)) { - if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) - print_bad_pte(vma, addr, ptent, NULL); - } else { - swp_entry_t entry = pte_to_swp_entry(ptent); - if (!non_swap_entry(entry)) - rss[MM_SWAPENTS]--; - else if (is_migration_entry(entry)) { - struct page *page; + entry = pte_to_swp_entry(ptent); + if (!non_swap_entry(entry)) + rss[MM_SWAPENTS]--; + else if (is_migration_entry(entry)) { + struct page *page; - page = migration_entry_to_page(entry); + page = migration_entry_to_page(entry); - if (PageAnon(page)) - rss[MM_ANONPAGES]--; - else - rss[MM_FILEPAGES]--; - } - if (unlikely(!free_swap_and_cache(entry))) - print_bad_pte(vma, addr, ptent, NULL); + if (PageAnon(page)) + rss[MM_ANONPAGES]--; + else + rss[MM_FILEPAGES]--; } + if (unlikely(!free_swap_and_cache(entry))) + print_bad_pte(vma, addr, ptent, NULL); pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); } while (pte++, addr += PAGE_SIZE, addr != end); @@ -1277,7 +1254,7 @@ static void unmap_page_range(struct mmu_gather *tlb, pgd_t *pgd; unsigned long next; - if (details && !details->check_mapping && !details->nonlinear_vma) + if (details && !details->check_mapping) details = NULL; BUG_ON(addr >= end); @@ -1371,7 +1348,7 @@ void unmap_vmas(struct mmu_gather *tlb, * @vma: vm_area_struct holding the applicable pages * @start: starting address of pages to zap * @size: number of bytes to zap - * @details: details of nonlinear truncation or shared cache invalidation + * @details: details of shared cache invalidation * * Caller must protect the VMA list */ @@ -1397,7 +1374,7 @@ void zap_page_range(struct vm_area_struct *vma, unsigned long start, * @vma: vm_area_struct holding the applicable pages * @address: starting address of pages to zap * @size: number of bytes to zap - * @details: details of nonlinear truncation or shared cache invalidation + * @details: details of shared cache invalidation * * The range must fit into one VMA. */ @@ -2331,25 +2308,11 @@ static inline void unmap_mapping_range_tree(struct rb_root *root, } } -static inline void unmap_mapping_range_list(struct list_head *head, - struct zap_details *details) -{ - struct vm_area_struct *vma; - - /* - * In nonlinear VMAs there is no correspondence between virtual address - * offset and file offset. So we must perform an exhaustive search - * across *all* the pages in each nonlinear VMA, not just the pages - * whose virtual address lies outside the file truncation point. - */ - list_for_each_entry(vma, head, shared.nonlinear) { - details->nonlinear_vma = vma; - unmap_mapping_range_vma(vma, vma->vm_start, vma->vm_end, details); - } -} - /** - * unmap_mapping_range - unmap the portion of all mmaps in the specified address_space corresponding to the specified page range in the underlying file. + * unmap_mapping_range - unmap the portion of all mmaps in the specified + * address_space corresponding to the specified page range in the underlying + * file. + * * @mapping: the address space containing mmaps to be unmapped. * @holebegin: byte in first page to unmap, relative to the start of * the underlying file. This will be rounded down to a PAGE_SIZE @@ -2378,7 +2341,6 @@ void unmap_mapping_range(struct address_space *mapping, } details.check_mapping = even_cows? 
NULL: mapping; - details.nonlinear_vma = NULL; details.first_index = hba; details.last_index = hba + hlen - 1; if (details.last_index < details.first_index) @@ -2388,8 +2350,6 @@ void unmap_mapping_range(struct address_space *mapping, i_mmap_lock_write(mapping); if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap))) unmap_mapping_range_tree(&mapping->i_mmap, &details); - if (unlikely(!list_empty(&mapping->i_mmap_nonlinear))) - unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details); i_mmap_unlock_write(mapping); } EXPORT_SYMBOL(unmap_mapping_range); -- cgit v1.2.3 From 57f9ab700e82f0183561525f0fb93469a60c49f5 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 30 Jan 2015 13:11:22 +1100 Subject: mm: drop support of non-linear mapping from fault codepath We don't create non-linear mappings anymore. Let's drop code which handles them on page fault. Signed-off-by: Kirill A. Shutemov Signed-off-by: Andrew Morton --- include/linux/mm.h | 16 ++++++-------- mm/memory.c | 65 ++++++++---------------------------------------------- 2 files changed, 16 insertions(+), 65 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 600ef5ed4698..376e5c325dee 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -206,21 +206,19 @@ extern unsigned int kobjsize(const void *objp); extern pgprot_t protection_map[16]; #define FAULT_FLAG_WRITE 0x01 /* Fault was a write access */ -#define FAULT_FLAG_NONLINEAR 0x02 /* Fault was via a nonlinear mapping */ -#define FAULT_FLAG_MKWRITE 0x04 /* Fault was mkwrite of existing pte */ -#define FAULT_FLAG_ALLOW_RETRY 0x08 /* Retry fault if blocking */ -#define FAULT_FLAG_RETRY_NOWAIT 0x10 /* Don't drop mmap_sem and wait when retrying */ -#define FAULT_FLAG_KILLABLE 0x20 /* The fault task is in SIGKILL killable region */ -#define FAULT_FLAG_TRIED 0x40 /* second try */ -#define FAULT_FLAG_USER 0x80 /* The fault originated in userspace */ +#define FAULT_FLAG_MKWRITE 0x02 /* Fault was mkwrite of existing pte */ +#define FAULT_FLAG_ALLOW_RETRY 0x04 /* Retry fault if blocking */ +#define FAULT_FLAG_RETRY_NOWAIT 0x08 /* Don't drop mmap_sem and wait when retrying */ +#define FAULT_FLAG_KILLABLE 0x10 /* The fault task is in SIGKILL killable region */ +#define FAULT_FLAG_TRIED 0x20 /* Second try */ +#define FAULT_FLAG_USER 0x40 /* The fault originated in userspace */ /* * vm_fault is filled by the the pagefault handler and passed to the vma's * ->fault function. The vma's ->fault is responsible for returning a bitmask * of VM_FAULT_xxx flags that give details about how the fault was handled. * - * pgoff should be used in favour of virtual_address, if possible. If pgoff - * is used, one may implement ->remap_pages to get nonlinear mapping support. + * pgoff should be used in favour of virtual_address, if possible. */ struct vm_fault { unsigned int flags; /* FAULT_FLAG_xxx flags */ diff --git a/mm/memory.c b/mm/memory.c index 9a3e73b69dad..43a53743cbb4 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1899,12 +1899,11 @@ int apply_to_page_range(struct mm_struct *mm, unsigned long addr, EXPORT_SYMBOL_GPL(apply_to_page_range); /* - * handle_pte_fault chooses page fault handler according to an entry - * which was read non-atomically. Before making any commitment, on - * those architectures or configurations (e.g. 
i386 with PAE) which - * might give a mix of unmatched parts, do_swap_page and do_nonlinear_fault - * must check under lock before unmapping the pte and proceeding - * (but do_wp_page is only called after already making such a check; + * handle_pte_fault chooses page fault handler according to an entry which was + * read non-atomically. Before making any commitment, on those architectures + * or configurations (e.g. i386 with PAE) which might give a mix of unmatched + * parts, do_swap_page must check under lock before unmapping the pte and + * proceeding (but do_wp_page is only called after already making such a check; * and do_anonymous_page can safely check later on). */ static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd, @@ -2710,8 +2709,6 @@ void do_set_pte(struct vm_area_struct *vma, unsigned long address, entry = mk_pte(page, vma->vm_page_prot); if (write) entry = maybe_mkwrite(pte_mkdirty(entry), vma); - else if (pte_file(*pte) && pte_file_soft_dirty(*pte)) - entry = pte_mksoft_dirty(entry); if (anon) { inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); page_add_new_anon_rmap(page, vma, address); @@ -2846,8 +2843,7 @@ static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma, * if page by the offset is not ready to be mapped (cold cache or * something). */ - if (vma->vm_ops->map_pages && !(flags & FAULT_FLAG_NONLINEAR) && - fault_around_bytes >> PAGE_SHIFT > 1) { + if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) { pte = pte_offset_map_lock(mm, pmd, address, &ptl); do_fault_around(vma, address, pte, pgoff, flags); if (!pte_same(*pte, orig_pte)) @@ -2992,7 +2988,7 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma, * The mmap_sem may have been released depending on flags and our * return value. See filemap_fault() and __lock_page_or_retry(). */ -static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma, +static int do_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *page_table, pmd_t *pmd, unsigned int flags, pte_t orig_pte) { @@ -3009,46 +3005,6 @@ static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma, return do_shared_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); } -/* - * Fault of a previously existing named mapping. Repopulate the pte - * from the encoded file_pte if possible. This enables swappable - * nonlinear vmas. - * - * We enter with non-exclusive mmap_sem (to exclude vma changes, - * but allow concurrent faults), and pte mapped but not yet locked. - * We return with pte unmapped and unlocked. - * The mmap_sem may have been released depending on flags and our - * return value. See filemap_fault() and __lock_page_or_retry(). - */ -static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address, pte_t *page_table, pmd_t *pmd, - unsigned int flags, pte_t orig_pte) -{ - pgoff_t pgoff; - - flags |= FAULT_FLAG_NONLINEAR; - - if (!pte_unmap_same(mm, pmd, page_table, orig_pte)) - return 0; - - if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) { - /* - * Page table corrupted: show pte and kill process. 
- */ - print_bad_pte(vma, address, orig_pte, NULL); - return VM_FAULT_SIGBUS; - } - - pgoff = pte_to_pgoff(orig_pte); - if (!(flags & FAULT_FLAG_WRITE)) - return do_read_fault(mm, vma, address, pmd, pgoff, flags, - orig_pte); - if (!(vma->vm_flags & VM_SHARED)) - return do_cow_fault(mm, vma, address, pmd, pgoff, flags, - orig_pte); - return do_shared_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); -} - static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, unsigned long addr, int page_nid, int *flags) @@ -3176,15 +3132,12 @@ static int handle_pte_fault(struct mm_struct *mm, if (pte_none(entry)) { if (vma->vm_ops) { if (likely(vma->vm_ops->fault)) - return do_linear_fault(mm, vma, address, - pte, pmd, flags, entry); + return do_fault(mm, vma, address, pte, + pmd, flags, entry); } return do_anonymous_page(mm, vma, address, pte, pmd, flags); } - if (pte_file(entry)) - return do_nonlinear_fault(mm, vma, address, - pte, pmd, flags, entry); return do_swap_page(mm, vma, address, pte, pmd, flags, entry); } -- cgit v1.2.3 From b9d17a2597687a6483eb5df10224c3ce05cb33d5 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 30 Jan 2015 13:11:22 +1100 Subject: mm: drop vm_ops->remap_pages and generic_file_remap_pages() stub Nobody uses it anymore. Signed-off-by: Kirill A. Shutemov Signed-off-by: Andrew Morton --- fs/9p/vfs_file.c | 2 -- fs/btrfs/file.c | 1 - fs/ceph/addr.c | 1 - fs/cifs/file.c | 1 - fs/ext4/file.c | 1 - fs/f2fs/file.c | 1 - fs/fuse/file.c | 1 - fs/gfs2/file.c | 1 - fs/nfs/file.c | 1 - fs/nilfs2/file.c | 1 - fs/ocfs2/mmap.c | 1 - fs/ubifs/file.c | 1 - fs/xfs/xfs_file.c | 1 - include/linux/fs.h | 6 ------ include/linux/mm.h | 3 --- mm/filemap.c | 1 - mm/shmem.c | 1 - 17 files changed, 25 deletions(-) (limited to 'include') diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index 5594505e6e73..b40133796b87 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -831,7 +831,6 @@ static const struct vm_operations_struct v9fs_file_vm_ops = { .fault = filemap_fault, .map_pages = filemap_map_pages, .page_mkwrite = v9fs_vm_page_mkwrite, - .remap_pages = generic_file_remap_pages, }; static const struct vm_operations_struct v9fs_mmap_file_vm_ops = { @@ -839,7 +838,6 @@ static const struct vm_operations_struct v9fs_mmap_file_vm_ops = { .fault = filemap_fault, .map_pages = filemap_map_pages, .page_mkwrite = v9fs_vm_page_mkwrite, - .remap_pages = generic_file_remap_pages, }; diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index e4090259569b..a606ab551296 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2081,7 +2081,6 @@ static const struct vm_operations_struct btrfs_file_vm_ops = { .fault = filemap_fault, .map_pages = filemap_map_pages, .page_mkwrite = btrfs_page_mkwrite, - .remap_pages = generic_file_remap_pages, }; static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma) diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index c81c0e004588..24be059fd1f8 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -1569,7 +1569,6 @@ out: static struct vm_operations_struct ceph_vmops = { .fault = ceph_filemap_fault, .page_mkwrite = ceph_page_mkwrite, - .remap_pages = generic_file_remap_pages, }; int ceph_mmap(struct file *file, struct vm_area_struct *vma) diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 96b7e9b7706d..9f4d03954ecd 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -3244,7 +3244,6 @@ static struct vm_operations_struct cifs_file_vm_ops = { .fault = filemap_fault, .map_pages = filemap_map_pages, .page_mkwrite = cifs_page_mkwrite, - .remap_pages 
= generic_file_remap_pages, }; int cifs_file_strict_mmap(struct file *file, struct vm_area_struct *vma) diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 8131be8c0af3..7cb592386121 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -195,7 +195,6 @@ static const struct vm_operations_struct ext4_file_vm_ops = { .fault = filemap_fault, .map_pages = filemap_map_pages, .page_mkwrite = ext4_page_mkwrite, - .remap_pages = generic_file_remap_pages, }; static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 3c27e0ecb3bc..5674ba13102b 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -92,7 +92,6 @@ static const struct vm_operations_struct f2fs_file_vm_ops = { .fault = filemap_fault, .map_pages = filemap_map_pages, .page_mkwrite = f2fs_vm_page_mkwrite, - .remap_pages = generic_file_remap_pages, }; static int get_parent_ino(struct inode *inode, nid_t *pino) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 760b2c552197..d769e594855b 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -2062,7 +2062,6 @@ static const struct vm_operations_struct fuse_file_vm_ops = { .fault = filemap_fault, .map_pages = filemap_map_pages, .page_mkwrite = fuse_page_mkwrite, - .remap_pages = generic_file_remap_pages, }; static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma) diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 6e600abf694a..ec9c2d33477a 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -498,7 +498,6 @@ static const struct vm_operations_struct gfs2_vm_ops = { .fault = filemap_fault, .map_pages = filemap_map_pages, .page_mkwrite = gfs2_page_mkwrite, - .remap_pages = generic_file_remap_pages, }; /** diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 2ab6f00dba5b..94712fc781fa 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -646,7 +646,6 @@ static const struct vm_operations_struct nfs_file_vm_ops = { .fault = filemap_fault, .map_pages = filemap_map_pages, .page_mkwrite = nfs_vm_page_mkwrite, - .remap_pages = generic_file_remap_pages, }; static int nfs_need_sync_write(struct file *filp, struct inode *inode) diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c index 3a03e0aea1fb..a8c728acb7a8 100644 --- a/fs/nilfs2/file.c +++ b/fs/nilfs2/file.c @@ -128,7 +128,6 @@ static const struct vm_operations_struct nilfs_file_vm_ops = { .fault = filemap_fault, .map_pages = filemap_map_pages, .page_mkwrite = nilfs_page_mkwrite, - .remap_pages = generic_file_remap_pages, }; static int nilfs_file_mmap(struct file *file, struct vm_area_struct *vma) diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c index 10d66c75cecb..9581d190f6e1 100644 --- a/fs/ocfs2/mmap.c +++ b/fs/ocfs2/mmap.c @@ -173,7 +173,6 @@ out: static const struct vm_operations_struct ocfs2_file_vm_ops = { .fault = ocfs2_fault, .page_mkwrite = ocfs2_page_mkwrite, - .remap_pages = generic_file_remap_pages, }; int ocfs2_mmap(struct file *file, struct vm_area_struct *vma) diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 538519ee37d9..035e51011444 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -1536,7 +1536,6 @@ static const struct vm_operations_struct ubifs_file_vm_ops = { .fault = filemap_fault, .map_pages = filemap_map_pages, .page_mkwrite = ubifs_vm_page_mkwrite, - .remap_pages = generic_file_remap_pages, }; static int ubifs_file_mmap(struct file *file, struct vm_area_struct *vma) diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 13e974e6a889..ac7f1e8f92b3 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1384,5 +1384,4 @@ static const struct 
vm_operations_struct xfs_file_vm_ops = { .fault = filemap_fault, .map_pages = filemap_map_pages, .page_mkwrite = xfs_vm_page_mkwrite, - .remap_pages = generic_file_remap_pages, }; diff --git a/include/linux/fs.h b/include/linux/fs.h index 60c4996df7f3..47f557c7ef7e 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2481,12 +2481,6 @@ extern int sb_min_blocksize(struct super_block *, int); extern int generic_file_mmap(struct file *, struct vm_area_struct *); extern int generic_file_readonly_mmap(struct file *, struct vm_area_struct *); -static inline int generic_file_remap_pages(struct vm_area_struct *vma, - unsigned long addr, unsigned long size, pgoff_t pgoff) -{ - BUG(); - return 0; -} int generic_write_checks(struct file *file, loff_t *pos, size_t *count, int isblk); extern ssize_t generic_file_read_iter(struct kiocb *, struct iov_iter *); extern ssize_t __generic_file_write_iter(struct kiocb *, struct iov_iter *); diff --git a/include/linux/mm.h b/include/linux/mm.h index 376e5c325dee..2ddd9d1d6268 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -285,9 +285,6 @@ struct vm_operations_struct { struct mempolicy *(*get_policy)(struct vm_area_struct *vma, unsigned long addr); #endif - /* called by sys_remap_file_pages() to populate non-linear mapping */ - int (*remap_pages)(struct vm_area_struct *vma, unsigned long addr, - unsigned long size, pgoff_t pgoff); }; struct mmu_gather; diff --git a/mm/filemap.c b/mm/filemap.c index 673e4581a2e5..bf7a27142704 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2087,7 +2087,6 @@ const struct vm_operations_struct generic_file_vm_ops = { .fault = filemap_fault, .map_pages = filemap_map_pages, .page_mkwrite = filemap_page_mkwrite, - .remap_pages = generic_file_remap_pages, }; /* This is used for a general mmap of a disk file */ diff --git a/mm/shmem.c b/mm/shmem.c index 73ba1df7c8ba..05a2d70a9244 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -3201,7 +3201,6 @@ static const struct vm_operations_struct shmem_vm_ops = { .set_policy = shmem_set_policy, .get_policy = shmem_get_policy, #endif - .remap_pages = generic_file_remap_pages, }; static struct dentry *shmem_mount(struct file_system_type *fs_type, -- cgit v1.2.3 From 1fd2d247a81eeeaebf38850ee861fbd7b6940b4d Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 30 Jan 2015 13:11:23 +1100 Subject: rmap: drop support of non-linear mappings We don't create non-linear mappings anymore. Let's drop code which handles them in rmap. Signed-off-by: Kirill A. Shutemov Signed-off-by: Andrew Morton --- Documentation/cachetlb.txt | 8 +- fs/inode.c | 1 - include/linux/fs.h | 4 +- include/linux/mm.h | 6 -- include/linux/mm_types.h | 4 +- include/linux/rmap.h | 2 - kernel/fork.c | 8 +- mm/migrate.c | 32 ------- mm/mmap.c | 24 ++--- mm/rmap.c | 225 +-------------------------------------------- mm/swap.c | 4 +- 11 files changed, 18 insertions(+), 300 deletions(-) (limited to 'include') diff --git a/Documentation/cachetlb.txt b/Documentation/cachetlb.txt index d79b008e4a32..3f9f808b5119 100644 --- a/Documentation/cachetlb.txt +++ b/Documentation/cachetlb.txt @@ -317,10 +317,10 @@ maps this page at its virtual address. about doing this. The idea is, first at flush_dcache_page() time, if - page->mapping->i_mmap is an empty tree and ->i_mmap_nonlinear - an empty list, just mark the architecture private page flag bit. - Later, in update_mmu_cache(), a check is made of this flag bit, - and if set the flush is done and the flag bit is cleared. 
+ page->mapping->i_mmap is an empty tree, just mark the architecture + private page flag bit. Later, in update_mmu_cache(), a check is + made of this flag bit, and if set the flush is done and the flag + bit is cleared. IMPORTANT NOTE: It is often important, if you defer the flush, that the actual flush occurs on the same CPU diff --git a/fs/inode.c b/fs/inode.c index aa149e7262ac..c760fac33c92 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -355,7 +355,6 @@ void address_space_init_once(struct address_space *mapping) INIT_LIST_HEAD(&mapping->private_list); spin_lock_init(&mapping->private_lock); mapping->i_mmap = RB_ROOT; - INIT_LIST_HEAD(&mapping->i_mmap_nonlinear); } EXPORT_SYMBOL(address_space_init_once); diff --git a/include/linux/fs.h b/include/linux/fs.h index 47f557c7ef7e..60acab209701 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -401,7 +401,6 @@ struct address_space { spinlock_t tree_lock; /* and lock protecting it */ atomic_t i_mmap_writable;/* count VM_SHARED mappings */ struct rb_root i_mmap; /* tree of private and shared mappings */ - struct list_head i_mmap_nonlinear;/*list VM_NONLINEAR mappings */ struct rw_semaphore i_mmap_rwsem; /* protect tree, count, list */ /* Protected by tree_lock together with the radix tree */ unsigned long nrpages; /* number of total pages */ @@ -493,8 +492,7 @@ static inline void i_mmap_unlock_read(struct address_space *mapping) */ static inline int mapping_mapped(struct address_space *mapping) { - return !RB_EMPTY_ROOT(&mapping->i_mmap) || - !list_empty(&mapping->i_mmap_nonlinear); + return !RB_EMPTY_ROOT(&mapping->i_mmap); } /* diff --git a/include/linux/mm.h b/include/linux/mm.h index 2ddd9d1d6268..18391eec4864 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1796,12 +1796,6 @@ struct vm_area_struct *vma_interval_tree_iter_next(struct vm_area_struct *node, for (vma = vma_interval_tree_iter_first(root, start, last); \ vma; vma = vma_interval_tree_iter_next(vma, start, last)) -static inline void vma_nonlinear_insert(struct vm_area_struct *vma, - struct list_head *list) -{ - list_add_tail(&vma->shared.nonlinear, list); -} - void anon_vma_interval_tree_insert(struct anon_vma_chain *node, struct rb_root *root); void anon_vma_interval_tree_remove(struct anon_vma_chain *node, diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 6d34aa266a8c..3b1d20fb0848 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -273,15 +273,13 @@ struct vm_area_struct { /* * For areas with an address space and backing store, - * linkage into the address_space->i_mmap interval tree, or - * linkage of vma in the address_space->i_mmap_nonlinear list. + * linkage into the address_space->i_mmap interval tree. 
*/ union { struct { struct rb_node rb; unsigned long rb_subtree_last; } linear; - struct list_head nonlinear; } shared; /* diff --git a/include/linux/rmap.h b/include/linux/rmap.h index d9d7e7e56352..b38f559130d5 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -246,7 +246,6 @@ int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma); * arg: passed to rmap_one() and invalid_vma() * rmap_one: executed on each vma where page is mapped * done: for checking traversing termination condition - * file_nonlinear: for handling file nonlinear mapping * anon_lock: for getting anon_lock by optimized way rather than default * invalid_vma: for skipping uninterested vma */ @@ -255,7 +254,6 @@ struct rmap_walk_control { int (*rmap_one)(struct page *page, struct vm_area_struct *vma, unsigned long addr, void *arg); int (*done)(struct page *page); - int (*file_nonlinear)(struct page *, struct address_space *, void *arg); struct anon_vma *(*anon_lock)(struct page *page); bool (*invalid_vma)(struct vm_area_struct *vma, void *arg); }; diff --git a/kernel/fork.c b/kernel/fork.c index 4dc2ddade9f1..b379d9abddc7 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -438,12 +438,8 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) atomic_inc(&mapping->i_mmap_writable); flush_dcache_mmap_lock(mapping); /* insert tmp into the share list, just after mpnt */ - if (unlikely(tmp->vm_flags & VM_NONLINEAR)) - vma_nonlinear_insert(tmp, - &mapping->i_mmap_nonlinear); - else - vma_interval_tree_insert_after(tmp, mpnt, - &mapping->i_mmap); + vma_interval_tree_insert_after(tmp, mpnt, + &mapping->i_mmap); flush_dcache_mmap_unlock(mapping); i_mmap_unlock_write(mapping); } diff --git a/mm/migrate.c b/mm/migrate.c index 344cdf692fc8..6e284bcca8bb 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -178,37 +178,6 @@ out: return SWAP_AGAIN; } -/* - * Congratulations to trinity for discovering this bug. - * mm/fremap.c's remap_file_pages() accepts any range within a single vma to - * convert that vma to VM_NONLINEAR; and generic_file_remap_pages() will then - * replace the specified range by file ptes throughout (maybe populated after). - * If page migration finds a page within that range, while it's still located - * by vma_interval_tree rather than lost to i_mmap_nonlinear list, no problem: - * zap_pte() clears the temporary migration entry before mmap_sem is dropped. - * But if the migrating page is in a part of the vma outside the range to be - * remapped, then it will not be cleared, and remove_migration_ptes() needs to - * deal with it. Fortunately, this part of the vma is of course still linear, - * so we just need to use linear location on the nonlinear list. - */ -static int remove_linear_migration_ptes_from_nonlinear(struct page *page, - struct address_space *mapping, void *arg) -{ - struct vm_area_struct *vma; - /* hugetlbfs does not support remap_pages, so no huge pgoff worries */ - pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); - unsigned long addr; - - list_for_each_entry(vma, - &mapping->i_mmap_nonlinear, shared.nonlinear) { - - addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); - if (addr >= vma->vm_start && addr < vma->vm_end) - remove_migration_pte(page, vma, addr, arg); - } - return SWAP_AGAIN; -} - /* * Get rid of all migration entries and replace them by * references to the indicated page. 
@@ -218,7 +187,6 @@ static void remove_migration_ptes(struct page *old, struct page *new) struct rmap_walk_control rwc = { .rmap_one = remove_migration_pte, .arg = old, - .file_nonlinear = remove_linear_migration_ptes_from_nonlinear, }; rmap_walk(new, &rwc); diff --git a/mm/mmap.c b/mm/mmap.c index e023dc5e59a8..14d84666e8ba 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -243,10 +243,7 @@ static void __remove_shared_vm_struct(struct vm_area_struct *vma, mapping_unmap_writable(mapping); flush_dcache_mmap_lock(mapping); - if (unlikely(vma->vm_flags & VM_NONLINEAR)) - list_del_init(&vma->shared.nonlinear); - else - vma_interval_tree_remove(vma, &mapping->i_mmap); + vma_interval_tree_remove(vma, &mapping->i_mmap); flush_dcache_mmap_unlock(mapping); } @@ -649,10 +646,7 @@ static void __vma_link_file(struct vm_area_struct *vma) atomic_inc(&mapping->i_mmap_writable); flush_dcache_mmap_lock(mapping); - if (unlikely(vma->vm_flags & VM_NONLINEAR)) - vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear); - else - vma_interval_tree_insert(vma, &mapping->i_mmap); + vma_interval_tree_insert(vma, &mapping->i_mmap); flush_dcache_mmap_unlock(mapping); } } @@ -789,14 +783,11 @@ again: remove_next = 1 + (end > next->vm_end); if (file) { mapping = file->f_mapping; - if (!(vma->vm_flags & VM_NONLINEAR)) { - root = &mapping->i_mmap; - uprobe_munmap(vma, vma->vm_start, vma->vm_end); + root = &mapping->i_mmap; + uprobe_munmap(vma, vma->vm_start, vma->vm_end); - if (adjust_next) - uprobe_munmap(next, next->vm_start, - next->vm_end); - } + if (adjust_next) + uprobe_munmap(next, next->vm_start, next->vm_end); i_mmap_lock_write(mapping); if (insert) { @@ -3177,8 +3168,7 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping) * * mmap_sem in write mode is required in order to block all operations * that could modify pagetables and free pages without need of - * altering the vma layout (for example populate_range() with - * nonlinear vmas). It's also needed in write mode to avoid new + * altering the vma layout. It's also needed in write mode to avoid new * anon_vmas to be associated with existing vmas. * * A single task can't take more than one mm_take_all_locks() in a row diff --git a/mm/rmap.c b/mm/rmap.c index 71cd5bd0c17d..70b32498d4f2 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -590,9 +590,8 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) if (!vma->anon_vma || !page__anon_vma || vma->anon_vma->root != page__anon_vma->root) return -EFAULT; - } else if (page->mapping && !(vma->vm_flags & VM_NONLINEAR)) { - if (!vma->vm_file || - vma->vm_file->f_mapping != page->mapping) + } else if (page->mapping) { + if (!vma->vm_file || vma->vm_file->f_mapping != page->mapping) return -EFAULT; } else return -EFAULT; @@ -1274,7 +1273,6 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, if (pte_soft_dirty(pteval)) swp_pte = pte_swp_mksoft_dirty(swp_pte); set_pte_at(mm, address, pte, swp_pte); - BUG_ON(pte_file(*pte)); } else if (IS_ENABLED(CONFIG_MIGRATION) && (flags & TTU_MIGRATION)) { /* Establish migration entry for a file page */ @@ -1316,211 +1314,6 @@ out_mlock: return ret; } -/* - * objrmap doesn't work for nonlinear VMAs because the assumption that - * offset-into-file correlates with offset-into-virtual-addresses does not hold. - * Consequently, given a particular page and its ->index, we cannot locate the - * ptes which are mapping that page without an exhaustive linear search. 
- * - * So what this code does is a mini "virtual scan" of each nonlinear VMA which - * maps the file to which the target page belongs. The ->vm_private_data field - * holds the current cursor into that scan. Successive searches will circulate - * around the vma's virtual address space. - * - * So as more replacement pressure is applied to the pages in a nonlinear VMA, - * more scanning pressure is placed against them as well. Eventually pages - * will become fully unmapped and are eligible for eviction. - * - * For very sparsely populated VMAs this is a little inefficient - chances are - * there there won't be many ptes located within the scan cluster. In this case - * maybe we could scan further - to the end of the pte page, perhaps. - * - * Mlocked pages: check VM_LOCKED under mmap_sem held for read, if we can - * acquire it without blocking. If vma locked, mlock the pages in the cluster, - * rather than unmapping them. If we encounter the "check_page" that vmscan is - * trying to unmap, return SWAP_MLOCK, else default SWAP_AGAIN. - */ -#define CLUSTER_SIZE min(32*PAGE_SIZE, PMD_SIZE) -#define CLUSTER_MASK (~(CLUSTER_SIZE - 1)) - -static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, - struct vm_area_struct *vma, struct page *check_page) -{ - struct mm_struct *mm = vma->vm_mm; - pmd_t *pmd; - pte_t *pte; - pte_t pteval; - spinlock_t *ptl; - struct page *page; - unsigned long address; - unsigned long mmun_start; /* For mmu_notifiers */ - unsigned long mmun_end; /* For mmu_notifiers */ - unsigned long end; - int ret = SWAP_AGAIN; - int locked_vma = 0; - - address = (vma->vm_start + cursor) & CLUSTER_MASK; - end = address + CLUSTER_SIZE; - if (address < vma->vm_start) - address = vma->vm_start; - if (end > vma->vm_end) - end = vma->vm_end; - - pmd = mm_find_pmd(mm, address); - if (!pmd) - return ret; - - mmun_start = address; - mmun_end = end; - mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); - - /* - * If we can acquire the mmap_sem for read, and vma is VM_LOCKED, - * keep the sem while scanning the cluster for mlocking pages. - */ - if (down_read_trylock(&vma->vm_mm->mmap_sem)) { - locked_vma = (vma->vm_flags & VM_LOCKED); - if (!locked_vma) - up_read(&vma->vm_mm->mmap_sem); /* don't need it */ - } - - pte = pte_offset_map_lock(mm, pmd, address, &ptl); - - /* Update high watermark before we lower rss */ - update_hiwater_rss(mm); - - for (; address < end; pte++, address += PAGE_SIZE) { - if (!pte_present(*pte)) - continue; - page = vm_normal_page(vma, address, *pte); - BUG_ON(!page || PageAnon(page)); - - if (locked_vma) { - if (page == check_page) { - /* we know we have check_page locked */ - mlock_vma_page(page); - ret = SWAP_MLOCK; - } else if (trylock_page(page)) { - /* - * If we can lock the page, perform mlock. - * Otherwise leave the page alone, it will be - * eventually encountered again later. - */ - mlock_vma_page(page); - unlock_page(page); - } - continue; /* don't unmap */ - } - - /* - * No need for _notify because we're within an - * mmu_notifier_invalidate_range_ {start|end} scope. - */ - if (ptep_clear_flush_young(vma, address, pte)) - continue; - - /* Nuke the page table entry. */ - flush_cache_page(vma, address, pte_pfn(*pte)); - pteval = ptep_clear_flush_notify(vma, address, pte); - - /* If nonlinear, store the file page offset in the pte. 
*/ - if (page->index != linear_page_index(vma, address)) { - pte_t ptfile = pgoff_to_pte(page->index); - if (pte_soft_dirty(pteval)) - ptfile = pte_file_mksoft_dirty(ptfile); - set_pte_at(mm, address, pte, ptfile); - } - - /* Move the dirty bit to the physical page now the pte is gone. */ - if (pte_dirty(pteval)) - set_page_dirty(page); - - page_remove_rmap(page); - page_cache_release(page); - dec_mm_counter(mm, MM_FILEPAGES); - (*mapcount)--; - } - pte_unmap_unlock(pte - 1, ptl); - mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); - if (locked_vma) - up_read(&vma->vm_mm->mmap_sem); - return ret; -} - -static int try_to_unmap_nonlinear(struct page *page, - struct address_space *mapping, void *arg) -{ - struct vm_area_struct *vma; - int ret = SWAP_AGAIN; - unsigned long cursor; - unsigned long max_nl_cursor = 0; - unsigned long max_nl_size = 0; - unsigned int mapcount; - - list_for_each_entry(vma, - &mapping->i_mmap_nonlinear, shared.nonlinear) { - - cursor = (unsigned long) vma->vm_private_data; - if (cursor > max_nl_cursor) - max_nl_cursor = cursor; - cursor = vma->vm_end - vma->vm_start; - if (cursor > max_nl_size) - max_nl_size = cursor; - } - - if (max_nl_size == 0) { /* all nonlinears locked or reserved ? */ - return SWAP_FAIL; - } - - /* - * We don't try to search for this page in the nonlinear vmas, - * and page_referenced wouldn't have found it anyway. Instead - * just walk the nonlinear vmas trying to age and unmap some. - * The mapcount of the page we came in with is irrelevant, - * but even so use it as a guide to how hard we should try? - */ - mapcount = page_mapcount(page); - if (!mapcount) - return ret; - - cond_resched(); - - max_nl_size = (max_nl_size + CLUSTER_SIZE - 1) & CLUSTER_MASK; - if (max_nl_cursor == 0) - max_nl_cursor = CLUSTER_SIZE; - - do { - list_for_each_entry(vma, - &mapping->i_mmap_nonlinear, shared.nonlinear) { - - cursor = (unsigned long) vma->vm_private_data; - while (cursor < max_nl_cursor && - cursor < vma->vm_end - vma->vm_start) { - if (try_to_unmap_cluster(cursor, &mapcount, - vma, page) == SWAP_MLOCK) - ret = SWAP_MLOCK; - cursor += CLUSTER_SIZE; - vma->vm_private_data = (void *) cursor; - if ((int)mapcount <= 0) - return ret; - } - vma->vm_private_data = (void *) max_nl_cursor; - } - cond_resched(); - max_nl_cursor += CLUSTER_SIZE; - } while (max_nl_cursor <= max_nl_size); - - /* - * Don't loop forever (perhaps all the remaining pages are - * in locked vmas). Reset cursor on all unreserved nonlinear - * vmas, now forgetting on which ones it had fallen behind. - */ - list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.nonlinear) - vma->vm_private_data = NULL; - - return ret; -} - bool is_vma_temporary_stack(struct vm_area_struct *vma) { int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP); @@ -1566,7 +1359,6 @@ int try_to_unmap(struct page *page, enum ttu_flags flags) .rmap_one = try_to_unmap_one, .arg = (void *)flags, .done = page_not_mapped, - .file_nonlinear = try_to_unmap_nonlinear, .anon_lock = page_lock_anon_vma_read, }; @@ -1612,12 +1404,6 @@ int try_to_munlock(struct page *page) .rmap_one = try_to_unmap_one, .arg = (void *)TTU_MUNLOCK, .done = page_not_mapped, - /* - * We don't bother to try to find the munlocked page in - * nonlinears. It's costly. Instead, later, page reclaim logic - * may call try_to_unmap() and recover PG_mlocked lazily. 
- */ - .file_nonlinear = NULL, .anon_lock = page_lock_anon_vma_read, }; @@ -1748,13 +1534,6 @@ static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc) goto done; } - if (!rwc->file_nonlinear) - goto done; - - if (list_empty(&mapping->i_mmap_nonlinear)) - goto done; - - ret = rwc->file_nonlinear(page, mapping, rwc->arg); done: i_mmap_unlock_read(mapping); return ret; diff --git a/mm/swap.c b/mm/swap.c index 8a12b33936b4..5b3087228b99 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -1140,10 +1140,8 @@ void __init swap_setup(void) if (bdi_init(swapper_spaces[0].backing_dev_info)) panic("Failed to init swap bdi"); - for (i = 0; i < MAX_SWAPFILES; i++) { + for (i = 0; i < MAX_SWAPFILES; i++) spin_lock_init(&swapper_spaces[i].tree_lock); - INIT_LIST_HEAD(&swapper_spaces[i].i_mmap_nonlinear); - } #endif /* Use a smaller cluster for small-memory machines */ -- cgit v1.2.3 From 61ed01c9322990a4abb175a2e64179135a136a07 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 30 Jan 2015 13:11:23 +1100 Subject: mm: replace vma->sharead.linear with vma->shared After removing vma->shared.nonlinear we have only one member of vma->shared union, which doesn't make much sense. This patch drops the union and move struct vma->shared.linear to vma->shared. Signed-off-by: Kirill A. Shutemov Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 8 +++----- mm/interval_tree.c | 34 +++++++++++++++++----------------- 2 files changed, 20 insertions(+), 22 deletions(-) (limited to 'include') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 3b1d20fb0848..07c8bd3f7b48 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -275,11 +275,9 @@ struct vm_area_struct { * For areas with an address space and backing store, * linkage into the address_space->i_mmap interval tree. 
*/ - union { - struct { - struct rb_node rb; - unsigned long rb_subtree_last; - } linear; + struct { + struct rb_node rb; + unsigned long rb_subtree_last; } shared; /* diff --git a/mm/interval_tree.c b/mm/interval_tree.c index 8da581fa9060..f2c2492681bf 100644 --- a/mm/interval_tree.c +++ b/mm/interval_tree.c @@ -21,8 +21,8 @@ static inline unsigned long vma_last_pgoff(struct vm_area_struct *v) return v->vm_pgoff + ((v->vm_end - v->vm_start) >> PAGE_SHIFT) - 1; } -INTERVAL_TREE_DEFINE(struct vm_area_struct, shared.linear.rb, - unsigned long, shared.linear.rb_subtree_last, +INTERVAL_TREE_DEFINE(struct vm_area_struct, shared.rb, + unsigned long, shared.rb_subtree_last, vma_start_pgoff, vma_last_pgoff,, vma_interval_tree) /* Insert node immediately after prev in the interval tree */ @@ -36,26 +36,26 @@ void vma_interval_tree_insert_after(struct vm_area_struct *node, VM_BUG_ON_VMA(vma_start_pgoff(node) != vma_start_pgoff(prev), node); - if (!prev->shared.linear.rb.rb_right) { + if (!prev->shared.rb.rb_right) { parent = prev; - link = &prev->shared.linear.rb.rb_right; + link = &prev->shared.rb.rb_right; } else { - parent = rb_entry(prev->shared.linear.rb.rb_right, - struct vm_area_struct, shared.linear.rb); - if (parent->shared.linear.rb_subtree_last < last) - parent->shared.linear.rb_subtree_last = last; - while (parent->shared.linear.rb.rb_left) { - parent = rb_entry(parent->shared.linear.rb.rb_left, - struct vm_area_struct, shared.linear.rb); - if (parent->shared.linear.rb_subtree_last < last) - parent->shared.linear.rb_subtree_last = last; + parent = rb_entry(prev->shared.rb.rb_right, + struct vm_area_struct, shared.rb); + if (parent->shared.rb_subtree_last < last) + parent->shared.rb_subtree_last = last; + while (parent->shared.rb.rb_left) { + parent = rb_entry(parent->shared.rb.rb_left, + struct vm_area_struct, shared.rb); + if (parent->shared.rb_subtree_last < last) + parent->shared.rb_subtree_last = last; } - link = &parent->shared.linear.rb.rb_left; + link = &parent->shared.rb.rb_left; } - node->shared.linear.rb_subtree_last = last; - rb_link_node(&node->shared.linear.rb, &parent->shared.linear.rb, link); - rb_insert_augmented(&node->shared.linear.rb, root, + node->shared.rb_subtree_last = last; + rb_link_node(&node->shared.rb, &parent->shared.rb, link); + rb_insert_augmented(&node->shared.rb, root, &vma_interval_tree_augment); } -- cgit v1.2.3 From e3295b5532cb90fc791f6f8fef87af3598d18b70 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 30 Jan 2015 13:11:23 +1100 Subject: mm: remove rest usage of VM_NONLINEAR and pte_file() One bit in ->vm_flags is unused now! Signed-off-by: Kirill A. Shutemov Signed-off-by: Andrew Morton --- drivers/gpu/drm/drm_vma_manager.c | 3 +- include/linux/mm.h | 1 - include/linux/swapops.h | 4 +- mm/debug.c | 1 - mm/gup.c | 2 +- mm/ksm.c | 2 +- mm/madvise.c | 4 +- mm/memcontrol.c | 4 +- mm/memory.c | 78 +++++++++++++++++++-------------------- mm/mincore.c | 9 +---- mm/mprotect.c | 2 +- mm/mremap.c | 2 - mm/msync.c | 5 +-- 13 files changed, 48 insertions(+), 69 deletions(-) (limited to 'include') diff --git a/drivers/gpu/drm/drm_vma_manager.c b/drivers/gpu/drm/drm_vma_manager.c index 63b471205072..68c1f32fb086 100644 --- a/drivers/gpu/drm/drm_vma_manager.c +++ b/drivers/gpu/drm/drm_vma_manager.c @@ -50,8 +50,7 @@ * * You must not use multiple offset managers on a single address_space. * Otherwise, mm-core will be unable to tear down memory mappings as the VM will - * no longer be linear. 
Please use VM_NONLINEAR in that case and implement your - * own offset managers. + * no longer be linear. * * This offset manager works on page-based addresses. That is, every argument * and return code (with the exception of drm_vma_node_offset_addr()) is given diff --git a/include/linux/mm.h b/include/linux/mm.h index 18391eec4864..a0da685bdb82 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -138,7 +138,6 @@ extern unsigned int kobjsize(const void *objp); #define VM_ACCOUNT 0x00100000 /* Is a VM accounted object */ #define VM_NORESERVE 0x00200000 /* should the VM suppress accounting */ #define VM_HUGETLB 0x00400000 /* Huge TLB Page VM */ -#define VM_NONLINEAR 0x00800000 /* Is non-linear (remap_file_pages) */ #define VM_ARCH_1 0x01000000 /* Architecture-specific flag */ #define VM_ARCH_2 0x02000000 #define VM_DONTDUMP 0x04000000 /* Do not include in the core dump */ diff --git a/include/linux/swapops.h b/include/linux/swapops.h index 6adfb7bfbf44..50cbc876be56 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -54,7 +54,7 @@ static inline pgoff_t swp_offset(swp_entry_t entry) /* check whether a pte points to a swap entry */ static inline int is_swap_pte(pte_t pte) { - return !pte_none(pte) && !pte_present_nonuma(pte) && !pte_file(pte); + return !pte_none(pte) && !pte_present_nonuma(pte); } #endif @@ -66,7 +66,6 @@ static inline swp_entry_t pte_to_swp_entry(pte_t pte) { swp_entry_t arch_entry; - BUG_ON(pte_file(pte)); if (pte_swp_soft_dirty(pte)) pte = pte_swp_clear_soft_dirty(pte); arch_entry = __pte_to_swp_entry(pte); @@ -82,7 +81,6 @@ static inline pte_t swp_entry_to_pte(swp_entry_t entry) swp_entry_t arch_entry; arch_entry = __swp_entry(swp_type(entry), swp_offset(entry)); - BUG_ON(pte_file(__swp_entry_to_pte(arch_entry))); return __swp_entry_to_pte(arch_entry); } diff --git a/mm/debug.c b/mm/debug.c index 0e58f3211f89..d69cb5a7ba9a 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -130,7 +130,6 @@ static const struct trace_print_flags vmaflags_names[] = { {VM_ACCOUNT, "account" }, {VM_NORESERVE, "noreserve" }, {VM_HUGETLB, "hugetlb" }, - {VM_NONLINEAR, "nonlinear" }, #if defined(CONFIG_X86) {VM_PAT, "pat" }, #elif defined(CONFIG_PPC) diff --git a/mm/gup.c b/mm/gup.c index 8dd50ce6326f..12bc2bc33da7 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -55,7 +55,7 @@ retry: */ if (likely(!(flags & FOLL_MIGRATION))) goto no_page; - if (pte_none(pte) || pte_file(pte)) + if (pte_none(pte)) goto no_page; entry = pte_to_swp_entry(pte); if (!is_migration_entry(entry)) diff --git a/mm/ksm.c b/mm/ksm.c index 15647fb0394f..4162dce2eb44 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1748,7 +1748,7 @@ int ksm_madvise(struct vm_area_struct *vma, unsigned long start, */ if (*vm_flags & (VM_MERGEABLE | VM_SHARED | VM_MAYSHARE | VM_PFNMAP | VM_IO | VM_DONTEXPAND | - VM_HUGETLB | VM_NONLINEAR | VM_MIXEDMAP)) + VM_HUGETLB | VM_MIXEDMAP)) return 0; /* just ignore the advice */ #ifdef VM_SAO diff --git a/mm/madvise.c b/mm/madvise.c index 917754d26c17..d79fb5e8f80a 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -155,7 +155,7 @@ static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start, pte = *(orig_pte + ((index - start) / PAGE_SIZE)); pte_unmap_unlock(orig_pte, ptl); - if (pte_present(pte) || pte_none(pte) || pte_file(pte)) + if (pte_present(pte) || pte_none(pte)) continue; entry = pte_to_swp_entry(pte); if (unlikely(non_swap_entry(entry))) @@ -296,7 +296,7 @@ static long madvise_remove(struct vm_area_struct *vma, *prev = NULL; /* tell sys_madvise we drop mmap_sem */ - if (vma->vm_flags & 
(VM_LOCKED|VM_NONLINEAR|VM_HUGETLB)) + if (vma->vm_flags & (VM_LOCKED | VM_HUGETLB)) return -EINVAL; f = vma->vm_file; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 683b4782019b..974fc4288bb5 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4928,8 +4928,6 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma, mapping = vma->vm_file->f_mapping; if (pte_none(ptent)) pgoff = linear_page_index(vma, addr); - else /* pte_file(ptent) is true */ - pgoff = pte_to_pgoff(ptent); /* page is moved even if it's not RSS of this task(page-faulted). */ #ifdef CONFIG_SWAP @@ -4961,7 +4959,7 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, page = mc_handle_present_pte(vma, addr, ptent); else if (is_swap_pte(ptent)) page = mc_handle_swap_pte(vma, addr, ptent, &ent); - else if (pte_none(ptent) || pte_file(ptent)) + else if (pte_none(ptent)) page = mc_handle_file_pte(vma, addr, ptent, &ent); if (!page && !ent.val) diff --git a/mm/memory.c b/mm/memory.c index 43a53743cbb4..9aa09217fe20 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -811,42 +811,40 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, /* pte contains position in swap or file, so copy. */ if (unlikely(!pte_present(pte))) { - if (!pte_file(pte)) { - swp_entry_t entry = pte_to_swp_entry(pte); - - if (likely(!non_swap_entry(entry))) { - if (swap_duplicate(entry) < 0) - return entry.val; - - /* make sure dst_mm is on swapoff's mmlist. */ - if (unlikely(list_empty(&dst_mm->mmlist))) { - spin_lock(&mmlist_lock); - if (list_empty(&dst_mm->mmlist)) - list_add(&dst_mm->mmlist, - &src_mm->mmlist); - spin_unlock(&mmlist_lock); - } - rss[MM_SWAPENTS]++; - } else if (is_migration_entry(entry)) { - page = migration_entry_to_page(entry); - - if (PageAnon(page)) - rss[MM_ANONPAGES]++; - else - rss[MM_FILEPAGES]++; - - if (is_write_migration_entry(entry) && - is_cow_mapping(vm_flags)) { - /* - * COW mappings require pages in both - * parent and child to be set to read. - */ - make_migration_entry_read(&entry); - pte = swp_entry_to_pte(entry); - if (pte_swp_soft_dirty(*src_pte)) - pte = pte_swp_mksoft_dirty(pte); - set_pte_at(src_mm, addr, src_pte, pte); - } + swp_entry_t entry = pte_to_swp_entry(pte); + + if (likely(!non_swap_entry(entry))) { + if (swap_duplicate(entry) < 0) + return entry.val; + + /* make sure dst_mm is on swapoff's mmlist. */ + if (unlikely(list_empty(&dst_mm->mmlist))) { + spin_lock(&mmlist_lock); + if (list_empty(&dst_mm->mmlist)) + list_add(&dst_mm->mmlist, + &src_mm->mmlist); + spin_unlock(&mmlist_lock); + } + rss[MM_SWAPENTS]++; + } else if (is_migration_entry(entry)) { + page = migration_entry_to_page(entry); + + if (PageAnon(page)) + rss[MM_ANONPAGES]++; + else + rss[MM_FILEPAGES]++; + + if (is_write_migration_entry(entry) && + is_cow_mapping(vm_flags)) { + /* + * COW mappings require pages in both + * parent and child to be set to read. + */ + make_migration_entry_read(&entry); + pte = swp_entry_to_pte(entry); + if (pte_swp_soft_dirty(*src_pte)) + pte = pte_swp_mksoft_dirty(pte); + set_pte_at(src_mm, addr, src_pte, pte); } } goto out_set_pte; @@ -1020,11 +1018,9 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, * readonly mappings. The tradeoff is that copy_page_range is more * efficient than faulting. 
*/ - if (!(vma->vm_flags & (VM_HUGETLB | VM_NONLINEAR | - VM_PFNMAP | VM_MIXEDMAP))) { - if (!vma->anon_vma) - return 0; - } + if (!(vma->vm_flags & (VM_HUGETLB | VM_PFNMAP | VM_MIXEDMAP)) && + !vma->anon_vma) + return 0; if (is_vm_hugetlb_page(vma)) return copy_hugetlb_page_range(dst_mm, src_mm, vma); diff --git a/mm/mincore.c b/mm/mincore.c index c8c528b36641..46527c023e0c 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -124,17 +124,13 @@ static void mincore_pte_range(struct vm_area_struct *vma, pmd_t *pmd, ptep = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); do { pte_t pte = *ptep; - pgoff_t pgoff; next = addr + PAGE_SIZE; if (pte_none(pte)) mincore_unmapped_range(vma, addr, next, vec); else if (pte_present(pte)) *vec = 1; - else if (pte_file(pte)) { - pgoff = pte_to_pgoff(pte); - *vec = mincore_page(vma->vm_file->f_mapping, pgoff); - } else { /* pte is a swap entry */ + else { /* pte is a swap entry */ swp_entry_t entry = pte_to_swp_entry(pte); if (non_swap_entry(entry)) { @@ -145,9 +141,8 @@ static void mincore_pte_range(struct vm_area_struct *vma, pmd_t *pmd, *vec = 1; } else { #ifdef CONFIG_SWAP - pgoff = entry.val; *vec = mincore_page(swap_address_space(entry), - pgoff); + entry.val); #else WARN_ON(1); *vec = 1; diff --git a/mm/mprotect.c b/mm/mprotect.c index ace93454ce8e..33121662f08b 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -105,7 +105,7 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, } if (updated) pages++; - } else if (IS_ENABLED(CONFIG_MIGRATION) && !pte_file(oldpte)) { + } else if (IS_ENABLED(CONFIG_MIGRATION)) { swp_entry_t entry = pte_to_swp_entry(oldpte); if (is_write_migration_entry(entry)) { diff --git a/mm/mremap.c b/mm/mremap.c index 17fa018f5f39..57dadc025c64 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -81,8 +81,6 @@ static pte_t move_soft_dirty_pte(pte_t pte) pte = pte_mksoft_dirty(pte); else if (is_swap_pte(pte)) pte = pte_swp_mksoft_dirty(pte); - else if (pte_file(pte)) - pte = pte_file_mksoft_dirty(pte); #endif return pte; } diff --git a/mm/msync.c b/mm/msync.c index 992a1673d488..bb04d53ae852 100644 --- a/mm/msync.c +++ b/mm/msync.c @@ -86,10 +86,7 @@ SYSCALL_DEFINE3(msync, unsigned long, start, size_t, len, int, flags) (vma->vm_flags & VM_SHARED)) { get_file(file); up_read(&mm->mmap_sem); - if (vma->vm_flags & VM_NONLINEAR) - error = vfs_fsync(file, 1); - else - error = vfs_fsync_range(file, fstart, fend, 1); + error = vfs_fsync_range(file, fstart, fend, 1); fput(file); if (error || start >= end) goto out; -- cgit v1.2.3 From 377d0333e532af96c7fff8402ca35a271a1691b6 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 30 Jan 2015 13:11:24 +1100 Subject: asm-generic: drop unused pte_file* helpers All users are gone. Signed-off-by: Kirill A. 
Shutemov Cc: Arnd Bergmann Signed-off-by: Andrew Morton --- include/asm-generic/pgtable.h | 15 --------------- 1 file changed, 15 deletions(-) (limited to 'include') diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index 177d5973b132..129de9204d18 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -474,21 +474,6 @@ static inline pte_t pte_swp_clear_soft_dirty(pte_t pte) { return pte; } - -static inline pte_t pte_file_clear_soft_dirty(pte_t pte) -{ - return pte; -} - -static inline pte_t pte_file_mksoft_dirty(pte_t pte) -{ - return pte; -} - -static inline int pte_file_soft_dirty(pte_t pte) -{ - return 0; -} #endif #ifndef __HAVE_PFNMAP_TRACKING -- cgit v1.2.3 From 0735f0d5e52cd3ba289027cafa9f0187ab55c5da Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Fri, 30 Jan 2015 13:11:32 +1100 Subject: mm: hugetlb: fix type of hugetlb_treat_as_movable variable hugetlb_treat_as_movable declared as unsigned long, but proc_dointvec() used for parsing it: static struct ctl_table vm_table[] = { ... { .procname = "hugepages_treat_as_movable", .data = &hugepages_treat_as_movable, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, This seems harmless, but it's better to use int type here. Signed-off-by: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Manfred Spraul Acked-by: David Rientjes Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 2 +- mm/hugetlb.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 431b7fc605c9..7d7856359920 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -86,7 +86,7 @@ void free_huge_page(struct page *page); pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud); #endif -extern unsigned long hugepages_treat_as_movable; +extern int hugepages_treat_as_movable; extern int sysctl_hugetlb_shm_group; extern struct list_head huge_boot_pages; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 85032de5e20f..be0e5d0db5ec 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -35,7 +35,7 @@ #include #include "internal.h" -unsigned long hugepages_treat_as_movable; +int hugepages_treat_as_movable; int hugetlb_max_hstate __read_mostly; unsigned int default_hstate_idx; -- cgit v1.2.3 From 25c34679513f373002c9a0092c3f31629dc53cd5 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Fri, 30 Jan 2015 13:11:32 +1100 Subject: memcg: zap __memcg_{charge,uncharge}_slab They are simple wrappers around memcg_{charge,uncharge}_kmem, so let's zap them and call these functions directly. 
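As a concrete illustration (mirroring the mm/slab.h hunk below; only the charge side is shown, the uncharge side is symmetric), the slab helpers simply open-code the page-count conversion that the wrappers used to hide:

	/* before: per-slab wrapper hides the memcg pointer and nr_pages */
	return __memcg_charge_slab(s, gfp, order);

	/* after: call memcg_charge_kmem() directly */
	return memcg_charge_kmem(s->memcg_params->memcg, gfp, 1 << order);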
Signed-off-by: Vladimir Davydov Cc: Johannes Weiner Cc: Michal Hocko Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 5 +++-- mm/memcontrol.c | 21 +++------------------ mm/slab.h | 4 ++-- 3 files changed, 8 insertions(+), 22 deletions(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 7c95af8d552c..18ccb2988979 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -403,8 +403,9 @@ void memcg_update_array_size(int num_groups); struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep); void __memcg_kmem_put_cache(struct kmem_cache *cachep); -int __memcg_charge_slab(struct kmem_cache *cachep, gfp_t gfp, int order); -void __memcg_uncharge_slab(struct kmem_cache *cachep, int order); +int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, + unsigned long nr_pages); +void memcg_uncharge_kmem(struct mem_cgroup *memcg, unsigned long nr_pages); int __memcg_cleanup_cache_params(struct kmem_cache *s); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 0f68ff3ede73..a12e827694e7 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2495,8 +2495,8 @@ static struct kmem_cache *memcg_params_to_cache(struct memcg_cache_params *p) return cache_from_memcg_idx(cachep, memcg_cache_id(p->memcg)); } -static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, - unsigned long nr_pages) +int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, + unsigned long nr_pages) { struct page_counter *counter; int ret = 0; @@ -2533,8 +2533,7 @@ static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, return ret; } -static void memcg_uncharge_kmem(struct mem_cgroup *memcg, - unsigned long nr_pages) +void memcg_uncharge_kmem(struct mem_cgroup *memcg, unsigned long nr_pages) { page_counter_uncharge(&memcg->memory, nr_pages); if (do_swap_account) @@ -2767,20 +2766,6 @@ static void memcg_schedule_register_cache(struct mem_cgroup *memcg, current->memcg_kmem_skip_account = 0; } -int __memcg_charge_slab(struct kmem_cache *cachep, gfp_t gfp, int order) -{ - unsigned int nr_pages = 1 << order; - - return memcg_charge_kmem(cachep->memcg_params->memcg, gfp, nr_pages); -} - -void __memcg_uncharge_slab(struct kmem_cache *cachep, int order) -{ - unsigned int nr_pages = 1 << order; - - memcg_uncharge_kmem(cachep->memcg_params->memcg, nr_pages); -} - /* * Return the kmem_cache we're supposed to use for a slab allocation. * We try to use the current memcg's version of the cache. 
diff --git a/mm/slab.h b/mm/slab.h index 1cf4005482dd..90430d6f665e 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -235,7 +235,7 @@ static __always_inline int memcg_charge_slab(struct kmem_cache *s, return 0; if (is_root_cache(s)) return 0; - return __memcg_charge_slab(s, gfp, order); + return memcg_charge_kmem(s->memcg_params->memcg, gfp, 1 << order); } static __always_inline void memcg_uncharge_slab(struct kmem_cache *s, int order) @@ -244,7 +244,7 @@ static __always_inline void memcg_uncharge_slab(struct kmem_cache *s, int order) return; if (is_root_cache(s)) return; - __memcg_uncharge_slab(s, order); + memcg_uncharge_kmem(s->memcg_params->memcg, 1 << order); } #else static inline bool is_root_cache(struct kmem_cache *s) -- cgit v1.2.3 From e1463251705c14b8bd0a12dfd1981db650616836 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Fri, 30 Jan 2015 13:11:33 +1100 Subject: memcg: zap memcg_name argument of memcg_create_kmem_cache Instead of passing the name of the memory cgroup which the cache is created for in the memcg_name_argument, let's obtain it immediately in memcg_create_kmem_cache. Signed-off-by: Vladimir Davydov Cc: Johannes Weiner Cc: Michal Hocko Signed-off-by: Andrew Morton --- include/linux/slab.h | 3 +-- mm/memcontrol.c | 5 +---- mm/slab_common.c | 9 +++++---- 3 files changed, 7 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/linux/slab.h b/include/linux/slab.h index 9a139b637069..eca9ed303a1b 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -117,8 +117,7 @@ struct kmem_cache *kmem_cache_create(const char *, size_t, size_t, void (*)(void *)); #ifdef CONFIG_MEMCG_KMEM struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *, - struct kmem_cache *, - const char *); + struct kmem_cache *); #endif void kmem_cache_destroy(struct kmem_cache *); int kmem_cache_shrink(struct kmem_cache *); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index a12e827694e7..85cbad080bdc 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2607,8 +2607,6 @@ void memcg_update_array_size(int num) static void memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *root_cache) { - static char memcg_name_buf[NAME_MAX + 1]; /* protected by - memcg_slab_mutex */ struct kmem_cache *cachep; int id; @@ -2624,8 +2622,7 @@ static void memcg_register_cache(struct mem_cgroup *memcg, if (cache_from_memcg_idx(root_cache, id)) return; - cgroup_name(memcg->css.cgroup, memcg_name_buf, NAME_MAX + 1); - cachep = memcg_create_kmem_cache(memcg, root_cache, memcg_name_buf); + cachep = memcg_create_kmem_cache(memcg, root_cache); /* * If we could not create a memcg cache, do not complain, because * that's not critical at all as we can always proceed with the root diff --git a/mm/slab_common.c b/mm/slab_common.c index e03dd6f2a272..b958f27d1833 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -430,16 +430,15 @@ EXPORT_SYMBOL(kmem_cache_create); * memcg_create_kmem_cache - Create a cache for a memory cgroup. * @memcg: The memory cgroup the new cache is for. * @root_cache: The parent of the new cache. - * @memcg_name: The name of the memory cgroup (used for naming the new cache). * * This function attempts to create a kmem cache that will serve allocation * requests going from @memcg to @root_cache. The new cache inherits properties * from its parent. 
*/ struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg, - struct kmem_cache *root_cache, - const char *memcg_name) + struct kmem_cache *root_cache) { + static char memcg_name_buf[NAME_MAX + 1]; /* protected by slab_mutex */ struct kmem_cache *s = NULL; char *cache_name; @@ -448,8 +447,10 @@ struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg, mutex_lock(&slab_mutex); + cgroup_name(mem_cgroup_css(memcg)->cgroup, + memcg_name_buf, sizeof(memcg_name_buf)); cache_name = kasprintf(GFP_KERNEL, "%s(%d:%s)", root_cache->name, - memcg_cache_id(memcg), memcg_name); + memcg_cache_id(memcg), memcg_name_buf); if (!cache_name) goto out_unlock; -- cgit v1.2.3 From 16e253c421179f1d60f9aeb02c99ca01ece3a9a8 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Fri, 30 Jan 2015 13:11:33 +1100 Subject: memcg: zap memcg_slab_caches and memcg_slab_mutex mem_cgroup->memcg_slab_caches is a list of kmem caches corresponding to the given cgroup. Currently, it is only used on css free in order to destroy all caches corresponding to the memory cgroup being freed. The list is protected by memcg_slab_mutex. The mutex is also used to protect kmem_cache->memcg_params->memcg_caches arrays and synchronizes kmem_cache_destroy vs memcg_unregister_all_caches. However, we can perfectly get on without these two. To destroy all caches corresponding to a memory cgroup, we can walk over the global list of kmem caches, slab_caches, and we can do all the synchronization stuff using the slab_mutex instead of the memcg_slab_mutex. This patch therefore gets rid of the memcg_slab_caches and memcg_slab_mutex. Apart from this nice cleanup, it also: - assures that rcu_barrier() is called once at max when a root cache is destroyed or a memory cgroup is freed, no matter how many caches have SLAB_DESTROY_BY_RCU flag set; - fixes the race between kmem_cache_destroy and kmem_cache_create that exists, because memcg_cleanup_cache_params, which is called from kmem_cache_destroy after checking that kmem_cache->refcount=0, releases the slab_mutex, which gives kmem_cache_create a chance to make an alias to a cache doomed to be destroyed. Signed-off-by: Vladimir Davydov Cc: Johannes Weiner Cc: Michal Hocko Acked-by: Christoph Lameter Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 2 - include/linux/slab.h | 6 +- mm/memcontrol.c | 156 +++++---------------------------------------- mm/slab_common.c | 142 +++++++++++++++++++++++++++++------------ 4 files changed, 120 insertions(+), 186 deletions(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 18ccb2988979..fb212e1d700d 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -407,8 +407,6 @@ int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, unsigned long nr_pages); void memcg_uncharge_kmem(struct mem_cgroup *memcg, unsigned long nr_pages); -int __memcg_cleanup_cache_params(struct kmem_cache *s); - /** * memcg_kmem_newpage_charge: verify if a new kmem allocation is allowed. * @gfp: the gfp allocation flags. 
diff --git a/include/linux/slab.h b/include/linux/slab.h index eca9ed303a1b..2e3b448cfa2d 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -116,8 +116,8 @@ struct kmem_cache *kmem_cache_create(const char *, size_t, size_t, unsigned long, void (*)(void *)); #ifdef CONFIG_MEMCG_KMEM -struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *, - struct kmem_cache *); +void memcg_create_kmem_cache(struct mem_cgroup *, struct kmem_cache *); +void memcg_destroy_kmem_caches(struct mem_cgroup *); #endif void kmem_cache_destroy(struct kmem_cache *); int kmem_cache_shrink(struct kmem_cache *); @@ -490,7 +490,6 @@ static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node) * Child caches will hold extra metadata needed for its operation. Fields are: * * @memcg: pointer to the memcg this cache belongs to - * @list: list_head for the list of all caches in this memcg * @root_cache: pointer to the global, root cache, this cache was derived from */ struct memcg_cache_params { @@ -502,7 +501,6 @@ struct memcg_cache_params { }; struct { struct mem_cgroup *memcg; - struct list_head list; struct kmem_cache *root_cache; }; }; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 85cbad080bdc..45d47c2ddda5 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -343,9 +343,6 @@ struct mem_cgroup { struct cg_proto tcp_mem; #endif #if defined(CONFIG_MEMCG_KMEM) - /* analogous to slab_common's slab_caches list, but per-memcg; - * protected by memcg_slab_mutex */ - struct list_head memcg_slab_caches; /* Index in the kmem_cache->memcg_params->memcg_caches array */ int kmemcg_id; #endif @@ -2476,25 +2473,6 @@ static void commit_charge(struct page *page, struct mem_cgroup *memcg, } #ifdef CONFIG_MEMCG_KMEM -/* - * The memcg_slab_mutex is held whenever a per memcg kmem cache is created or - * destroyed. It protects memcg_caches arrays and memcg_slab_caches lists. - */ -static DEFINE_MUTEX(memcg_slab_mutex); - -/* - * This is a bit cumbersome, but it is rarely used and avoids a backpointer - * in the memcg_cache_params struct. - */ -static struct kmem_cache *memcg_params_to_cache(struct memcg_cache_params *p) -{ - struct kmem_cache *cachep; - - VM_BUG_ON(p->is_root_cache); - cachep = p->root_cache; - return cache_from_memcg_idx(cachep, memcg_cache_id(p->memcg)); -} - int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, unsigned long nr_pages) { @@ -2578,10 +2556,7 @@ static int memcg_alloc_cache_id(void) else if (size > MEMCG_CACHES_MAX_SIZE) size = MEMCG_CACHES_MAX_SIZE; - mutex_lock(&memcg_slab_mutex); err = memcg_update_all_caches(size); - mutex_unlock(&memcg_slab_mutex); - if (err) { ida_simple_remove(&kmem_limited_groups, id); return err; @@ -2604,120 +2579,20 @@ void memcg_update_array_size(int num) memcg_limited_groups_array_size = num; } -static void memcg_register_cache(struct mem_cgroup *memcg, - struct kmem_cache *root_cache) -{ - struct kmem_cache *cachep; - int id; - - lockdep_assert_held(&memcg_slab_mutex); - - id = memcg_cache_id(memcg); - - /* - * Since per-memcg caches are created asynchronously on first - * allocation (see memcg_kmem_get_cache()), several threads can try to - * create the same cache, but only one of them may succeed. - */ - if (cache_from_memcg_idx(root_cache, id)) - return; - - cachep = memcg_create_kmem_cache(memcg, root_cache); - /* - * If we could not create a memcg cache, do not complain, because - * that's not critical at all as we can always proceed with the root - * cache. 
- */ - if (!cachep) - return; - - list_add(&cachep->memcg_params->list, &memcg->memcg_slab_caches); - - /* - * Since readers won't lock (see cache_from_memcg_idx()), we need a - * barrier here to ensure nobody will see the kmem_cache partially - * initialized. - */ - smp_wmb(); - - BUG_ON(root_cache->memcg_params->memcg_caches[id]); - root_cache->memcg_params->memcg_caches[id] = cachep; -} - -static void memcg_unregister_cache(struct kmem_cache *cachep) -{ - struct kmem_cache *root_cache; - struct mem_cgroup *memcg; - int id; - - lockdep_assert_held(&memcg_slab_mutex); - - BUG_ON(is_root_cache(cachep)); - - root_cache = cachep->memcg_params->root_cache; - memcg = cachep->memcg_params->memcg; - id = memcg_cache_id(memcg); - - BUG_ON(root_cache->memcg_params->memcg_caches[id] != cachep); - root_cache->memcg_params->memcg_caches[id] = NULL; - - list_del(&cachep->memcg_params->list); - - kmem_cache_destroy(cachep); -} - -int __memcg_cleanup_cache_params(struct kmem_cache *s) -{ - struct kmem_cache *c; - int i, failed = 0; - - mutex_lock(&memcg_slab_mutex); - for_each_memcg_cache_index(i) { - c = cache_from_memcg_idx(s, i); - if (!c) - continue; - - memcg_unregister_cache(c); - - if (cache_from_memcg_idx(s, i)) - failed++; - } - mutex_unlock(&memcg_slab_mutex); - return failed; -} - -static void memcg_unregister_all_caches(struct mem_cgroup *memcg) -{ - struct kmem_cache *cachep; - struct memcg_cache_params *params, *tmp; - - if (!memcg_kmem_is_active(memcg)) - return; - - mutex_lock(&memcg_slab_mutex); - list_for_each_entry_safe(params, tmp, &memcg->memcg_slab_caches, list) { - cachep = memcg_params_to_cache(params); - memcg_unregister_cache(cachep); - } - mutex_unlock(&memcg_slab_mutex); -} - -struct memcg_register_cache_work { +struct memcg_kmem_cache_create_work { struct mem_cgroup *memcg; struct kmem_cache *cachep; struct work_struct work; }; -static void memcg_register_cache_func(struct work_struct *w) +static void memcg_kmem_cache_create_func(struct work_struct *w) { - struct memcg_register_cache_work *cw = - container_of(w, struct memcg_register_cache_work, work); + struct memcg_kmem_cache_create_work *cw = + container_of(w, struct memcg_kmem_cache_create_work, work); struct mem_cgroup *memcg = cw->memcg; struct kmem_cache *cachep = cw->cachep; - mutex_lock(&memcg_slab_mutex); - memcg_register_cache(memcg, cachep); - mutex_unlock(&memcg_slab_mutex); + memcg_create_kmem_cache(memcg, cachep); css_put(&memcg->css); kfree(cw); @@ -2726,10 +2601,10 @@ static void memcg_register_cache_func(struct work_struct *w) /* * Enqueue the creation of a per-memcg kmem_cache. 
*/ -static void __memcg_schedule_register_cache(struct mem_cgroup *memcg, - struct kmem_cache *cachep) +static void __memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg, + struct kmem_cache *cachep) { - struct memcg_register_cache_work *cw; + struct memcg_kmem_cache_create_work *cw; cw = kmalloc(sizeof(*cw), GFP_NOWAIT); if (!cw) @@ -2739,18 +2614,18 @@ static void __memcg_schedule_register_cache(struct mem_cgroup *memcg, cw->memcg = memcg; cw->cachep = cachep; + INIT_WORK(&cw->work, memcg_kmem_cache_create_func); - INIT_WORK(&cw->work, memcg_register_cache_func); schedule_work(&cw->work); } -static void memcg_schedule_register_cache(struct mem_cgroup *memcg, - struct kmem_cache *cachep) +static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg, + struct kmem_cache *cachep) { /* * We need to stop accounting when we kmalloc, because if the * corresponding kmalloc cache is not yet created, the first allocation - * in __memcg_schedule_register_cache will recurse. + * in __memcg_schedule_kmem_cache_create will recurse. * * However, it is better to enclose the whole function. Depending on * the debugging options enabled, INIT_WORK(), for instance, can @@ -2759,7 +2634,7 @@ static void memcg_schedule_register_cache(struct mem_cgroup *memcg, * the safest choice is to do it like this, wrapping the whole function. */ current->memcg_kmem_skip_account = 1; - __memcg_schedule_register_cache(memcg, cachep); + __memcg_schedule_kmem_cache_create(memcg, cachep); current->memcg_kmem_skip_account = 0; } @@ -2807,7 +2682,7 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep) * could happen with the slab_mutex held. So it's better to * defer everything. */ - memcg_schedule_register_cache(memcg, cachep); + memcg_schedule_kmem_cache_create(memcg, cachep); out: css_put(&memcg->css); return cachep; @@ -4136,7 +4011,7 @@ static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) static void memcg_destroy_kmem(struct mem_cgroup *memcg) { - memcg_unregister_all_caches(memcg); + memcg_destroy_kmem_caches(memcg); mem_cgroup_sockets_destroy(memcg); } #else @@ -4664,7 +4539,6 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) spin_lock_init(&memcg->event_list_lock); #ifdef CONFIG_MEMCG_KMEM memcg->kmemcg_id = -1; - INIT_LIST_HEAD(&memcg->memcg_slab_caches); #endif return &memcg->css; diff --git a/mm/slab_common.c b/mm/slab_common.c index b958f27d1833..481cf81eadc3 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -425,6 +425,49 @@ out_unlock: } EXPORT_SYMBOL(kmem_cache_create); +static int do_kmem_cache_shutdown(struct kmem_cache *s, + struct list_head *release, bool *need_rcu_barrier) +{ + if (__kmem_cache_shutdown(s) != 0) { + printk(KERN_ERR "kmem_cache_destroy %s: " + "Slab cache still has objects\n", s->name); + dump_stack(); + return -EBUSY; + } + + if (s->flags & SLAB_DESTROY_BY_RCU) + *need_rcu_barrier = true; + +#ifdef CONFIG_MEMCG_KMEM + if (!is_root_cache(s)) { + struct kmem_cache *root_cache = s->memcg_params->root_cache; + int memcg_id = memcg_cache_id(s->memcg_params->memcg); + + BUG_ON(root_cache->memcg_params->memcg_caches[memcg_id] != s); + root_cache->memcg_params->memcg_caches[memcg_id] = NULL; + } +#endif + list_move(&s->list, release); + return 0; +} + +static void do_kmem_cache_release(struct list_head *release, + bool need_rcu_barrier) +{ + struct kmem_cache *s, *s2; + + if (need_rcu_barrier) + rcu_barrier(); + + list_for_each_entry_safe(s, s2, release, list) { +#ifdef SLAB_SUPPORTS_SYSFS + sysfs_slab_remove(s); +#else + 
slab_kmem_cache_release(s); +#endif + } +} + #ifdef CONFIG_MEMCG_KMEM /* * memcg_create_kmem_cache - Create a cache for a memory cgroup. @@ -435,10 +478,11 @@ EXPORT_SYMBOL(kmem_cache_create); * requests going from @memcg to @root_cache. The new cache inherits properties * from its parent. */ -struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg, - struct kmem_cache *root_cache) +void memcg_create_kmem_cache(struct mem_cgroup *memcg, + struct kmem_cache *root_cache) { static char memcg_name_buf[NAME_MAX + 1]; /* protected by slab_mutex */ + int memcg_id = memcg_cache_id(memcg); struct kmem_cache *s = NULL; char *cache_name; @@ -447,6 +491,14 @@ struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg, mutex_lock(&slab_mutex); + /* + * Since per-memcg caches are created asynchronously on first + * allocation (see memcg_kmem_get_cache()), several threads can try to + * create the same cache, but only one of them may succeed. + */ + if (cache_from_memcg_idx(root_cache, memcg_id)) + goto out_unlock; + cgroup_name(mem_cgroup_css(memcg)->cgroup, memcg_name_buf, sizeof(memcg_name_buf)); cache_name = kasprintf(GFP_KERNEL, "%s(%d:%s)", root_cache->name, @@ -458,49 +510,73 @@ struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg, root_cache->size, root_cache->align, root_cache->flags, root_cache->ctor, memcg, root_cache); + /* + * If we could not create a memcg cache, do not complain, because + * that's not critical at all as we can always proceed with the root + * cache. + */ if (IS_ERR(s)) { kfree(cache_name); - s = NULL; + goto out_unlock; } + /* + * Since readers won't lock (see cache_from_memcg_idx()), we need a + * barrier here to ensure nobody will see the kmem_cache partially + * initialized. + */ + smp_wmb(); + root_cache->memcg_params->memcg_caches[memcg_id] = s; + out_unlock: mutex_unlock(&slab_mutex); put_online_mems(); put_online_cpus(); - - return s; } -static int memcg_cleanup_cache_params(struct kmem_cache *s) +void memcg_destroy_kmem_caches(struct mem_cgroup *memcg) { - int rc; + LIST_HEAD(release); + bool need_rcu_barrier = false; + struct kmem_cache *s, *s2; - if (!s->memcg_params || - !s->memcg_params->is_root_cache) - return 0; + get_online_cpus(); + get_online_mems(); - mutex_unlock(&slab_mutex); - rc = __memcg_cleanup_cache_params(s); mutex_lock(&slab_mutex); + list_for_each_entry_safe(s, s2, &slab_caches, list) { + if (is_root_cache(s) || s->memcg_params->memcg != memcg) + continue; + /* + * The cgroup is about to be freed and therefore has no charges + * left. Hence, all its caches must be empty by now. 
+ */ + BUG_ON(do_kmem_cache_shutdown(s, &release, &need_rcu_barrier)); + } + mutex_unlock(&slab_mutex); - return rc; -} -#else -static int memcg_cleanup_cache_params(struct kmem_cache *s) -{ - return 0; + put_online_mems(); + put_online_cpus(); + + do_kmem_cache_release(&release, need_rcu_barrier); } #endif /* CONFIG_MEMCG_KMEM */ void slab_kmem_cache_release(struct kmem_cache *s) { + memcg_free_cache_params(s); kfree(s->name); kmem_cache_free(kmem_cache, s); } void kmem_cache_destroy(struct kmem_cache *s) { + int i; + LIST_HEAD(release); + bool need_rcu_barrier = false; + bool busy = false; + get_online_cpus(); get_online_mems(); @@ -510,35 +586,23 @@ void kmem_cache_destroy(struct kmem_cache *s) if (s->refcount) goto out_unlock; - if (memcg_cleanup_cache_params(s) != 0) - goto out_unlock; + for_each_memcg_cache_index(i) { + struct kmem_cache *c = cache_from_memcg_idx(s, i); - if (__kmem_cache_shutdown(s) != 0) { - printk(KERN_ERR "kmem_cache_destroy %s: " - "Slab cache still has objects\n", s->name); - dump_stack(); - goto out_unlock; + if (c && do_kmem_cache_shutdown(c, &release, &need_rcu_barrier)) + busy = true; } - list_del(&s->list); - - mutex_unlock(&slab_mutex); - if (s->flags & SLAB_DESTROY_BY_RCU) - rcu_barrier(); - - memcg_free_cache_params(s); -#ifdef SLAB_SUPPORTS_SYSFS - sysfs_slab_remove(s); -#else - slab_kmem_cache_release(s); -#endif - goto out; + if (!busy) + do_kmem_cache_shutdown(s, &release, &need_rcu_barrier); out_unlock: mutex_unlock(&slab_mutex); -out: + put_online_mems(); put_online_cpus(); + + do_kmem_cache_release(&release, need_rcu_barrier); } EXPORT_SYMBOL(kmem_cache_destroy); -- cgit v1.2.3 From 4f442cd0b00be2902228ab12b04a13041488ac20 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 30 Jan 2015 13:11:33 +1100 Subject: mm: add fields for compound destructor and order into struct page Currently, we use lru.next/lru.prev plus cast to access or set destructor and order of compound page. Let's replace it with explicit fields in struct page. Signed-off-by: Kirill A. Shutemov Acked-by: Jerome Marchand Acked-by: Christoph Lameter Acked-by: Johannes Weiner Signed-off-by: Andrew Morton --- include/linux/mm.h | 9 ++++----- include/linux/mm_types.h | 8 ++++++++ 2 files changed, 12 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index a0da685bdb82..65b9d7b2a4cf 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -620,29 +620,28 @@ int split_free_page(struct page *page); * prototype for that function and accessor functions. * These are _only_ valid on the head of a PG_compound page. 
*/ -typedef void compound_page_dtor(struct page *); static inline void set_compound_page_dtor(struct page *page, compound_page_dtor *dtor) { - page[1].lru.next = (void *)dtor; + page[1].compound_dtor = dtor; } static inline compound_page_dtor *get_compound_page_dtor(struct page *page) { - return (compound_page_dtor *)page[1].lru.next; + return page[1].compound_dtor; } static inline int compound_order(struct page *page) { if (!PageHead(page)) return 0; - return (unsigned long)page[1].lru.prev; + return page[1].compound_order; } static inline void set_compound_order(struct page *page, unsigned long order) { - page[1].lru.prev = (void *)order; + page[1].compound_order = order; } #ifdef CONFIG_MMU diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 07c8bd3f7b48..20ff2105b564 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -28,6 +28,8 @@ struct mem_cgroup; IS_ENABLED(CONFIG_ARCH_ENABLE_SPLIT_PMD_PTLOCK)) #define ALLOC_SPLIT_PTLOCKS (SPINLOCK_SIZE > BITS_PER_LONG/8) +typedef void compound_page_dtor(struct page *); + /* * Each physical page in the system has a struct page associated with * it to keep track of whatever it is we are using the page for at the @@ -142,6 +144,12 @@ struct page { struct rcu_head rcu_head; /* Used by SLAB * when destroying via RCU */ + /* First tail page of compound page */ + struct { + compound_page_dtor *compound_dtor; + unsigned long compound_order; + }; + #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && USE_SPLIT_PMD_PTLOCKS pgtable_t pmd_huge_pte; /* protected by page->ptl */ #endif -- cgit v1.2.3 From 93c321ea143419a547f60427eca0f574ae6253df Mon Sep 17 00:00:00 2001 From: "Wang, Yalin" Date: Fri, 30 Jan 2015 13:11:33 +1100 Subject: mm: add VM_BUG_ON_PAGE() to page_mapcount() Add VM_BUG_ON_PAGE() for slab pages. _mapcount is an union with slab struct in struct page, so we must avoid accessing _mapcount if this page is a slab page. Also remove the unneeded bracket. Signed-off-by: Yalin Wang Acked-by: Kirill A. Shutemov Signed-off-by: Andrew Morton --- include/linux/mm.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 65b9d7b2a4cf..f189197363d2 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -477,7 +477,8 @@ static inline void page_mapcount_reset(struct page *page) static inline int page_mapcount(struct page *page) { - return atomic_read(&(page)->_mapcount) + 1; + VM_BUG_ON_PAGE(PageSlab(page), page); + return atomic_read(&page->_mapcount) + 1; } static inline int page_count(struct page *page) -- cgit v1.2.3 From fe67bd237164831f7be4ff817aedcb878e5c2aa5 Mon Sep 17 00:00:00 2001 From: "Wang, Yalin" Date: Fri, 30 Jan 2015 13:11:33 +1100 Subject: mm:add KPF_ZERO_PAGE flag for /proc/kpageflags Add KPF_ZERO_PAGE flag for zero_page, so that userspace processes can detect zero_page in /proc/kpageflags, and then do memory analysis more accurately. Signed-off-by: Yalin Wang Acked-by: Kirill A. 
Shutemov Cc: Konstantin Khlebnikov Cc: Naoya Horiguchi Signed-off-by: Andrew Morton --- Documentation/vm/pagemap.txt | 8 ++++++++ fs/proc/page.c | 16 +++++++++++++--- include/linux/huge_mm.h | 12 ++++++++++++ include/uapi/linux/kernel-page-flags.h | 1 + mm/huge_memory.c | 7 +------ tools/vm/page-types.c | 1 + 6 files changed, 36 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/Documentation/vm/pagemap.txt b/Documentation/vm/pagemap.txt index 5948e455c4d2..6fbd55ef6b45 100644 --- a/Documentation/vm/pagemap.txt +++ b/Documentation/vm/pagemap.txt @@ -62,6 +62,8 @@ There are three components to pagemap: 20. NOPAGE 21. KSM 22. THP + 23. BALLOON + 24. ZERO_PAGE Short descriptions to the page flags: @@ -102,6 +104,12 @@ Short descriptions to the page flags: 22. THP contiguous pages which construct transparent hugepages +23. BALLOON + balloon compaction page + +24. ZERO_PAGE + zero page for pfn_zero or huge_zero page + [IO related page flags] 1. ERROR IO error occurred 3. UPTODATE page has up-to-date data diff --git a/fs/proc/page.c b/fs/proc/page.c index 1e3187da1fed..7eee2d8b97d9 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include @@ -121,9 +122,18 @@ u64 stable_page_flags(struct page *page) * just checks PG_head/PG_tail, so we need to check PageLRU/PageAnon * to make sure a given page is a thp, not a non-huge compound page. */ - else if (PageTransCompound(page) && (PageLRU(compound_head(page)) || - PageAnon(compound_head(page)))) - u |= 1 << KPF_THP; + else if (PageTransCompound(page)) { + struct page *head = compound_head(page); + + if (PageLRU(head) || PageAnon(head)) + u |= 1 << KPF_THP; + else if (is_huge_zero_page(head)) { + u |= 1 << KPF_ZERO_PAGE; + u |= 1 << KPF_THP; + } + } else if (is_zero_pfn(page_to_pfn(page))) + u |= 1 << KPF_ZERO_PAGE; + /* * Caveats on high order pages: page->_count will only be set diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index ad9051bab267..f10b20f05159 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -157,6 +157,13 @@ static inline int hpage_nr_pages(struct page *page) extern int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pmd_t pmd, pmd_t *pmdp); +extern struct page *huge_zero_page; + +static inline bool is_huge_zero_page(struct page *page) +{ + return ACCESS_ONCE(huge_zero_page) == page; +} + #else /* CONFIG_TRANSPARENT_HUGEPAGE */ #define HPAGE_PMD_SHIFT ({ BUILD_BUG(); 0; }) #define HPAGE_PMD_MASK ({ BUILD_BUG(); 0; }) @@ -206,6 +213,11 @@ static inline int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_str return 0; } +static inline bool is_huge_zero_page(struct page *page) +{ + return false; +} + #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif /* _LINUX_HUGE_MM_H */ diff --git a/include/uapi/linux/kernel-page-flags.h b/include/uapi/linux/kernel-page-flags.h index 2f96d233c980..a6c4962e5d46 100644 --- a/include/uapi/linux/kernel-page-flags.h +++ b/include/uapi/linux/kernel-page-flags.h @@ -32,6 +32,7 @@ #define KPF_KSM 21 #define KPF_THP 22 #define KPF_BALLOON 23 +#define KPF_ZERO_PAGE 24 #endif /* _UAPILINUX_KERNEL_PAGE_FLAGS_H */ diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 817a875f2b8c..889713180980 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -171,12 +171,7 @@ static int start_khugepaged(void) } static atomic_t huge_zero_refcount; -static struct page *huge_zero_page __read_mostly; - -static inline bool is_huge_zero_page(struct page 
*page) -{ - return ACCESS_ONCE(huge_zero_page) == page; -} +struct page *huge_zero_page __read_mostly; static inline bool is_huge_zero_pmd(pmd_t pmd) { diff --git a/tools/vm/page-types.c b/tools/vm/page-types.c index 264fbc297e0b..8bdf16b8ba60 100644 --- a/tools/vm/page-types.c +++ b/tools/vm/page-types.c @@ -133,6 +133,7 @@ static const char * const page_flag_names[] = { [KPF_KSM] = "x:ksm", [KPF_THP] = "t:thp", [KPF_BALLOON] = "o:balloon", + [KPF_ZERO_PAGE] = "z:zero_page", [KPF_RESERVED] = "r:reserved", [KPF_MLOCKED] = "m:mlocked", -- cgit v1.2.3 From 09fa080333515b340389f38d36519f5ad9100bf0 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Fri, 30 Jan 2015 13:11:34 +1100 Subject: swap: remove unused mem_cgroup_uncharge_swapcache declaration The body of this function was removed by commit 0a31bc97c80c ("mm: memcontrol: rewrite uncharge API"). Signed-off-by: Vladimir Davydov Acked-by: Michal Hocko Acked-by: Johannes Weiner Signed-off-by: Andrew Morton --- include/linux/swap.h | 15 --------------- mm/shmem.c | 2 +- 2 files changed, 1 insertion(+), 16 deletions(-) (limited to 'include') diff --git a/include/linux/swap.h b/include/linux/swap.h index 34e8b60ab973..7067eca501e2 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -437,16 +437,6 @@ extern int reuse_swap_page(struct page *); extern int try_to_free_swap(struct page *); struct backing_dev_info; -#ifdef CONFIG_MEMCG -extern void -mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout); -#else -static inline void -mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout) -{ -} -#endif - #else /* CONFIG_SWAP */ #define swap_address_space(entry) (NULL) @@ -547,11 +537,6 @@ static inline swp_entry_t get_swap_page(void) return entry; } -static inline void -mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent) -{ -} - #endif /* CONFIG_SWAP */ #endif /* __KERNEL__*/ #endif /* _LINUX_SWAP_H */ diff --git a/mm/shmem.c b/mm/shmem.c index 05a2d70a9244..f69d296bd0a3 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1131,7 +1131,7 @@ repeat: * truncated or holepunched since swap was confirmed. * shmem_undo_range() will have done some of the * unaccounting, now delete_from_swap_cache() will do - * the rest (including mem_cgroup_uncharge_swapcache). + * the rest. * Reset swap.val? No, leave it so "failed" goes back to * "repeat": reading a hole and writing should succeed. */ -- cgit v1.2.3 From c4110c87285377497ec2347cb7eb45707f1dbb87 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Fri, 30 Jan 2015 13:11:34 +1100 Subject: mm: memcontrol: track move_lock state internally The complexity of memcg page stat synchronization is currently leaking into the callsites, forcing them to keep track of the move_lock state and the IRQ flags. Simplify the API by tracking it in the memcg. 
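As a rough before/after sketch of a callsite (compare the mm/rmap.c and mm/page-writeback.c hunks below):

	/* before: every caller carries the lock state and IRQ flags */
	bool locked;
	unsigned long flags;

	memcg = mem_cgroup_begin_page_stat(page, &locked, &flags);
	mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED);
	mem_cgroup_end_page_stat(memcg, &locked, &flags);

	/* after: the memcg records the lock holder and flags internally */
	memcg = mem_cgroup_begin_page_stat(page);
	mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED);
	mem_cgroup_end_page_stat(memcg);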
Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Reviewed-by: Vladimir Davydov Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 6 ++-- mm/memcontrol.c | 68 ++++++++++++++++++++++++++-------------------- mm/page-writeback.c | 12 +++----- mm/rmap.c | 12 +++----- 4 files changed, 49 insertions(+), 49 deletions(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index fb212e1d700d..04d3c2028782 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -138,12 +138,10 @@ static inline bool mem_cgroup_disabled(void) return false; } -struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page, bool *locked, - unsigned long *flags); -void mem_cgroup_end_page_stat(struct mem_cgroup *memcg, bool *locked, - unsigned long *flags); +struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page); void mem_cgroup_update_page_stat(struct mem_cgroup *memcg, enum mem_cgroup_stat_index idx, int val); +void mem_cgroup_end_page_stat(struct mem_cgroup *memcg); static inline void mem_cgroup_inc_page_stat(struct mem_cgroup *memcg, enum mem_cgroup_stat_index idx) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 45d47c2ddda5..a6140c0764f4 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -325,9 +325,11 @@ struct mem_cgroup { /* * set > 0 if pages under this cgroup are moving to other cgroup. */ - atomic_t moving_account; + atomic_t moving_account; /* taken only while moving_account > 0 */ - spinlock_t move_lock; + spinlock_t move_lock; + struct task_struct *move_lock_task; + unsigned long move_lock_flags; /* * percpu counter. */ @@ -1977,34 +1979,33 @@ cleanup: /** * mem_cgroup_begin_page_stat - begin a page state statistics transaction * @page: page that is going to change accounted state - * @locked: &memcg->move_lock slowpath was taken - * @flags: IRQ-state flags for &memcg->move_lock * * This function must mark the beginning of an accounted page state * change to prevent double accounting when the page is concurrently * being moved to another memcg: * - * memcg = mem_cgroup_begin_page_stat(page, &locked, &flags); + * memcg = mem_cgroup_begin_page_stat(page); * if (TestClearPageState(page)) * mem_cgroup_update_page_stat(memcg, state, -1); - * mem_cgroup_end_page_stat(memcg, locked, flags); - * - * The RCU lock is held throughout the transaction. The fast path can - * get away without acquiring the memcg->move_lock (@locked is false) - * because page moving starts with an RCU grace period. - * - * The RCU lock also protects the memcg from being freed when the page - * state that is going to change is the only thing preventing the page - * from being uncharged. E.g. end-writeback clearing PageWriteback(), - * which allows migration to go ahead and uncharge the page before the - * account transaction might be complete. + * mem_cgroup_end_page_stat(memcg); */ -struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page, - bool *locked, - unsigned long *flags) +struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page) { struct mem_cgroup *memcg; + unsigned long flags; + /* + * The RCU lock is held throughout the transaction. The fast + * path can get away without acquiring the memcg->move_lock + * because page moving starts with an RCU grace period. + * + * The RCU lock also protects the memcg from being freed when + * the page state that is going to change is the only thing + * preventing the page from being uncharged. + * E.g. 
end-writeback clearing PageWriteback(), which allows + * migration to go ahead and uncharge the page before the + * account transaction might be complete. + */ rcu_read_lock(); if (mem_cgroup_disabled()) @@ -2014,16 +2015,22 @@ again: if (unlikely(!memcg)) return NULL; - *locked = false; if (atomic_read(&memcg->moving_account) <= 0) return memcg; - spin_lock_irqsave(&memcg->move_lock, *flags); + spin_lock_irqsave(&memcg->move_lock, flags); if (memcg != page->mem_cgroup) { - spin_unlock_irqrestore(&memcg->move_lock, *flags); + spin_unlock_irqrestore(&memcg->move_lock, flags); goto again; } - *locked = true; + + /* + * When charge migration first begins, we can have locked and + * unlocked page stat updates happening concurrently. Track + * the task who has the lock for mem_cgroup_end_page_stat(). + */ + memcg->move_lock_task = current; + memcg->move_lock_flags = flags; return memcg; } @@ -2031,14 +2038,17 @@ again: /** * mem_cgroup_end_page_stat - finish a page state statistics transaction * @memcg: the memcg that was accounted against - * @locked: value received from mem_cgroup_begin_page_stat() - * @flags: value received from mem_cgroup_begin_page_stat() */ -void mem_cgroup_end_page_stat(struct mem_cgroup *memcg, bool *locked, - unsigned long *flags) +void mem_cgroup_end_page_stat(struct mem_cgroup *memcg) { - if (memcg && *locked) - spin_unlock_irqrestore(&memcg->move_lock, *flags); + if (memcg && memcg->move_lock_task == current) { + unsigned long flags = memcg->move_lock_flags; + + memcg->move_lock_task = NULL; + memcg->move_lock_flags = 0; + + spin_unlock_irqrestore(&memcg->move_lock, flags); + } rcu_read_unlock(); } diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 6f4335238e33..fb71e9deca85 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2308,12 +2308,10 @@ EXPORT_SYMBOL(clear_page_dirty_for_io); int test_clear_page_writeback(struct page *page) { struct address_space *mapping = page_mapping(page); - unsigned long memcg_flags; struct mem_cgroup *memcg; - bool locked; int ret; - memcg = mem_cgroup_begin_page_stat(page, &locked, &memcg_flags); + memcg = mem_cgroup_begin_page_stat(page); if (mapping) { struct backing_dev_info *bdi = mapping->backing_dev_info; unsigned long flags; @@ -2338,19 +2336,17 @@ int test_clear_page_writeback(struct page *page) dec_zone_page_state(page, NR_WRITEBACK); inc_zone_page_state(page, NR_WRITTEN); } - mem_cgroup_end_page_stat(memcg, &locked, &memcg_flags); + mem_cgroup_end_page_stat(memcg); return ret; } int __test_set_page_writeback(struct page *page, bool keep_write) { struct address_space *mapping = page_mapping(page); - unsigned long memcg_flags; struct mem_cgroup *memcg; - bool locked; int ret; - memcg = mem_cgroup_begin_page_stat(page, &locked, &memcg_flags); + memcg = mem_cgroup_begin_page_stat(page); if (mapping) { struct backing_dev_info *bdi = mapping->backing_dev_info; unsigned long flags; @@ -2380,7 +2376,7 @@ int __test_set_page_writeback(struct page *page, bool keep_write) mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_WRITEBACK); inc_zone_page_state(page, NR_WRITEBACK); } - mem_cgroup_end_page_stat(memcg, &locked, &memcg_flags); + mem_cgroup_end_page_stat(memcg); return ret; } diff --git a/mm/rmap.c b/mm/rmap.c index 70b32498d4f2..5e3e09081164 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1085,24 +1085,20 @@ void page_add_new_anon_rmap(struct page *page, void page_add_file_rmap(struct page *page) { struct mem_cgroup *memcg; - unsigned long flags; - bool locked; - memcg = mem_cgroup_begin_page_stat(page, &locked, 
&flags); + memcg = mem_cgroup_begin_page_stat(page); if (atomic_inc_and_test(&page->_mapcount)) { __inc_zone_page_state(page, NR_FILE_MAPPED); mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED); } - mem_cgroup_end_page_stat(memcg, &locked, &flags); + mem_cgroup_end_page_stat(memcg); } static void page_remove_file_rmap(struct page *page) { struct mem_cgroup *memcg; - unsigned long flags; - bool locked; - memcg = mem_cgroup_begin_page_stat(page, &locked, &flags); + memcg = mem_cgroup_begin_page_stat(page); /* page still mapped by someone else? */ if (!atomic_add_negative(-1, &page->_mapcount)) @@ -1123,7 +1119,7 @@ static void page_remove_file_rmap(struct page *page) if (unlikely(PageMlocked(page))) clear_page_mlock(page); out: - mem_cgroup_end_page_stat(memcg, &locked, &flags); + mem_cgroup_end_page_stat(memcg); } /** -- cgit v1.2.3 From 889630241f7a207a47181e32f5fcfc9aaced35bb Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Fri, 30 Jan 2015 13:11:35 +1100 Subject: mm: memcontrol: track move_lock state internally fix Update the !CONFIG_MEMCG page stat dummy API. Signed-off-by: Johannes Weiner Reported-by: Wu Fengguang Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 04d3c2028782..76b4084b8d08 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -283,14 +283,12 @@ mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) { } -static inline struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page, - bool *locked, unsigned long *flags) +static inline struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page) { return NULL; } -static inline void mem_cgroup_end_page_stat(struct mem_cgroup *memcg, - bool *locked, unsigned long *flags) +static inline void mem_cgroup_end_page_stat(struct mem_cgroup *memcg) { } -- cgit v1.2.3 From f943d5b8698645d7967b9b548456aa3aa8a24225 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Fri, 30 Jan 2015 13:11:35 +1100 Subject: mm: fix typo of MIGRATE_RESERVE in comment Found it when I want to jump to the definition of MIGRATE_RESERVE ctags. Signed-off-by: Baoquan He Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 2f0856d14b21..b41829701334 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -426,7 +426,7 @@ struct zone { const char *name; /* - * Number of MIGRATE_RESEVE page block. To maintain for just + * Number of MIGRATE_RESERVE page block. To maintain for just * optimization. Protected by zone->lock. */ int nr_migrate_reserve_block; -- cgit v1.2.3 From 710e0faa96dbd341c4f0fb29621744f79f03e127 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Fri, 30 Jan 2015 13:11:36 +1100 Subject: mm/hugetlb: take page table lock in follow_huge_pmd() We have a race condition between move_pages() and freeing hugepages, where move_pages() calls follow_page(FOLL_GET) for hugepages internally and tries to get its refcount without preventing concurrent freeing. This race crashes the kernel, so this patch fixes it by moving FOLL_GET code for hugepages into follow_huge_pmd() with taking the page table lock. This patch intentionally removes page==NULL check after pte_page. This is justified because pte_page() never returns NULL for any architectures or configurations. 
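Condensed, the pattern moved into follow_huge_pmd() looks as follows (a sketch only; the pmd_huge() recheck and the migration-entry handling are in the full hunk further down):

	ptl = pmd_lockptr(mm, pmd);
	spin_lock(ptl);
	if (pmd_present(*pmd)) {
		page = pte_page(*(pte_t *)pmd) +
			((address & ~PMD_MASK) >> PAGE_SHIFT);
		if (flags & FOLL_GET)
			get_page(page);	/* ref taken before the hugepage can be freed */
	}
	spin_unlock(ptl);
	return page;
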
This patch changes the behavior of follow_huge_pmd() for tail pages and then tail pages can be pinned/returned. So the caller must be changed to properly handle the returned tail pages. We could have a choice to add the similar locking to follow_huge_(addr|pud) for consistency, but it's not necessary because currently these functions don't support FOLL_GET flag, so let's leave it for future development. Here is the reproducer: $ cat movepages.c #include #include #include #define ADDR_INPUT 0x700000000000UL #define HPS 0x200000 #define PS 0x1000 int main(int argc, char *argv[]) { int i; int nr_hp = strtol(argv[1], NULL, 0); int nr_p = nr_hp * HPS / PS; int ret; void **addrs; int *status; int *nodes; pid_t pid; pid = strtol(argv[2], NULL, 0); addrs = malloc(sizeof(char *) * nr_p + 1); status = malloc(sizeof(char *) * nr_p + 1); nodes = malloc(sizeof(char *) * nr_p + 1); while (1) { for (i = 0; i < nr_p; i++) { addrs[i] = (void *)ADDR_INPUT + i * PS; nodes[i] = 1; status[i] = 0; } ret = numa_move_pages(pid, nr_p, addrs, nodes, status, MPOL_MF_MOVE_ALL); if (ret == -1) err("move_pages"); for (i = 0; i < nr_p; i++) { addrs[i] = (void *)ADDR_INPUT + i * PS; nodes[i] = 0; status[i] = 0; } ret = numa_move_pages(pid, nr_p, addrs, nodes, status, MPOL_MF_MOVE_ALL); if (ret == -1) err("move_pages"); } return 0; } $ cat hugepage.c #include #include #include #define ADDR_INPUT 0x700000000000UL #define HPS 0x200000 int main(int argc, char *argv[]) { int nr_hp = strtol(argv[1], NULL, 0); char *p; while (1) { p = mmap((void *)ADDR_INPUT, nr_hp * HPS, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0); if (p != (void *)ADDR_INPUT) { perror("mmap"); break; } memset(p, 0, nr_hp * HPS); munmap(p, nr_hp * HPS); } } $ sysctl vm.nr_hugepages=40 $ ./hugepage 10 & $ ./movepages 10 $(pgrep -f hugepage) Fixes: e632a938d914 ("mm: migrate: add hugepage migration code to move_pages()") Signed-off-by: Naoya Horiguchi Reported-by: Hugh Dickins Cc: James Hogan Cc: David Rientjes Cc: Mel Gorman Cc: Johannes Weiner Cc: Michal Hocko Cc: Rik van Riel Cc: Andrea Arcangeli Cc: Luiz Capitulino Cc: Nishanth Aravamudan Cc: Lee Schermerhorn Cc: Steve Capper Cc: [3.12+] Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 8 ++++---- include/linux/swapops.h | 4 ++++ mm/gup.c | 25 ++++++++----------------- mm/hugetlb.c | 48 ++++++++++++++++++++++++++++++++++-------------- mm/migrate.c | 5 +++-- 5 files changed, 53 insertions(+), 37 deletions(-) (limited to 'include') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 7d7856359920..7b5785032049 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -99,9 +99,9 @@ int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep); struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address, int write); struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address, - pmd_t *pmd, int write); + pmd_t *pmd, int flags); struct page *follow_huge_pud(struct mm_struct *mm, unsigned long address, - pud_t *pud, int write); + pud_t *pud, int flags); int pmd_huge(pmd_t pmd); int pud_huge(pud_t pmd); unsigned long hugetlb_change_protection(struct vm_area_struct *vma, @@ -133,8 +133,8 @@ static inline void hugetlb_report_meminfo(struct seq_file *m) static inline void hugetlb_show_meminfo(void) { } -#define follow_huge_pmd(mm, addr, pmd, write) NULL -#define follow_huge_pud(mm, addr, pud, write) NULL +#define follow_huge_pmd(mm, addr, pmd, flags) NULL +#define follow_huge_pud(mm, addr, pud, flags) NULL 
#define prepare_hugepage_range(file, addr, len) (-EINVAL) #define pmd_huge(x) 0 #define pud_huge(x) 0 diff --git a/include/linux/swapops.h b/include/linux/swapops.h index 50cbc876be56..831a3168ab35 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -135,6 +135,8 @@ static inline void make_migration_entry_read(swp_entry_t *entry) *entry = swp_entry(SWP_MIGRATION_READ, swp_offset(*entry)); } +extern void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep, + spinlock_t *ptl); extern void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, unsigned long address); extern void migration_entry_wait_huge(struct vm_area_struct *vma, @@ -148,6 +150,8 @@ static inline int is_migration_entry(swp_entry_t swp) } #define migration_entry_to_page(swp) NULL static inline void make_migration_entry_read(swp_entry_t *entryp) { } +static inline void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep, + spinlock_t *ptl) { } static inline void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, unsigned long address) { } static inline void migration_entry_wait_huge(struct vm_area_struct *vma, diff --git a/mm/gup.c b/mm/gup.c index 12bc2bc33da7..1a8ab05918e0 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -167,10 +167,10 @@ struct page *follow_page_mask(struct vm_area_struct *vma, if (pud_none(*pud)) return no_page_table(vma, flags); if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) { - if (flags & FOLL_GET) - return NULL; - page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE); - return page; + page = follow_huge_pud(mm, address, pud, flags); + if (page) + return page; + return no_page_table(vma, flags); } if (unlikely(pud_bad(*pud))) return no_page_table(vma, flags); @@ -179,19 +179,10 @@ struct page *follow_page_mask(struct vm_area_struct *vma, if (pmd_none(*pmd)) return no_page_table(vma, flags); if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) { - page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE); - if (flags & FOLL_GET) { - /* - * Refcount on tail pages are not well-defined and - * shouldn't be taken. The caller should handle a NULL - * return when trying to follow tail pages. - */ - if (PageHead(page)) - get_page(page); - else - page = NULL; - } - return page; + page = follow_huge_pmd(mm, address, pmd, flags); + if (page) + return page; + return no_page_table(vma, flags); } if ((flags & FOLL_NUMA) && pmd_numa(*pmd)) return no_page_table(vma, flags); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index d96b8bfa748f..5aca3707450f 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3675,28 +3675,48 @@ follow_huge_addr(struct mm_struct *mm, unsigned long address, struct page * __weak follow_huge_pmd(struct mm_struct *mm, unsigned long address, - pmd_t *pmd, int write) + pmd_t *pmd, int flags) { - struct page *page; - - if (!pmd_present(*pmd)) - return NULL; - page = pte_page(*(pte_t *)pmd); - if (page) - page += ((address & ~PMD_MASK) >> PAGE_SHIFT); + struct page *page = NULL; + spinlock_t *ptl; +retry: + ptl = pmd_lockptr(mm, pmd); + spin_lock(ptl); + /* + * make sure that the address range covered by this pmd is not + * unmapped from other threads. + */ + if (!pmd_huge(*pmd)) + goto out; + if (pmd_present(*pmd)) { + page = pte_page(*(pte_t *)pmd) + + ((address & ~PMD_MASK) >> PAGE_SHIFT); + if (flags & FOLL_GET) + get_page(page); + } else { + if (is_hugetlb_entry_migration(huge_ptep_get((pte_t *)pmd))) { + spin_unlock(ptl); + __migration_entry_wait(mm, (pte_t *)pmd, ptl); + goto retry; + } + /* + * hwpoisoned entry is treated as no_page_table in + * follow_page_mask(). 
+ */ + } +out: + spin_unlock(ptl); return page; } struct page * __weak follow_huge_pud(struct mm_struct *mm, unsigned long address, - pud_t *pud, int write) + pud_t *pud, int flags) { - struct page *page; + if (flags & FOLL_GET) + return NULL; - page = pte_page(*(pte_t *)pud); - if (page) - page += ((address & ~PUD_MASK) >> PAGE_SHIFT); - return page; + return pte_page(*(pte_t *)pud) + ((address & ~PUD_MASK) >> PAGE_SHIFT); } #ifdef CONFIG_MEMORY_FAILURE diff --git a/mm/migrate.c b/mm/migrate.c index 6e284bcca8bb..f98067e5d353 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -197,7 +197,7 @@ static void remove_migration_ptes(struct page *old, struct page *new) * get to the page and wait until migration is finished. * When we return from this function the fault will be retried. */ -static void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep, +void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep, spinlock_t *ptl) { pte_t pte; @@ -1236,7 +1236,8 @@ static int do_move_page_to_node_array(struct mm_struct *mm, goto put_and_set; if (PageHuge(page)) { - isolate_huge_page(page, &pagelist); + if (PageHead(page)) + isolate_huge_page(page, &pagelist); goto put_and_set; } -- cgit v1.2.3 From cadf6d07093a905bba9a24946203a7cd52854db3 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 30 Jan 2015 13:11:38 +1100 Subject: mm: numa: do not dereference pmd outside of the lock during NUMA hinting fault Automatic NUMA balancing depends on being able to protect PTEs to trap a fault and gather reference locality information. Very broadly speaking it would mark PTEs as not present and use another bit to distinguish between NUMA hinting faults and other types of faults. It was universally loved by everybody and caused no problems whatsoever. That last sentence might be a lie. This series is very heavily based on patches from Linus and Aneesh to replace the existing PTE/PMD NUMA helper functions with normal change protections. I did alter and add parts of it but I consider them relatively minor contributions. At their suggestion, acked-bys are in there but I've no problem converting them to Signed-off-by if requested. AFAIK, this has received no testing on ppc64 and I'm depending on Aneesh for that. I tested trinity under kvm-tool and passed and ran a few other basic tests. At the time of writing, only the short-lived tests have completed but testing of V2 indicated that long-term testing had no surprises. In most cases I'm leaving out detail as it's not that interesting. specjbb single JVM: There was negligible performance difference in the benchmark itself for short runs. However, system activity is higher and interrupts are much higher over time -- possibly TLB flushes. Migrations are also higher. Overall, this is more overhead but considering the problems faced with the old approach I think we just have to suck it up and find another way of reducing the overhead. specjbb multi JVM: Negligible performance difference to the actual benchmark but like the single JVM case, the system overhead is noticeably higher. Again, interrupts are a major factor. autonumabench: This was all over the place and about all that can be reasonably concluded is that it's different but not necessarily better or worse. 
autonumabench 3.18.0-rc5 3.18.0-rc5 mmotm-20141119 protnone-v3r3 User NUMA01 32380.24 ( 0.00%) 21642.92 ( 33.16%) User NUMA01_THEADLOCAL 22481.02 ( 0.00%) 22283.22 ( 0.88%) User NUMA02 3137.00 ( 0.00%) 3116.54 ( 0.65%) User NUMA02_SMT 1614.03 ( 0.00%) 1543.53 ( 4.37%) System NUMA01 322.97 ( 0.00%) 1465.89 (-353.88%) System NUMA01_THEADLOCAL 91.87 ( 0.00%) 49.32 ( 46.32%) System NUMA02 37.83 ( 0.00%) 14.61 ( 61.38%) System NUMA02_SMT 7.36 ( 0.00%) 7.45 ( -1.22%) Elapsed NUMA01 716.63 ( 0.00%) 599.29 ( 16.37%) Elapsed NUMA01_THEADLOCAL 553.98 ( 0.00%) 539.94 ( 2.53%) Elapsed NUMA02 83.85 ( 0.00%) 83.04 ( 0.97%) Elapsed NUMA02_SMT 86.57 ( 0.00%) 79.15 ( 8.57%) CPU NUMA01 4563.00 ( 0.00%) 3855.00 ( 15.52%) CPU NUMA01_THEADLOCAL 4074.00 ( 0.00%) 4136.00 ( -1.52%) CPU NUMA02 3785.00 ( 0.00%) 3770.00 ( 0.40%) CPU NUMA02_SMT 1872.00 ( 0.00%) 1959.00 ( -4.65%) System CPU usage of NUMA01 is worse but it's an adverse workload on this machine so I'm reluctant to conclude that it's a problem that matters. On the other workloads that are sensible on this machine, system CPU usage is great. Overall time to complete the benchmark is comparable 3.18.0-rc5 3.18.0-rc5 mmotm-20141119protnone-v3r3 User 59612.50 48586.44 System 460.22 1537.45 Elapsed 1442.20 1304.29 NUMA alloc hit 5075182 5743353 NUMA alloc miss 0 0 NUMA interleave hit 0 0 NUMA alloc local 5075174 5743339 NUMA base PTE updates 637061448 443106883 NUMA huge PMD updates 1243434 864747 NUMA page range updates 1273699656 885857347 NUMA hint faults 1658116 1214277 NUMA hint local faults 959487 754113 NUMA hint local percent 57 62 NUMA pages migrated 5467056 61676398 The NUMA pages migrated look terrible but when I looked at a graph of the activity over time I see that the massive spike in migration activity was during NUMA01. This correlates with high system CPU usage and could be simply down to bad luck but any modifications that affect that workload would be related to scan rates and migrations, not the protection mechanism. For all other workloads, migration activity was comparable. Overall, headline performance figures are comparable but the overhead is higher, mostly in interrupts. To some extent, higher overhead from this approach was anticipated but not to this degree. It's going to be necessary to reduce this again with a separate series in the future. It's still worth going ahead with this series though as it's likely to avoid constant headaches with Xen and is probably easier to maintain. This patch (of 10): A transhuge NUMA hinting fault may find the page is migrating and should wait until migration completes. The check is race-prone because the pmd is deferenced outside of the page lock and while the race is tiny, it'll be larger if the PMD is cleared while marking PMDs for hinting fault. This patch closes the race. 
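Condensed, the fix is to pick up the page while the page table lock is still held and to touch only the page afterwards (this mirrors the do_huge_pmd_numa_page() hunk below):

	if (unlikely(pmd_trans_migrating(*pmdp))) {
		page = pmd_page(*pmdp);		/* dereference under ptl */
		spin_unlock(ptl);
		wait_on_page_locked(page);	/* no further *pmdp access */
		goto out;
	}

wait_migrate_huge_page(), which re-read *pmd after the lock was dropped, becomes unnecessary and is removed.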
Signed-off-by: Mel Gorman Cc: Aneesh Kumar K.V Cc: Benjamin Herrenschmidt Cc: Dave Jones Cc: Hugh Dickins Cc: Ingo Molnar Cc: Kirill Shutemov Cc: Linus Torvalds Cc: Paul Mackerras Cc: Rik van Riel Cc: Sasha Levin Signed-off-by: Andrew Morton --- include/linux/migrate.h | 4 ---- mm/huge_memory.c | 3 ++- mm/migrate.c | 6 ------ 3 files changed, 2 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/linux/migrate.h b/include/linux/migrate.h index fab9b32ace8e..78baed5f2952 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -67,7 +67,6 @@ static inline int migrate_huge_page_move_mapping(struct address_space *mapping, #ifdef CONFIG_NUMA_BALANCING extern bool pmd_trans_migrating(pmd_t pmd); -extern void wait_migrate_huge_page(struct anon_vma *anon_vma, pmd_t *pmd); extern int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma, int node); extern bool migrate_ratelimited(int node); @@ -76,9 +75,6 @@ static inline bool pmd_trans_migrating(pmd_t pmd) { return false; } -static inline void wait_migrate_huge_page(struct anon_vma *anon_vma, pmd_t *pmd) -{ -} static inline int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma, int node) { diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 889713180980..49c3d6704af2 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1278,8 +1278,9 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, * check_same as the page may no longer be mapped. */ if (unlikely(pmd_trans_migrating(*pmdp))) { + page = pmd_page(*pmdp); spin_unlock(ptl); - wait_migrate_huge_page(vma->anon_vma, pmdp); + wait_on_page_locked(page); goto out; } diff --git a/mm/migrate.c b/mm/migrate.c index f98067e5d353..5e8f03a8de2a 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1654,12 +1654,6 @@ bool pmd_trans_migrating(pmd_t pmd) return PageLocked(page); } -void wait_migrate_huge_page(struct anon_vma *anon_vma, pmd_t *pmd) -{ - struct page *page = pmd_page(*pmd); - wait_on_page_locked(page); -} - /* * Attempt to migrate a misplaced page to the specified destination * node. Caller is expected to have an elevated reference count on -- cgit v1.2.3 From ecfca2430307c7c9f1e6dfe8c8ea3c8468956035 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 30 Jan 2015 13:11:38 +1100 Subject: mm: add p[te|md] protnone helpers for use by NUMA balancing This is a preparatory patch that introduces protnone helpers for automatic NUMA balancing. Signed-off-by: Mel Gorman Acked-by: Linus Torvalds Acked-by: Aneesh Kumar K.V Tested-by: Sasha Levin Cc: Benjamin Herrenschmidt Cc: Dave Jones Cc: Hugh Dickins Cc: Ingo Molnar Cc: Kirill Shutemov Cc: Paul Mackerras Cc: Rik van Riel Signed-off-by: Andrew Morton --- arch/powerpc/include/asm/pgtable.h | 16 ++++++++++++++++ arch/x86/include/asm/pgtable.h | 16 ++++++++++++++++ include/asm-generic/pgtable.h | 20 ++++++++++++++++++++ 3 files changed, 52 insertions(+) (limited to 'include') diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h index 48c9a50e1151..e192686b3c6a 100644 --- a/arch/powerpc/include/asm/pgtable.h +++ b/arch/powerpc/include/asm/pgtable.h @@ -38,6 +38,22 @@ static inline int pte_none(pte_t pte) { return (pte_val(pte) & ~_PTE_NONE_MASK) static inline pgprot_t pte_pgprot(pte_t pte) { return __pgprot(pte_val(pte) & PAGE_PROT_BITS); } #ifdef CONFIG_NUMA_BALANCING +/* + * These work without NUMA balancing but the kernel does not care. See the + * comment in include/asm-generic/pgtable.h . 
On powerpc, this will only + * work for user pages and always return true for kernel pages. + */ +static inline int pte_protnone(pte_t pte) +{ + return (pte_val(pte) & + (_PAGE_PRESENT | _PAGE_USER)) == _PAGE_PRESENT; +} + +static inline int pmd_protnone(pmd_t pmd) +{ + return pte_protnone(pmd_pte(pmd)); +} + static inline int pte_present(pte_t pte) { return pte_val(pte) & _PAGE_NUMA_MASK; diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 0fe03f834fb1..f519b0b529dd 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -483,6 +483,22 @@ static inline int pmd_present(pmd_t pmd) _PAGE_NUMA); } +#ifdef CONFIG_NUMA_BALANCING +/* + * These work without NUMA balancing but the kernel does not care. See the + * comment in include/asm-generic/pgtable.h + */ +static inline int pte_protnone(pte_t pte) +{ + return pte_flags(pte) & _PAGE_PROTNONE; +} + +static inline int pmd_protnone(pmd_t pmd) +{ + return pmd_flags(pmd) & _PAGE_PROTNONE; +} +#endif /* CONFIG_NUMA_BALANCING */ + static inline int pmd_none(pmd_t pmd) { /* Only check low word on 32-bit platforms, since it might be diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index 129de9204d18..067922c06c29 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -673,6 +673,26 @@ static inline int pmd_trans_unstable(pmd_t *pmd) #endif } +#ifndef CONFIG_NUMA_BALANCING +/* + * Technically a PTE can be PROTNONE even when not doing NUMA balancing but + * the only case the kernel cares is for NUMA balancing and is only ever set + * when the VMA is accessible. For PROT_NONE VMAs, the PTEs are not marked + * _PAGE_PROTNONE so by by default, implement the helper as "always no". It + * is the responsibility of the caller to distinguish between PROT_NONE + * protections and NUMA hinting fault protections. + */ +static inline int pte_protnone(pte_t pte) +{ + return 0; +} + +static inline int pmd_protnone(pmd_t pmd) +{ + return 0; +} +#endif /* CONFIG_NUMA_BALANCING */ + #ifdef CONFIG_NUMA_BALANCING /* * _PAGE_NUMA distinguishes between an unmapped page table entry, an entry that -- cgit v1.2.3 From 41037e40ec1d84cc1457d721f5d6f4ec214f23ac Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 30 Jan 2015 13:11:38 +1100 Subject: mm: convert p[te|md]_numa users to p[te|md]_protnone_numa Convert existing users of pte_numa and friends to the new helper. Note that the kernel is broken after this patch is applied until the other page table modifiers are also altered. This patch layout is to make review easier. 
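A representative conversion, shown as a sketch (taken from the mm/gup.c hunk below; surrounding context omitted):

	/* before: NUMA hinting entries were special-cased via _PAGE_NUMA */
	if ((flags & FOLL_NUMA) && pte_numa(pte))
		goto no_page;

	/* after: the same entries are ordinary PROT_NONE protections */
	if ((flags & FOLL_NUMA) && pte_protnone(pte))
		goto no_page;
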
Signed-off-by: Mel Gorman Acked-by: Linus Torvalds Acked-by: Aneesh Kumar Acked-by: Benjamin Herrenschmidt Tested-by: Sasha Levin Cc: Dave Jones Cc: Hugh Dickins Cc: Ingo Molnar Cc: Kirill Shutemov Cc: Paul Mackerras Cc: Rik van Riel Cc: Sasha Levin Signed-off-by: Andrew Morton --- arch/powerpc/kvm/book3s_hv_rm_mmu.c | 2 +- arch/powerpc/mm/fault.c | 5 ----- arch/powerpc/mm/pgtable.c | 11 ++++++++--- arch/powerpc/mm/pgtable_64.c | 3 ++- arch/x86/mm/gup.c | 4 ++-- include/uapi/linux/mempolicy.h | 2 +- mm/gup.c | 10 +++++----- mm/huge_memory.c | 16 ++++++++-------- mm/memory.c | 4 ++-- mm/mprotect.c | 38 ++++++++++--------------------------- mm/pgtable-generic.c | 2 +- 11 files changed, 40 insertions(+), 57 deletions(-) (limited to 'include') diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c index 510bdfbc4073..625407e4d3b0 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c @@ -212,7 +212,7 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags, /* Look up the Linux PTE for the backing page */ pte_size = psize; pte = lookup_linux_pte_and_update(pgdir, hva, writing, &pte_size); - if (pte_present(pte) && !pte_numa(pte)) { + if (pte_present(pte) && !pte_protnone(pte)) { if (writing && !pte_write(pte)) /* make the actual HPTE be read-only */ ptel = hpte_make_readonly(ptel); diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index 6154b0a2b063..f38327b95f76 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -398,8 +398,6 @@ good_area: * processors use the same I/D cache coherency mechanism * as embedded. */ - if (error_code & DSISR_PROTFAULT) - goto bad_area; #endif /* CONFIG_PPC_STD_MMU */ /* @@ -423,9 +421,6 @@ good_area: flags |= FAULT_FLAG_WRITE; /* a read */ } else { - /* protection fault */ - if (error_code & 0x08000000) - goto bad_area; if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))) goto bad_area; } diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c index c90e602677c9..83dfcb55ffef 100644 --- a/arch/powerpc/mm/pgtable.c +++ b/arch/powerpc/mm/pgtable.c @@ -172,9 +172,14 @@ static pte_t set_access_flags_filter(pte_t pte, struct vm_area_struct *vma, void set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) { -#ifdef CONFIG_DEBUG_VM - WARN_ON(pte_val(*ptep) & _PAGE_PRESENT); -#endif + /* + * When handling numa faults, we already have the pte marked + * _PAGE_PRESENT, but we can be sure that it is not in hpte. + * Hence we can use set_pte_at for them. + */ + VM_WARN_ON((pte_val(*ptep) & (_PAGE_PRESENT | _PAGE_USER)) == + (_PAGE_PRESENT | _PAGE_USER)); + /* Note: mm->context.id might not yet have been assigned as * this context might not have been activated yet when this * is called. 
diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c index 8526c5896c94..6957cc1ca0a7 100644 --- a/arch/powerpc/mm/pgtable_64.c +++ b/arch/powerpc/mm/pgtable_64.c @@ -718,7 +718,8 @@ void set_pmd_at(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp, pmd_t pmd) { #ifdef CONFIG_DEBUG_VM - WARN_ON(pmd_val(*pmdp) & _PAGE_PRESENT); + WARN_ON((pmd_val(*pmdp) & (_PAGE_PRESENT | _PAGE_USER)) == + (_PAGE_PRESENT | _PAGE_USER)); assert_spin_locked(&mm->page_table_lock); WARN_ON(!pmd_trans_huge(pmd)); #endif diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c index 224b14235e96..03740114a1f5 100644 --- a/arch/x86/mm/gup.c +++ b/arch/x86/mm/gup.c @@ -84,7 +84,7 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr, struct page *page; /* Similar to the PMD case, NUMA hinting must take slow path */ - if (pte_numa(pte)) { + if (pte_protnone(pte)) { pte_unmap(ptep); return 0; } @@ -178,7 +178,7 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end, * slowpath for accounting purposes and so that they * can be serialised against THP migration. */ - if (pmd_numa(pmd)) + if (pmd_protnone(pmd)) return 0; if (!gup_huge_pmd(pmd, addr, next, write, pages, nr)) return 0; diff --git a/include/uapi/linux/mempolicy.h b/include/uapi/linux/mempolicy.h index 0d11c3dcd3a1..9cd8b21dddbe 100644 --- a/include/uapi/linux/mempolicy.h +++ b/include/uapi/linux/mempolicy.h @@ -67,7 +67,7 @@ enum mpol_rebind_step { #define MPOL_F_LOCAL (1 << 1) /* preferred local allocation */ #define MPOL_F_REBINDING (1 << 2) /* identify policies in rebinding */ #define MPOL_F_MOF (1 << 3) /* this policy wants migrate on fault */ -#define MPOL_F_MORON (1 << 4) /* Migrate On pte_numa Reference On Node */ +#define MPOL_F_MORON (1 << 4) /* Migrate On protnone Reference On Node */ #endif /* _UAPI_LINUX_MEMPOLICY_H */ diff --git a/mm/gup.c b/mm/gup.c index 1a8ab05918e0..779b2f3a69fe 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -64,7 +64,7 @@ retry: migration_entry_wait(mm, pmd, address); goto retry; } - if ((flags & FOLL_NUMA) && pte_numa(pte)) + if ((flags & FOLL_NUMA) && pte_protnone(pte)) goto no_page; if ((flags & FOLL_WRITE) && !pte_write(pte)) { pte_unmap_unlock(ptep, ptl); @@ -184,7 +184,7 @@ struct page *follow_page_mask(struct vm_area_struct *vma, return page; return no_page_table(vma, flags); } - if ((flags & FOLL_NUMA) && pmd_numa(*pmd)) + if ((flags & FOLL_NUMA) && pmd_protnone(*pmd)) return no_page_table(vma, flags); if (pmd_trans_huge(*pmd)) { if (flags & FOLL_SPLIT) { @@ -731,10 +731,10 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, /* * Similar to the PMD case below, NUMA hinting must take slow - * path + * path using the pte_protnone check. */ if (!pte_present(pte) || pte_special(pte) || - pte_numa(pte) || (write && !pte_write(pte))) + pte_protnone(pte) || (write && !pte_write(pte))) goto pte_unmap; VM_BUG_ON(!pfn_valid(pte_pfn(pte))); @@ -929,7 +929,7 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end, * slowpath for accounting purposes and so that they * can be serialised against THP migration. 
*/ - if (pmd_numa(pmd)) + if (pmd_protnone(pmd)) return 0; if (!gup_huge_pmd(pmd, pmdp, addr, next, write, diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 49c3d6704af2..a56c32f111ae 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1217,7 +1217,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, return ERR_PTR(-EFAULT); /* Full NUMA hinting faults to serialise migration in fault paths */ - if ((flags & FOLL_NUMA) && pmd_numa(*pmd)) + if ((flags & FOLL_NUMA) && pmd_protnone(*pmd)) goto out; page = pmd_page(*pmd); @@ -1348,7 +1348,7 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, /* * Migrate the THP to the requested node, returns with page unlocked - * and pmd_numa cleared. + * and access rights restored. */ spin_unlock(ptl); migrated = migrate_misplaced_transhuge_page(mm, vma, @@ -1363,7 +1363,7 @@ clear_pmdnuma: BUG_ON(!PageLocked(page)); pmd = pmd_mknonnuma(pmd); set_pmd_at(mm, haddr, pmdp, pmd); - VM_BUG_ON(pmd_numa(*pmdp)); + VM_BUG_ON(pmd_protnone(*pmdp)); update_mmu_cache_pmd(vma, addr, pmdp); unlock_page(page); out_unlock: @@ -1509,7 +1509,7 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, ret = 1; if (!prot_numa) { entry = pmdp_get_and_clear_notify(mm, addr, pmd); - if (pmd_numa(entry)) + if (pmd_protnone(entry)) entry = pmd_mknonnuma(entry); entry = pmd_modify(entry, newprot); ret = HPAGE_PMD_NR; @@ -1525,7 +1525,7 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, * local vs remote hits on the zero page. */ if (!is_huge_zero_page(page) && - !pmd_numa(*pmd)) { + !pmd_protnone(*pmd)) { pmdp_set_numa(mm, addr, pmd); ret = HPAGE_PMD_NR; } @@ -1793,9 +1793,9 @@ static int __split_huge_page_map(struct page *page, pte_t *pte, entry; BUG_ON(PageCompound(page+i)); /* - * Note that pmd_numa is not transferred deliberately - * to avoid any possibility that pte_numa leaks to - * a PROT_NONE VMA by accident. + * Note that NUMA hinting access restrictions are not + * transferred to avoid any possibility of altering + * permissions across VMAs. */ entry = mk_pte(page + i, vma->vm_page_prot); entry = maybe_mkwrite(pte_mkdirty(entry), vma); diff --git a/mm/memory.c b/mm/memory.c index 988d3099a25d..a7bba6bc161a 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3121,7 +3121,7 @@ static int handle_pte_fault(struct mm_struct *mm, pte, pmd, flags, entry); } - if (pte_numa(entry)) + if (pte_protnone(entry)) return do_numa_page(mm, vma, address, entry, pte, pmd); ptl = pte_lockptr(mm, pmd); @@ -3199,7 +3199,7 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (pmd_trans_splitting(orig_pmd)) return 0; - if (pmd_numa(orig_pmd)) + if (pmd_protnone(orig_pmd)) return do_huge_pmd_numa_page(mm, vma, address, orig_pmd, pmd); diff --git a/mm/mprotect.c b/mm/mprotect.c index 33121662f08b..44ffa698484d 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -75,36 +75,18 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, oldpte = *pte; if (pte_present(oldpte)) { pte_t ptent; - bool updated = false; - if (!prot_numa) { - ptent = ptep_modify_prot_start(mm, addr, pte); - if (pte_numa(ptent)) - ptent = pte_mknonnuma(ptent); - ptent = pte_modify(ptent, newprot); - /* - * Avoid taking write faults for pages we - * know to be dirty. 
- */ - if (dirty_accountable && pte_dirty(ptent) && - (pte_soft_dirty(ptent) || - !(vma->vm_flags & VM_SOFTDIRTY))) - ptent = pte_mkwrite(ptent); - ptep_modify_prot_commit(mm, addr, pte, ptent); - updated = true; - } else { - struct page *page; - - page = vm_normal_page(vma, addr, oldpte); - if (page && !PageKsm(page)) { - if (!pte_numa(oldpte)) { - ptep_set_numa(mm, addr, pte); - updated = true; - } - } + ptent = ptep_modify_prot_start(mm, addr, pte); + ptent = pte_modify(ptent, newprot); + + /* Avoid taking write faults for known dirty pages */ + if (dirty_accountable && pte_dirty(ptent) && + (pte_soft_dirty(ptent) || + !(vma->vm_flags & VM_SOFTDIRTY))) { + ptent = pte_mkwrite(ptent); } - if (updated) - pages++; + ptep_modify_prot_commit(mm, addr, pte, ptent); + pages++; } else if (IS_ENABLED(CONFIG_MIGRATION)) { swp_entry_t entry = pte_to_swp_entry(oldpte); diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index dfb79e028ecb..4b8ad760dde3 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -193,7 +193,7 @@ void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { pmd_t entry = *pmdp; - if (pmd_numa(entry)) + if (pmd_protnone(entry)) entry = pmd_mknonnuma(entry); set_pmd_at(vma->vm_mm, address, pmdp, pmd_mknotpresent(entry)); flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); -- cgit v1.2.3 From 76b869d160d7b0dfa88364eddaac7f9a6820d1a1 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 30 Jan 2015 13:11:39 +1100 Subject: mm: convert p[te|md]_mknonnuma and remaining page table manipulations With PROT_NONE, the traditional page table manipulation functions are sufficient. Signed-off-by: Mel Gorman Acked-by: Linus Torvalds Acked-by: Aneesh Kumar Tested-by: Sasha Levin Cc: Benjamin Herrenschmidt Cc: Dave Jones Cc: Hugh Dickins Cc: Ingo Molnar Cc: Kirill Shutemov Cc: Paul Mackerras Cc: Rik van Riel Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 3 +-- mm/huge_memory.c | 33 +++++++-------------------------- mm/memory.c | 10 ++++++---- mm/mempolicy.c | 2 +- mm/migrate.c | 2 +- mm/mprotect.c | 2 +- mm/pgtable-generic.c | 2 -- 7 files changed, 17 insertions(+), 37 deletions(-) (limited to 'include') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index f10b20f05159..062bd252e994 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -31,8 +31,7 @@ extern int move_huge_pmd(struct vm_area_struct *vma, unsigned long new_addr, unsigned long old_end, pmd_t *old_pmd, pmd_t *new_pmd); extern int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, - unsigned long addr, pgprot_t newprot, - int prot_numa); + unsigned long addr, pgprot_t newprot); enum transparent_hugepage_flag { TRANSPARENT_HUGEPAGE_FLAG, diff --git a/mm/huge_memory.c b/mm/huge_memory.c index a56c32f111ae..add87b3cf94c 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1361,9 +1361,8 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, goto out; clear_pmdnuma: BUG_ON(!PageLocked(page)); - pmd = pmd_mknonnuma(pmd); + pmd = pmd_modify(pmd, vma->vm_page_prot); set_pmd_at(mm, haddr, pmdp, pmd); - VM_BUG_ON(pmd_protnone(*pmdp)); update_mmu_cache_pmd(vma, addr, pmdp); unlock_page(page); out_unlock: @@ -1498,7 +1497,7 @@ out: * - HPAGE_PMD_NR is protections changed and TLB flush necessary */ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, - unsigned long addr, pgprot_t newprot, int prot_numa) + unsigned long addr, pgprot_t newprot) { struct mm_struct *mm = vma->vm_mm; spinlock_t *ptl; @@ -1507,29 
+1506,11 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { pmd_t entry; ret = 1; - if (!prot_numa) { - entry = pmdp_get_and_clear_notify(mm, addr, pmd); - if (pmd_protnone(entry)) - entry = pmd_mknonnuma(entry); - entry = pmd_modify(entry, newprot); - ret = HPAGE_PMD_NR; - set_pmd_at(mm, addr, pmd, entry); - BUG_ON(pmd_write(entry)); - } else { - struct page *page = pmd_page(*pmd); - - /* - * Do not trap faults against the zero page. The - * read-only data is likely to be read-cached on the - * local CPU cache and it is less useful to know about - * local vs remote hits on the zero page. - */ - if (!is_huge_zero_page(page) && - !pmd_protnone(*pmd)) { - pmdp_set_numa(mm, addr, pmd); - ret = HPAGE_PMD_NR; - } - } + entry = pmdp_get_and_clear_notify(mm, addr, pmd); + entry = pmd_modify(entry, newprot); + ret = HPAGE_PMD_NR; + set_pmd_at(mm, addr, pmd, entry); + BUG_ON(pmd_write(entry)); spin_unlock(ptl); } diff --git a/mm/memory.c b/mm/memory.c index a7bba6bc161a..1b2013b2f95e 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3015,9 +3015,9 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, * validation through pte_unmap_same(). It's of NUMA type but * the pfn may be screwed if the read is non atomic. * - * ptep_modify_prot_start is not called as this is clearing - * the _PAGE_NUMA bit and it is not really expected that there - * would be concurrent hardware modifications to the PTE. + * We can safely just do a "set_pte_at()", because the old + * page table entry is not accessible, so there would be no + * concurrent hardware modifications to the PTE. */ ptl = pte_lockptr(mm, pmd); spin_lock(ptl); @@ -3026,7 +3026,9 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, goto out; } - pte = pte_mknonnuma(pte); + /* Make it present again */ + pte = pte_modify(pte, vma->vm_page_prot); + pte = pte_mkyoung(pte); set_pte_at(mm, addr, ptep, pte); update_mmu_cache(vma, addr, ptep); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 0e0961b8c39c..4fcbf12deaa1 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -627,7 +627,7 @@ unsigned long change_prot_numa(struct vm_area_struct *vma, { int nr_updated; - nr_updated = change_protection(vma, addr, end, vma->vm_page_prot, 0, 1); + nr_updated = change_protection(vma, addr, end, PAGE_NONE, 0, 1); if (nr_updated) count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated); diff --git a/mm/migrate.c b/mm/migrate.c index 5e8f03a8de2a..85e042686031 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1847,7 +1847,7 @@ out_fail: out_dropref: ptl = pmd_lock(mm, pmd); if (pmd_same(*pmd, entry)) { - entry = pmd_mknonnuma(entry); + entry = pmd_modify(entry, vma->vm_page_prot); set_pmd_at(mm, mmun_start, pmd, entry); update_mmu_cache_pmd(vma, address, &entry); } diff --git a/mm/mprotect.c b/mm/mprotect.c index 44ffa698484d..76824d73380d 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -142,7 +142,7 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, split_huge_page_pmd(vma, addr, pmd); else { int nr_ptes = change_huge_pmd(vma, pmd, addr, - newprot, prot_numa); + newprot); if (nr_ptes) { if (nr_ptes == HPAGE_PMD_NR) { diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index 4b8ad760dde3..c25f94b33811 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -193,8 +193,6 @@ void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { pmd_t entry = *pmdp; - if (pmd_protnone(entry)) - entry = pmd_mknonnuma(entry); 
set_pmd_at(vma->vm_mm, address, pmdp, pmd_mknotpresent(entry)); flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); } -- cgit v1.2.3 From a31dbf1afcd45534e52b639045e42bad3d14ccc2 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 30 Jan 2015 13:11:39 +1100 Subject: mm: remove remaining references to NUMA hinting bits and helpers This patch removes the NUMA PTE bits and associated helpers. As a side-effect it increases the maximum possible swap space on x86-64. One potential source of problems is races between the marking of PTEs PROT_NONE, NUMA hinting faults and migration. It must be guaranteed that a PTE being protected is not faulted in parallel, seen as a pte_none and corrupting memory. The base case is safe but transhuge has problems in the past due to an different migration mechanism and a dependance on page lock to serialise migrations and warrants a closer look. task_work hinting update parallel fault ------------------------ -------------- change_pmd_range change_huge_pmd __pmd_trans_huge_lock pmdp_get_and_clear __handle_mm_fault pmd_none do_huge_pmd_anonymous_page read? pmd_lock blocks until hinting complete, fail !pmd_none test write? __do_huge_pmd_anonymous_page acquires pmd_lock, checks pmd_none pmd_modify set_pmd_at task_work hinting update parallel migration ------------------------ ------------------ change_pmd_range change_huge_pmd __pmd_trans_huge_lock pmdp_get_and_clear __handle_mm_fault do_huge_pmd_numa_page migrate_misplaced_transhuge_page pmd_lock waits for updates to complete, recheck pmd_same pmd_modify set_pmd_at Both of those are safe and the case where a transhuge page is inserted during a protection update is unchanged. The case where two processes try migrating at the same time is unchanged by this series so should still be ok. I could not find a case where we are accidentally depending on the PTE not being cleared and flushed. If one is missed, it'll manifest as corruption problems that start triggering shortly after this series is merged and only happen when NUMA balancing is enabled. 
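The end state, sketched from the earlier patches in this series rather than from this patch's removals: marking a range for hinting faults is an ordinary protection change, and detection is a protnone check.

	/* change_prot_numa(): protect with plain PAGE_NONE */
	nr_updated = change_protection(vma, addr, end, PAGE_NONE, 0, 1);

	/* handle_pte_fault(): a protnone entry is a NUMA hinting fault */
	if (pte_protnone(entry))
		return do_numa_page(mm, vma, address, entry, pte, pmd);

No _PAGE_NUMA bit, pte/pmd_numa() helper or ptep/pmdp_set_numa() primitive remains after this patch.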
Signed-off-by: Mel Gorman Tested-by: Sasha Levin Cc: Aneesh Kumar K.V Cc: Benjamin Herrenschmidt Cc: Dave Jones Cc: Hugh Dickins Cc: Ingo Molnar Cc: Kirill Shutemov Cc: Linus Torvalds Cc: Paul Mackerras Cc: Rik van Riel Signed-off-by: Andrew Morton --- arch/powerpc/include/asm/pgtable.h | 54 +----------- arch/powerpc/include/asm/pte-common.h | 5 -- arch/powerpc/include/asm/pte-hash64.h | 6 -- arch/x86/include/asm/pgtable.h | 22 +---- arch/x86/include/asm/pgtable_64.h | 5 -- arch/x86/include/asm/pgtable_types.h | 41 +-------- include/asm-generic/pgtable.h | 155 ---------------------------------- include/linux/swapops.h | 2 +- 8 files changed, 7 insertions(+), 283 deletions(-) (limited to 'include') diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h index e192686b3c6a..7aeb854f0251 100644 --- a/arch/powerpc/include/asm/pgtable.h +++ b/arch/powerpc/include/asm/pgtable.h @@ -53,64 +53,12 @@ static inline int pmd_protnone(pmd_t pmd) { return pte_protnone(pmd_pte(pmd)); } - -static inline int pte_present(pte_t pte) -{ - return pte_val(pte) & _PAGE_NUMA_MASK; -} - -#define pte_present_nonuma pte_present_nonuma -static inline int pte_present_nonuma(pte_t pte) -{ - return pte_val(pte) & (_PAGE_PRESENT); -} - -#define ptep_set_numa ptep_set_numa -static inline void ptep_set_numa(struct mm_struct *mm, unsigned long addr, - pte_t *ptep) -{ - if ((pte_val(*ptep) & _PAGE_PRESENT) == 0) - VM_BUG_ON(1); - - pte_update(mm, addr, ptep, _PAGE_PRESENT, _PAGE_NUMA, 0); - return; -} - -#define pmdp_set_numa pmdp_set_numa -static inline void pmdp_set_numa(struct mm_struct *mm, unsigned long addr, - pmd_t *pmdp) -{ - if ((pmd_val(*pmdp) & _PAGE_PRESENT) == 0) - VM_BUG_ON(1); - - pmd_hugepage_update(mm, addr, pmdp, _PAGE_PRESENT, _PAGE_NUMA); - return; -} - -/* - * Generic NUMA pte helpers expect pteval_t and pmdval_t types to exist - * which was inherited from x86. For the purposes of powerpc pte_basic_t and - * pmd_t are equivalent - */ -#define pteval_t pte_basic_t -#define pmdval_t pmd_t -static inline pteval_t ptenuma_flags(pte_t pte) -{ - return pte_val(pte) & _PAGE_NUMA_MASK; -} - -static inline pmdval_t pmdnuma_flags(pmd_t pmd) -{ - return pmd_val(pmd) & _PAGE_NUMA_MASK; -} - -# else +#endif /* CONFIG_NUMA_BALANCING */ static inline int pte_present(pte_t pte) { return pte_val(pte) & _PAGE_PRESENT; } -#endif /* CONFIG_NUMA_BALANCING */ /* Conversion functions: convert a page and protection to a page entry, * and a page entry and page directory to the page they refer to. 
diff --git a/arch/powerpc/include/asm/pte-common.h b/arch/powerpc/include/asm/pte-common.h index e040c3595129..8d1569c29042 100644 --- a/arch/powerpc/include/asm/pte-common.h +++ b/arch/powerpc/include/asm/pte-common.h @@ -98,11 +98,6 @@ extern unsigned long bad_call_to_PMD_PAGE_SIZE(void); _PAGE_USER | _PAGE_ACCESSED | \ _PAGE_RW | _PAGE_HWWRITE | _PAGE_DIRTY | _PAGE_EXEC) -#ifdef CONFIG_NUMA_BALANCING -/* Mask of bits that distinguish present and numa ptes */ -#define _PAGE_NUMA_MASK (_PAGE_NUMA|_PAGE_PRESENT) -#endif - /* * We define 2 sets of base prot bits, one for basic pages (ie, * cacheable kernel and user pages) and one for non cacheable diff --git a/arch/powerpc/include/asm/pte-hash64.h b/arch/powerpc/include/asm/pte-hash64.h index 1a66c25eeac2..fc852f7e7b3a 100644 --- a/arch/powerpc/include/asm/pte-hash64.h +++ b/arch/powerpc/include/asm/pte-hash64.h @@ -26,12 +26,6 @@ #define _PAGE_RW 0x0200 /* software: user write access allowed */ #define _PAGE_BUSY 0x0800 /* software: PTE & hash are busy */ -/* - * Used for tracking numa faults - */ -#define _PAGE_NUMA 0x00000010 /* Gather numa placement stats */ - - /* No separate kernel read-only */ #define _PAGE_KERNEL_RW (_PAGE_RW | _PAGE_DIRTY) /* user access blocked by key */ #define _PAGE_KERNEL_RO _PAGE_KERNEL_RW diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index f519b0b529dd..34d42a7d5595 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -300,7 +300,7 @@ static inline pmd_t pmd_mkwrite(pmd_t pmd) static inline pmd_t pmd_mknotpresent(pmd_t pmd) { - return pmd_clear_flags(pmd, _PAGE_PRESENT); + return pmd_clear_flags(pmd, _PAGE_PRESENT | _PAGE_PROTNONE); } #ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY @@ -442,13 +442,6 @@ static inline int pte_same(pte_t a, pte_t b) } static inline int pte_present(pte_t a) -{ - return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE | - _PAGE_NUMA); -} - -#define pte_present_nonuma pte_present_nonuma -static inline int pte_present_nonuma(pte_t a) { return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE); } @@ -459,7 +452,7 @@ static inline bool pte_accessible(struct mm_struct *mm, pte_t a) if (pte_flags(a) & _PAGE_PRESENT) return true; - if ((pte_flags(a) & (_PAGE_PROTNONE | _PAGE_NUMA)) && + if ((pte_flags(a) & _PAGE_PROTNONE) && mm_tlb_flush_pending(mm)) return true; @@ -479,8 +472,7 @@ static inline int pmd_present(pmd_t pmd) * the _PAGE_PSE flag will remain set at all times while the * _PAGE_PRESENT bit is clear). 
*/ - return pmd_flags(pmd) & (_PAGE_PRESENT | _PAGE_PROTNONE | _PAGE_PSE | - _PAGE_NUMA); + return pmd_flags(pmd) & (_PAGE_PRESENT | _PAGE_PROTNONE | _PAGE_PSE); } #ifdef CONFIG_NUMA_BALANCING @@ -555,11 +547,6 @@ static inline pte_t *pte_offset_kernel(pmd_t *pmd, unsigned long address) static inline int pmd_bad(pmd_t pmd) { -#ifdef CONFIG_NUMA_BALANCING - /* pmd_numa check */ - if ((pmd_flags(pmd) & (_PAGE_NUMA|_PAGE_PRESENT)) == _PAGE_NUMA) - return 0; -#endif return (pmd_flags(pmd) & ~_PAGE_USER) != _KERNPG_TABLE; } @@ -878,19 +865,16 @@ static inline void update_mmu_cache_pmd(struct vm_area_struct *vma, #ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY static inline pte_t pte_swp_mksoft_dirty(pte_t pte) { - VM_BUG_ON(pte_present_nonuma(pte)); return pte_set_flags(pte, _PAGE_SWP_SOFT_DIRTY); } static inline int pte_swp_soft_dirty(pte_t pte) { - VM_BUG_ON(pte_present_nonuma(pte)); return pte_flags(pte) & _PAGE_SWP_SOFT_DIRTY; } static inline pte_t pte_swp_clear_soft_dirty(pte_t pte) { - VM_BUG_ON(pte_present_nonuma(pte)); return pte_clear_flags(pte, _PAGE_SWP_SOFT_DIRTY); } #endif diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index e227970f983e..2ee781114d34 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -142,12 +142,7 @@ static inline int pgd_large(pgd_t pgd) { return 0; } /* Encode and de-code a swap entry */ #define SWP_TYPE_BITS 5 -#ifdef CONFIG_NUMA_BALANCING -/* Automatic NUMA balancing needs to be distinguishable from swap entries */ -#define SWP_OFFSET_SHIFT (_PAGE_BIT_PROTNONE + 2) -#else #define SWP_OFFSET_SHIFT (_PAGE_BIT_PROTNONE + 1) -#endif #define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS) diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 5185a4f599ec..da1cd67d87a4 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -27,14 +27,6 @@ #define _PAGE_BIT_SOFT_DIRTY _PAGE_BIT_SOFTW3 /* software dirty tracking */ #define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */ -/* - * Swap offsets on configurations that allow automatic NUMA balancing use the - * bits after _PAGE_BIT_GLOBAL. To uniquely distinguish NUMA hinting PTEs from - * swap entries, we use the first bit after _PAGE_BIT_GLOBAL and shrink the - * maximum possible swap space from 16TB to 8TB. - */ -#define _PAGE_BIT_NUMA (_PAGE_BIT_GLOBAL+1) - /* If _PAGE_BIT_PRESENT is clear, we use these: */ /* - if the user mapped it with PROT_NONE; pte_present gives true */ #define _PAGE_BIT_PROTNONE _PAGE_BIT_GLOBAL @@ -75,21 +67,6 @@ #define _PAGE_SOFT_DIRTY (_AT(pteval_t, 0)) #endif -/* - * _PAGE_NUMA distinguishes between a numa hinting minor fault and a page - * that is not present. The hinting fault gathers numa placement statistics - * (see pte_numa()). The bit is always zero when the PTE is not present. - * - * The bit picked must be always zero when the pmd is present and not - * present, so that we don't lose information when we set it while - * atomically clearing the present bit. - */ -#ifdef CONFIG_NUMA_BALANCING -#define _PAGE_NUMA (_AT(pteval_t, 1) << _PAGE_BIT_NUMA) -#else -#define _PAGE_NUMA (_AT(pteval_t, 0)) -#endif - /* * Tracking soft dirty bit when a page goes to a swap is tricky. 
* We need a bit which can be stored in pte _and_ not conflict @@ -122,8 +99,8 @@ /* Set of bits not changed in pte_modify */ #define _PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_PCD | _PAGE_PWT | \ _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY | \ - _PAGE_SOFT_DIRTY | _PAGE_NUMA) -#define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE | _PAGE_NUMA) + _PAGE_SOFT_DIRTY) +#define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE) /* * The cache modes defined here are used to translate between pure SW usage @@ -324,20 +301,6 @@ static inline pteval_t pte_flags(pte_t pte) return native_pte_val(pte) & PTE_FLAGS_MASK; } -#ifdef CONFIG_NUMA_BALANCING -/* Set of bits that distinguishes present, prot_none and numa ptes */ -#define _PAGE_NUMA_MASK (_PAGE_NUMA|_PAGE_PROTNONE|_PAGE_PRESENT) -static inline pteval_t ptenuma_flags(pte_t pte) -{ - return pte_flags(pte) & _PAGE_NUMA_MASK; -} - -static inline pmdval_t pmdnuma_flags(pmd_t pmd) -{ - return pmd_flags(pmd) & _PAGE_NUMA_MASK; -} -#endif /* CONFIG_NUMA_BALANCING */ - #define pgprot_val(x) ((x).pgprot) #define __pgprot(x) ((pgprot_t) { (x) } ) diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index 067922c06c29..4d46085c1b90 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -244,10 +244,6 @@ static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b) # define pte_accessible(mm, pte) ((void)(pte), 1) #endif -#ifndef pte_present_nonuma -#define pte_present_nonuma(pte) pte_present(pte) -#endif - #ifndef flush_tlb_fix_spurious_fault #define flush_tlb_fix_spurious_fault(vma, address) flush_tlb_page(vma, address) #endif @@ -693,157 +689,6 @@ static inline int pmd_protnone(pmd_t pmd) } #endif /* CONFIG_NUMA_BALANCING */ -#ifdef CONFIG_NUMA_BALANCING -/* - * _PAGE_NUMA distinguishes between an unmapped page table entry, an entry that - * is protected for PROT_NONE and a NUMA hinting fault entry. If the - * architecture defines __PAGE_PROTNONE then it should take that into account - * but those that do not can rely on the fact that the NUMA hinting scanner - * skips inaccessible VMAs. - * - * pte/pmd_present() returns true if pte/pmd_numa returns true. Page - * fault triggers on those regions if pte/pmd_numa returns true - * (because _PAGE_PRESENT is not set). - */ -#ifndef pte_numa -static inline int pte_numa(pte_t pte) -{ - return ptenuma_flags(pte) == _PAGE_NUMA; -} -#endif - -#ifndef pmd_numa -static inline int pmd_numa(pmd_t pmd) -{ - return pmdnuma_flags(pmd) == _PAGE_NUMA; -} -#endif - -/* - * pte/pmd_mknuma sets the _PAGE_ACCESSED bitflag automatically - * because they're called by the NUMA hinting minor page fault. If we - * wouldn't set the _PAGE_ACCESSED bitflag here, the TLB miss handler - * would be forced to set it later while filling the TLB after we - * return to userland. That would trigger a second write to memory - * that we optimize away by setting _PAGE_ACCESSED here. 
- */ -#ifndef pte_mknonnuma -static inline pte_t pte_mknonnuma(pte_t pte) -{ - pteval_t val = pte_val(pte); - - val &= ~_PAGE_NUMA; - val |= (_PAGE_PRESENT|_PAGE_ACCESSED); - return __pte(val); -} -#endif - -#ifndef pmd_mknonnuma -static inline pmd_t pmd_mknonnuma(pmd_t pmd) -{ - pmdval_t val = pmd_val(pmd); - - val &= ~_PAGE_NUMA; - val |= (_PAGE_PRESENT|_PAGE_ACCESSED); - - return __pmd(val); -} -#endif - -#ifndef pte_mknuma -static inline pte_t pte_mknuma(pte_t pte) -{ - pteval_t val = pte_val(pte); - - VM_BUG_ON(!(val & _PAGE_PRESENT)); - - val &= ~_PAGE_PRESENT; - val |= _PAGE_NUMA; - - return __pte(val); -} -#endif - -#ifndef ptep_set_numa -static inline void ptep_set_numa(struct mm_struct *mm, unsigned long addr, - pte_t *ptep) -{ - pte_t ptent = *ptep; - - ptent = pte_mknuma(ptent); - set_pte_at(mm, addr, ptep, ptent); - return; -} -#endif - -#ifndef pmd_mknuma -static inline pmd_t pmd_mknuma(pmd_t pmd) -{ - pmdval_t val = pmd_val(pmd); - - val &= ~_PAGE_PRESENT; - val |= _PAGE_NUMA; - - return __pmd(val); -} -#endif - -#ifndef pmdp_set_numa -static inline void pmdp_set_numa(struct mm_struct *mm, unsigned long addr, - pmd_t *pmdp) -{ - pmd_t pmd = *pmdp; - - pmd = pmd_mknuma(pmd); - set_pmd_at(mm, addr, pmdp, pmd); - return; -} -#endif -#else -static inline int pmd_numa(pmd_t pmd) -{ - return 0; -} - -static inline int pte_numa(pte_t pte) -{ - return 0; -} - -static inline pte_t pte_mknonnuma(pte_t pte) -{ - return pte; -} - -static inline pmd_t pmd_mknonnuma(pmd_t pmd) -{ - return pmd; -} - -static inline pte_t pte_mknuma(pte_t pte) -{ - return pte; -} - -static inline void ptep_set_numa(struct mm_struct *mm, unsigned long addr, - pte_t *ptep) -{ - return; -} - - -static inline pmd_t pmd_mknuma(pmd_t pmd) -{ - return pmd; -} - -static inline void pmdp_set_numa(struct mm_struct *mm, unsigned long addr, - pmd_t *pmdp) -{ - return ; -} -#endif /* CONFIG_NUMA_BALANCING */ - #endif /* CONFIG_MMU */ #endif /* !__ASSEMBLY__ */ diff --git a/include/linux/swapops.h b/include/linux/swapops.h index 831a3168ab35..73ca28070a92 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -54,7 +54,7 @@ static inline pgoff_t swp_offset(swp_entry_t entry) /* check whether a pte points to a swap entry */ static inline int is_swap_pte(pte_t pte) { - return !pte_none(pte) && !pte_present_nonuma(pte); + return !pte_none(pte); } #endif -- cgit v1.2.3 From 1d5e85abd7c07e1843941d41b41606df67250ea3 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 30 Jan 2015 13:11:39 +1100 Subject: mm: numa: do not trap faults on the huge zero page Faults on the huge zero page are pointless and there is a BUG_ON to catch them during fault time. This patch reintroduces a check that avoids marking the zero page PAGE_NONE. 
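Condensed, the reintroduced check looks as follows (mirroring the change_huge_pmd() hunk below):

	if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
		/* zero page data is read-only; local vs remote hits are uninteresting */
		if (prot_numa && is_huge_zero_pmd(*pmd)) {
			spin_unlock(ptl);
			return 0;
		}
		/* otherwise fall through to the normal protection change */
	}

change_pte_range() gets the equivalent treatment for base pages by skipping the zero and KSM pages returned by vm_normal_page().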
Signed-off-by: Mel Gorman Cc: Aneesh Kumar K.V Cc: Benjamin Herrenschmidt Cc: Dave Jones Cc: Hugh Dickins Cc: Ingo Molnar Cc: Kirill Shutemov Cc: Linus Torvalds Cc: Paul Mackerras Cc: Rik van Riel Cc: Sasha Levin Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 3 ++- mm/huge_memory.c | 13 ++++++++++++- mm/memory.c | 1 - mm/mprotect.c | 14 +++++++++++++- 4 files changed, 27 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 062bd252e994..f10b20f05159 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -31,7 +31,8 @@ extern int move_huge_pmd(struct vm_area_struct *vma, unsigned long new_addr, unsigned long old_end, pmd_t *old_pmd, pmd_t *new_pmd); extern int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, - unsigned long addr, pgprot_t newprot); + unsigned long addr, pgprot_t newprot, + int prot_numa); enum transparent_hugepage_flag { TRANSPARENT_HUGEPAGE_FLAG, diff --git a/mm/huge_memory.c b/mm/huge_memory.c index add87b3cf94c..27f93c0a8697 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1497,7 +1497,7 @@ out: * - HPAGE_PMD_NR is protections changed and TLB flush necessary */ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, - unsigned long addr, pgprot_t newprot) + unsigned long addr, pgprot_t newprot, int prot_numa) { struct mm_struct *mm = vma->vm_mm; spinlock_t *ptl; @@ -1505,6 +1505,17 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { pmd_t entry; + + /* + * Avoid trapping faults against the zero page. The read-only + * data is likely to be read-cached on the local CPU and + * local/remote hits to the zero page are not interesting. + */ + if (prot_numa && is_huge_zero_pmd(*pmd)) { + spin_unlock(ptl); + return 0; + } + ret = 1; entry = pmdp_get_and_clear_notify(mm, addr, pmd); entry = pmd_modify(entry, newprot); diff --git a/mm/memory.c b/mm/memory.c index 1b2013b2f95e..adbff9e97f07 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3037,7 +3037,6 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, pte_unmap_unlock(ptep, ptl); return 0; } - BUG_ON(is_zero_pfn(page_to_pfn(page))); /* * Avoid grouping on DSO/COW pages in specific and RO pages diff --git a/mm/mprotect.c b/mm/mprotect.c index 76824d73380d..dd599fc235c2 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -76,6 +76,18 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, if (pte_present(oldpte)) { pte_t ptent; + /* + * Avoid trapping faults against the zero or KSM + * pages. See similar comment in change_huge_pmd. + */ + if (prot_numa) { + struct page *page; + + page = vm_normal_page(vma, addr, oldpte); + if (!page || PageKsm(page)) + continue; + } + ptent = ptep_modify_prot_start(mm, addr, pte); ptent = pte_modify(ptent, newprot); @@ -142,7 +154,7 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, split_huge_page_pmd(vma, addr, pmd); else { int nr_ptes = change_huge_pmd(vma, pmd, addr, - newprot); + newprot, prot_numa); if (nr_ptes) { if (nr_ptes == HPAGE_PMD_NR) { -- cgit v1.2.3 From 42d039dc0f0e71a22ae065821319c1cf5cd85685 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Fri, 30 Jan 2015 13:11:40 +1100 Subject: mm: reduce try_to_compact_pages parameters Expand the usage of the struct alloc_context introduced in the previous patch also for calling try_to_compact_pages(), to reduce the number of its parameters. 
Since the function is in different compilation unit, we need to move alloc_context definition in the shared mm/internal.h header. With this change we get simpler code and small savings of code size and stack usage: add/remove: 0/0 grow/shrink: 0/1 up/down: 0/-27 (-27) function old new delta __alloc_pages_direct_compact 283 256 -27 add/remove: 0/0 grow/shrink: 0/1 up/down: 0/-13 (-13) function old new delta try_to_compact_pages 582 569 -13 Stack usage of __alloc_pages_direct_compact goes from 24 to none (per scripts/checkstack.pl). Signed-off-by: Vlastimil Babka Acked-by: Michal Hocko Cc: Mel Gorman Cc: Zhang Yanfei Cc: Minchan Kim Cc: David Rientjes Cc: Rik van Riel Cc: "Aneesh Kumar K.V" Cc: "Kirill A. Shutemov" Cc: Johannes Weiner Cc: Joonsoo Kim Signed-off-by: Andrew Morton --- include/linux/compaction.h | 17 +++++++++-------- mm/compaction.c | 23 +++++++++++------------ mm/internal.h | 22 ++++++++++++++++++++++ mm/page_alloc.c | 27 ++------------------------- 4 files changed, 44 insertions(+), 45 deletions(-) (limited to 'include') diff --git a/include/linux/compaction.h b/include/linux/compaction.h index 3238ffa33f68..f2efda2e6ac6 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -21,6 +21,8 @@ /* Zone lock or lru_lock was contended in async compaction */ #define COMPACT_CONTENDED_LOCK 2 +struct alloc_context; /* in mm/internal.h */ + #ifdef CONFIG_COMPACTION extern int sysctl_compact_memory; extern int sysctl_compaction_handler(struct ctl_table *table, int write, @@ -30,10 +32,9 @@ extern int sysctl_extfrag_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos); extern int fragmentation_index(struct zone *zone, unsigned int order); -extern unsigned long try_to_compact_pages(struct zonelist *zonelist, - int order, gfp_t gfp_mask, nodemask_t *mask, - enum migrate_mode mode, int *contended, - int alloc_flags, int classzone_idx); +extern unsigned long try_to_compact_pages(gfp_t gfp_mask, unsigned int order, + int alloc_flags, const struct alloc_context *ac, + enum migrate_mode mode, int *contended); extern void compact_pgdat(pg_data_t *pgdat, int order); extern void reset_isolation_suitable(pg_data_t *pgdat); extern unsigned long compaction_suitable(struct zone *zone, int order, @@ -101,10 +102,10 @@ static inline bool compaction_restarting(struct zone *zone, int order) } #else -static inline unsigned long try_to_compact_pages(struct zonelist *zonelist, - int order, gfp_t gfp_mask, nodemask_t *nodemask, - enum migrate_mode mode, int *contended, - int alloc_flags, int classzone_idx) +static inline unsigned long try_to_compact_pages(gfp_t gfp_mask, + unsigned int order, int alloc_flags, + const struct alloc_context *ac, + enum migrate_mode mode, int *contended) { return COMPACT_CONTINUE; } diff --git a/mm/compaction.c b/mm/compaction.c index 546e571e9d60..9c7e6909dd29 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1335,22 +1335,20 @@ int sysctl_extfrag_threshold = 500; /** * try_to_compact_pages - Direct compact to satisfy a high-order allocation - * @zonelist: The zonelist used for the current allocation - * @order: The order of the current allocation * @gfp_mask: The GFP mask of the current allocation - * @nodemask: The allowed nodes to allocate from + * @order: The order of the current allocation + * @alloc_flags: The allocation flags of the current allocation + * @ac: The context of current allocation * @mode: The migration mode for async, sync light, or sync migration * @contended: Return value that determines 
if compaction was aborted due to * need_resched() or lock contention * * This is the main entry point for direct page compaction. */ -unsigned long try_to_compact_pages(struct zonelist *zonelist, - int order, gfp_t gfp_mask, nodemask_t *nodemask, - enum migrate_mode mode, int *contended, - int alloc_flags, int classzone_idx) +unsigned long try_to_compact_pages(gfp_t gfp_mask, unsigned int order, + int alloc_flags, const struct alloc_context *ac, + enum migrate_mode mode, int *contended) { - enum zone_type high_zoneidx = gfp_zone(gfp_mask); int may_enter_fs = gfp_mask & __GFP_FS; int may_perform_io = gfp_mask & __GFP_IO; struct zoneref *z; @@ -1365,8 +1363,8 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, return COMPACT_SKIPPED; /* Compact each zone in the list */ - for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx, - nodemask) { + for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx, + ac->nodemask) { int status; int zone_contended; @@ -1374,7 +1372,8 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, continue; status = compact_zone_order(zone, order, gfp_mask, mode, - &zone_contended, alloc_flags, classzone_idx); + &zone_contended, alloc_flags, + ac->classzone_idx); rc = max(status, rc); /* * It takes at least one zone that wasn't lock contended @@ -1384,7 +1383,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, /* If a normal allocation would succeed, stop compacting */ if (zone_watermark_ok(zone, order, low_wmark_pages(zone), - classzone_idx, alloc_flags)) { + ac->classzone_idx, alloc_flags)) { /* * We think the allocation will succeed in this zone, * but it is not certain, hence the false. The caller diff --git a/mm/internal.h b/mm/internal.h index efad241f7014..c4d6c9b43491 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -109,6 +109,28 @@ extern pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address); * in mm/page_alloc.c */ +/* + * Structure for holding the mostly immutable allocation parameters passed + * between functions involved in allocations, including the alloc_pages* + * family of functions. + * + * nodemask, migratetype and high_zoneidx are initialized only once in + * __alloc_pages_nodemask() and then never change. + * + * zonelist, preferred_zone and classzone_idx are set first in + * __alloc_pages_nodemask() for the fast path, and might be later changed + * in __alloc_pages_slowpath(). All other functions pass the whole strucure + * by a const pointer. + */ +struct alloc_context { + struct zonelist *zonelist; + nodemask_t *nodemask; + struct zone *preferred_zone; + int classzone_idx; + int migratetype; + enum zone_type high_zoneidx; +}; + /* * Locate the struct page for both the matching buddy in our * pair (buddy1) and the combined O(n+1) page they form (page). diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 4aead0bd8d44..d664eb922a7d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -232,27 +232,6 @@ EXPORT_SYMBOL(nr_node_ids); EXPORT_SYMBOL(nr_online_nodes); #endif -/* - * Structure for holding the mostly immutable allocation parameters passed - * between alloc_pages* family of functions. - * - * nodemask, migratetype and high_zoneidx are initialized only once in - * __alloc_pages_nodemask() and then never change. - * - * zonelist, preferred_zone and classzone_idx are set first in - * __alloc_pages_nodemask() for the fast path, and might be later changed - * in __alloc_pages_slowpath(). All other functions pass the whole strucure - * by a const pointer. 
- */ -struct alloc_context { - struct zonelist *zonelist; - nodemask_t *nodemask; - struct zone *preferred_zone; - int classzone_idx; - int migratetype; - enum zone_type high_zoneidx; -}; - int page_group_by_mobility_disabled __read_mostly; void set_pageblock_migratetype(struct page *page, int migratetype) @@ -2429,10 +2408,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, return NULL; current->flags |= PF_MEMALLOC; - compact_result = try_to_compact_pages(ac->zonelist, order, gfp_mask, - ac->nodemask, mode, - contended_compaction, - alloc_flags, ac->classzone_idx); + compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac, + mode, contended_compaction); current->flags &= ~PF_MEMALLOC; switch (compact_result) { -- cgit v1.2.3 From f33bf1c5b6c87066877b45649ce167ad73dc3a8e Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Fri, 30 Jan 2015 13:11:41 +1100 Subject: mm: microoptimize zonelist operations next_zones_zonelist() returns a zoneref pointer, as well as a zone pointer via extra parameter. Since the latter can be trivially obtained by dereferencing the former, the overhead of the extra parameter is unjustified. This patch thus removes the zone parameter from next_zones_zonelist(). Both callers happen to be in the same header file, so it's simple to add the zoneref dereference inline. We save some bytes of code size. add/remove: 0/0 grow/shrink: 0/3 up/down: 0/-105 (-105) function old new delta nr_free_zone_pages 129 115 -14 __alloc_pages_nodemask 2300 2285 -15 get_page_from_freelist 2652 2576 -76 add/remove: 0/0 grow/shrink: 1/0 up/down: 10/0 (10) function old new delta try_to_compact_pages 569 579 +10 Signed-off-by: Vlastimil Babka Cc: Mel Gorman Cc: Zhang Yanfei Cc: Minchan Kim Cc: David Rientjes Cc: Rik van Riel Cc: "Aneesh Kumar K.V" Cc: "Kirill A. 
Shutemov" Cc: Johannes Weiner Cc: Joonsoo Kim Cc: Michal Hocko Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 13 +++++++------ mm/mmzone.c | 4 +--- 2 files changed, 8 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index b41829701334..f279d9c158cd 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -970,7 +970,6 @@ static inline int zonelist_node_idx(struct zoneref *zoneref) * @z - The cursor used as a starting point for the search * @highest_zoneidx - The zone index of the highest zone to return * @nodes - An optional nodemask to filter the zonelist with - * @zone - The first suitable zone found is returned via this parameter * * This function returns the next zone at or below a given zone index that is * within the allowed nodemask using a cursor as the starting point for the @@ -980,8 +979,7 @@ static inline int zonelist_node_idx(struct zoneref *zoneref) */ struct zoneref *next_zones_zonelist(struct zoneref *z, enum zone_type highest_zoneidx, - nodemask_t *nodes, - struct zone **zone); + nodemask_t *nodes); /** * first_zones_zonelist - Returns the first zone at or below highest_zoneidx within the allowed nodemask in a zonelist @@ -1000,8 +998,10 @@ static inline struct zoneref *first_zones_zonelist(struct zonelist *zonelist, nodemask_t *nodes, struct zone **zone) { - return next_zones_zonelist(zonelist->_zonerefs, highest_zoneidx, nodes, - zone); + struct zoneref *z = next_zones_zonelist(zonelist->_zonerefs, + highest_zoneidx, nodes); + *zone = zonelist_zone(z); + return z; } /** @@ -1018,7 +1018,8 @@ static inline struct zoneref *first_zones_zonelist(struct zonelist *zonelist, #define for_each_zone_zonelist_nodemask(zone, z, zlist, highidx, nodemask) \ for (z = first_zones_zonelist(zlist, highidx, nodemask, &zone); \ zone; \ - z = next_zones_zonelist(++z, highidx, nodemask, &zone)) \ + z = next_zones_zonelist(++z, highidx, nodemask), \ + zone = zonelist_zone(z)) \ /** * for_each_zone_zonelist - helper macro to iterate over valid zones in a zonelist at or below a given zone index diff --git a/mm/mmzone.c b/mm/mmzone.c index bf34fb8556db..7d87ebb0d632 100644 --- a/mm/mmzone.c +++ b/mm/mmzone.c @@ -54,8 +54,7 @@ static inline int zref_in_nodemask(struct zoneref *zref, nodemask_t *nodes) /* Returns the next zone at or below highest_zoneidx in a zonelist */ struct zoneref *next_zones_zonelist(struct zoneref *z, enum zone_type highest_zoneidx, - nodemask_t *nodes, - struct zone **zone) + nodemask_t *nodes) { /* * Find the next suitable zone to use for the allocation. @@ -69,7 +68,6 @@ struct zoneref *next_zones_zonelist(struct zoneref *z, (z->zone && !zref_in_nodemask(z, nodes))) z++; - *zone = zonelist_zone(z); return z; } -- cgit v1.2.3 From 2f4d6b28f84d6ef2a02680d31787fb092d8dbc8b Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Fri, 30 Jan 2015 13:11:41 +1100 Subject: list_lru: introduce list_lru_shrink_{count,walk} Kmem accounting of memcg is unusable now, because it lacks slab shrinker support. That means when we hit the limit we will get ENOMEM w/o any chance to recover. What we should do then is to call shrink_slab, which would reclaim old inode/dentry caches from this cgroup. This is what this patch set is intended to do. Basically, it does two things. First, it introduces the notion of per-memcg slab shrinker. A shrinker that wants to reclaim objects per cgroup should mark itself as SHRINKER_MEMCG_AWARE. Then it will be passed the memory cgroup to scan from in shrink_control->memcg. 
For such shrinkers shrink_slab iterates over the whole cgroup subtree under the target cgroup and calls the shrinker for each kmem-active memory cgroup. Secondly, this patch set makes the list_lru structure per-memcg. It's done transparently to list_lru users - everything they have to do is to tell list_lru_init that they want memcg-aware list_lru. Then the list_lru will automatically distribute objects among per-memcg lists basing on which cgroup the object is accounted to. This way to make FS shrinkers (icache, dcache) memcg-aware we only need to make them use memcg-aware list_lru, and this is what this patch set does. As before, this patch set only enables per-memcg kmem reclaim when the pressure goes from memory.limit, not from memory.kmem.limit. Handling memory.kmem.limit is going to be tricky due to GFP_NOFS allocations, and it is still unclear whether we will have this knob in the unified hierarchy. This patch (of 9): NUMA aware slab shrinkers use the list_lru structure to distribute objects coming from different NUMA nodes to different lists. Whenever such a shrinker needs to count or scan objects from a particular node, it issues commands like this: count = list_lru_count_node(lru, sc->nid); freed = list_lru_walk_node(lru, sc->nid, isolate_func, isolate_arg, &sc->nr_to_scan); where sc is an instance of the shrink_control structure passed to it from vmscan. To simplify this, let's add special list_lru functions to be used by shrinkers, list_lru_shrink_count() and list_lru_shrink_walk(), which consolidate the nid and nr_to_scan arguments in the shrink_control structure. This will also allow us to avoid patching shrinkers that use list_lru when we make shrink_slab() per-memcg - all we will have to do is extend the shrink_control structure to include the target memcg and make list_lru_shrink_{count,walk} handle this appropriately. Signed-off-by: Vladimir Davydov Suggested-by: Dave Chinner Cc: Johannes Weiner Cc: Michal Hocko Cc: Greg Thelen Cc: Glauber Costa Cc: Alexander Viro Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Tejun Heo Signed-off-by: Andrew Morton --- fs/dcache.c | 14 ++++++-------- fs/gfs2/quota.c | 6 +++--- fs/inode.c | 7 +++---- fs/internal.h | 7 +++---- fs/super.c | 24 +++++++++++------------- fs/xfs/xfs_buf.c | 7 +++---- fs/xfs/xfs_qm.c | 7 +++---- include/linux/list_lru.h | 16 ++++++++++++++++ mm/workingset.c | 6 +++--- 9 files changed, 51 insertions(+), 43 deletions(-) (limited to 'include') diff --git a/fs/dcache.c b/fs/dcache.c index e368d4f412f9..56c5da89f58a 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -930,24 +930,22 @@ dentry_lru_isolate(struct list_head *item, spinlock_t *lru_lock, void *arg) /** * prune_dcache_sb - shrink the dcache * @sb: superblock - * @nr_to_scan : number of entries to try to free - * @nid: which node to scan for freeable entities + * @sc: shrink control, passed to list_lru_shrink_walk() * - * Attempt to shrink the superblock dcache LRU by @nr_to_scan entries. This is - * done when we need more memory an called from the superblock shrinker + * Attempt to shrink the superblock dcache LRU by @sc->nr_to_scan entries. This + * is done when we need more memory and called from the superblock shrinker * function. * * This function may fail to free any resources if all the dentries are in * use. 
*/ -long prune_dcache_sb(struct super_block *sb, unsigned long nr_to_scan, - int nid) +long prune_dcache_sb(struct super_block *sb, struct shrink_control *sc) { LIST_HEAD(dispose); long freed; - freed = list_lru_walk_node(&sb->s_dentry_lru, nid, dentry_lru_isolate, - &dispose, &nr_to_scan); + freed = list_lru_shrink_walk(&sb->s_dentry_lru, sc, + dentry_lru_isolate, &dispose); shrink_dentry_list(&dispose); return freed; } diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index c8b148bbdc8b..56db71d5c95f 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -171,8 +171,8 @@ static unsigned long gfs2_qd_shrink_scan(struct shrinker *shrink, if (!(sc->gfp_mask & __GFP_FS)) return SHRINK_STOP; - freed = list_lru_walk_node(&gfs2_qd_lru, sc->nid, gfs2_qd_isolate, - &dispose, &sc->nr_to_scan); + freed = list_lru_shrink_walk(&gfs2_qd_lru, sc, + gfs2_qd_isolate, &dispose); gfs2_qd_dispose(&dispose); @@ -182,7 +182,7 @@ static unsigned long gfs2_qd_shrink_scan(struct shrinker *shrink, static unsigned long gfs2_qd_shrink_count(struct shrinker *shrink, struct shrink_control *sc) { - return vfs_pressure_ratio(list_lru_count_node(&gfs2_qd_lru, sc->nid)); + return vfs_pressure_ratio(list_lru_shrink_count(&gfs2_qd_lru, sc)); } struct shrinker gfs2_qd_shrinker = { diff --git a/fs/inode.c b/fs/inode.c index c760fac33c92..b80b17a09d36 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -750,14 +750,13 @@ inode_lru_isolate(struct list_head *item, spinlock_t *lru_lock, void *arg) * to trim from the LRU. Inodes to be freed are moved to a temporary list and * then are freed outside inode_lock by dispose_list(). */ -long prune_icache_sb(struct super_block *sb, unsigned long nr_to_scan, - int nid) +long prune_icache_sb(struct super_block *sb, struct shrink_control *sc) { LIST_HEAD(freeable); long freed; - freed = list_lru_walk_node(&sb->s_inode_lru, nid, inode_lru_isolate, - &freeable, &nr_to_scan); + freed = list_lru_shrink_walk(&sb->s_inode_lru, sc, + inode_lru_isolate, &freeable); dispose_list(&freeable); return freed; } diff --git a/fs/internal.h b/fs/internal.h index e9a61fe67575..d92c346a793d 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -14,6 +14,7 @@ struct file_system_type; struct linux_binprm; struct path; struct mount; +struct shrink_control; /* * block_dev.c @@ -111,8 +112,7 @@ extern int open_check_o_direct(struct file *f); * inode.c */ extern spinlock_t inode_sb_list_lock; -extern long prune_icache_sb(struct super_block *sb, unsigned long nr_to_scan, - int nid); +extern long prune_icache_sb(struct super_block *sb, struct shrink_control *sc); extern void inode_add_lru(struct inode *inode); /* @@ -129,8 +129,7 @@ extern int invalidate_inodes(struct super_block *, bool); */ extern struct dentry *__d_alloc(struct super_block *, const struct qstr *); extern int d_set_mounted(struct dentry *dentry); -extern long prune_dcache_sb(struct super_block *sb, unsigned long nr_to_scan, - int nid); +extern long prune_dcache_sb(struct super_block *sb, struct shrink_control *sc); /* * read_write.c diff --git a/fs/super.c b/fs/super.c index eae088f6aaae..4554ac257647 100644 --- a/fs/super.c +++ b/fs/super.c @@ -77,8 +77,8 @@ static unsigned long super_cache_scan(struct shrinker *shrink, if (sb->s_op->nr_cached_objects) fs_objects = sb->s_op->nr_cached_objects(sb, sc->nid); - inodes = list_lru_count_node(&sb->s_inode_lru, sc->nid); - dentries = list_lru_count_node(&sb->s_dentry_lru, sc->nid); + inodes = list_lru_shrink_count(&sb->s_inode_lru, sc); + dentries = list_lru_shrink_count(&sb->s_dentry_lru, sc); total_objects = 
dentries + inodes + fs_objects + 1; if (!total_objects) total_objects = 1; @@ -86,20 +86,20 @@ static unsigned long super_cache_scan(struct shrinker *shrink, /* proportion the scan between the caches */ dentries = mult_frac(sc->nr_to_scan, dentries, total_objects); inodes = mult_frac(sc->nr_to_scan, inodes, total_objects); + fs_objects = mult_frac(sc->nr_to_scan, fs_objects, total_objects); /* * prune the dcache first as the icache is pinned by it, then * prune the icache, followed by the filesystem specific caches */ - freed = prune_dcache_sb(sb, dentries, sc->nid); - freed += prune_icache_sb(sb, inodes, sc->nid); + sc->nr_to_scan = dentries; + freed = prune_dcache_sb(sb, sc); + sc->nr_to_scan = inodes; + freed += prune_icache_sb(sb, sc); - if (fs_objects) { - fs_objects = mult_frac(sc->nr_to_scan, fs_objects, - total_objects); + if (fs_objects) freed += sb->s_op->free_cached_objects(sb, fs_objects, sc->nid); - } drop_super(sb); return freed; @@ -118,17 +118,15 @@ static unsigned long super_cache_count(struct shrinker *shrink, * scalability bottleneck. The counts could get updated * between super_cache_count and super_cache_scan anyway. * Call to super_cache_count with shrinker_rwsem held - * ensures the safety of call to list_lru_count_node() and + * ensures the safety of call to list_lru_shrink_count() and * s_op->nr_cached_objects(). */ if (sb->s_op && sb->s_op->nr_cached_objects) total_objects = sb->s_op->nr_cached_objects(sb, sc->nid); - total_objects += list_lru_count_node(&sb->s_dentry_lru, - sc->nid); - total_objects += list_lru_count_node(&sb->s_inode_lru, - sc->nid); + total_objects += list_lru_shrink_count(&sb->s_dentry_lru, sc); + total_objects += list_lru_shrink_count(&sb->s_inode_lru, sc); total_objects = vfs_pressure_ratio(total_objects); return total_objects; diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index bb502a391792..15c9d224c721 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1583,10 +1583,9 @@ xfs_buftarg_shrink_scan( struct xfs_buftarg, bt_shrinker); LIST_HEAD(dispose); unsigned long freed; - unsigned long nr_to_scan = sc->nr_to_scan; - freed = list_lru_walk_node(&btp->bt_lru, sc->nid, xfs_buftarg_isolate, - &dispose, &nr_to_scan); + freed = list_lru_shrink_walk(&btp->bt_lru, sc, + xfs_buftarg_isolate, &dispose); while (!list_empty(&dispose)) { struct xfs_buf *bp; @@ -1605,7 +1604,7 @@ xfs_buftarg_shrink_count( { struct xfs_buftarg *btp = container_of(shrink, struct xfs_buftarg, bt_shrinker); - return list_lru_count_node(&btp->bt_lru, sc->nid); + return list_lru_shrink_count(&btp->bt_lru, sc); } void diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 79fb19dd9c83..d77bf6d8312a 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -523,7 +523,6 @@ xfs_qm_shrink_scan( struct xfs_qm_isolate isol; unsigned long freed; int error; - unsigned long nr_to_scan = sc->nr_to_scan; if ((sc->gfp_mask & (__GFP_FS|__GFP_WAIT)) != (__GFP_FS|__GFP_WAIT)) return 0; @@ -531,8 +530,8 @@ xfs_qm_shrink_scan( INIT_LIST_HEAD(&isol.buffers); INIT_LIST_HEAD(&isol.dispose); - freed = list_lru_walk_node(&qi->qi_lru, sc->nid, xfs_qm_dquot_isolate, &isol, - &nr_to_scan); + freed = list_lru_shrink_walk(&qi->qi_lru, sc, + xfs_qm_dquot_isolate, &isol); error = xfs_buf_delwri_submit(&isol.buffers); if (error) @@ -557,7 +556,7 @@ xfs_qm_shrink_count( struct xfs_quotainfo *qi = container_of(shrink, struct xfs_quotainfo, qi_shrinker); - return list_lru_count_node(&qi->qi_lru, sc->nid); + return list_lru_shrink_count(&qi->qi_lru, sc); } /* diff --git a/include/linux/list_lru.h 
b/include/linux/list_lru.h index f3434533fbf8..f500a2e39b13 100644 --- a/include/linux/list_lru.h +++ b/include/linux/list_lru.h @@ -9,6 +9,7 @@ #include #include +#include /* list_lru_walk_cb has to always return one of those */ enum lru_status { @@ -81,6 +82,13 @@ bool list_lru_del(struct list_lru *lru, struct list_head *item); * Callers that want such a guarantee need to provide an outer lock. */ unsigned long list_lru_count_node(struct list_lru *lru, int nid); + +static inline unsigned long list_lru_shrink_count(struct list_lru *lru, + struct shrink_control *sc) +{ + return list_lru_count_node(lru, sc->nid); +} + static inline unsigned long list_lru_count(struct list_lru *lru) { long count = 0; @@ -119,6 +127,14 @@ unsigned long list_lru_walk_node(struct list_lru *lru, int nid, list_lru_walk_cb isolate, void *cb_arg, unsigned long *nr_to_walk); +static inline unsigned long +list_lru_shrink_walk(struct list_lru *lru, struct shrink_control *sc, + list_lru_walk_cb isolate, void *cb_arg) +{ + return list_lru_walk_node(lru, sc->nid, isolate, cb_arg, + &sc->nr_to_scan); +} + static inline unsigned long list_lru_walk(struct list_lru *lru, list_lru_walk_cb isolate, void *cb_arg, unsigned long nr_to_walk) diff --git a/mm/workingset.c b/mm/workingset.c index f7216fa7da27..d4fa7fb10a52 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -275,7 +275,7 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker, /* list_lru lock nests inside IRQ-safe mapping->tree_lock */ local_irq_disable(); - shadow_nodes = list_lru_count_node(&workingset_shadow_nodes, sc->nid); + shadow_nodes = list_lru_shrink_count(&workingset_shadow_nodes, sc); local_irq_enable(); pages = node_present_pages(sc->nid); @@ -376,8 +376,8 @@ static unsigned long scan_shadow_nodes(struct shrinker *shrinker, /* list_lru lock nests inside IRQ-safe mapping->tree_lock */ local_irq_disable(); - ret = list_lru_walk_node(&workingset_shadow_nodes, sc->nid, - shadow_lru_isolate, NULL, &sc->nr_to_scan); + ret = list_lru_shrink_walk(&workingset_shadow_nodes, sc, + shadow_lru_isolate, NULL); local_irq_enable(); return ret; } -- cgit v1.2.3 From e61f0d9309d1e1d3e65545f61f1e9989e5ed5f82 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Fri, 30 Jan 2015 13:11:41 +1100 Subject: fs: consolidate {nr,free}_cached_objects args in shrink_control We are going to make FS shrinkers memcg-aware. To achieve that, we will have to pass the memcg to scan to the nr_cached_objects and free_cached_objects VFS methods, which currently take only the NUMA node to scan. Since the shrink_control structure already holds the node, and the memcg to scan will be added to it when we introduce memcg-aware vmscan, let us consolidate the methods' arguments in this structure to keep things clean. 
Signed-off-by: Vladimir Davydov Suggested-by: Dave Chinner Cc: Johannes Weiner Cc: Michal Hocko Cc: Greg Thelen Cc: Glauber Costa Cc: Alexander Viro Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Tejun Heo Signed-off-by: Andrew Morton --- fs/super.c | 12 ++++++------ fs/xfs/xfs_super.c | 7 +++---- include/linux/fs.h | 6 ++++-- 3 files changed, 13 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/fs/super.c b/fs/super.c index 4554ac257647..a2b735a42e74 100644 --- a/fs/super.c +++ b/fs/super.c @@ -75,7 +75,7 @@ static unsigned long super_cache_scan(struct shrinker *shrink, return SHRINK_STOP; if (sb->s_op->nr_cached_objects) - fs_objects = sb->s_op->nr_cached_objects(sb, sc->nid); + fs_objects = sb->s_op->nr_cached_objects(sb, sc); inodes = list_lru_shrink_count(&sb->s_inode_lru, sc); dentries = list_lru_shrink_count(&sb->s_dentry_lru, sc); @@ -97,9 +97,10 @@ static unsigned long super_cache_scan(struct shrinker *shrink, sc->nr_to_scan = inodes; freed += prune_icache_sb(sb, sc); - if (fs_objects) - freed += sb->s_op->free_cached_objects(sb, fs_objects, - sc->nid); + if (fs_objects) { + sc->nr_to_scan = fs_objects; + freed += sb->s_op->free_cached_objects(sb, sc); + } drop_super(sb); return freed; @@ -122,8 +123,7 @@ static unsigned long super_cache_count(struct shrinker *shrink, * s_op->nr_cached_objects(). */ if (sb->s_op && sb->s_op->nr_cached_objects) - total_objects = sb->s_op->nr_cached_objects(sb, - sc->nid); + total_objects = sb->s_op->nr_cached_objects(sb, sc); total_objects += list_lru_shrink_count(&sb->s_dentry_lru, sc); total_objects += list_lru_shrink_count(&sb->s_inode_lru, sc); diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 19cbda196369..494c6dfee2dd 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1531,7 +1531,7 @@ xfs_fs_mount( static long xfs_fs_nr_cached_objects( struct super_block *sb, - int nid) + struct shrink_control *sc) { return xfs_reclaim_inodes_count(XFS_M(sb)); } @@ -1539,10 +1539,9 @@ xfs_fs_nr_cached_objects( static long xfs_fs_free_cached_objects( struct super_block *sb, - long nr_to_scan, - int nid) + struct shrink_control *sc) { - return xfs_reclaim_inodes_nr(XFS_M(sb), nr_to_scan); + return xfs_reclaim_inodes_nr(XFS_M(sb), sc->nr_to_scan); } static const struct super_operations xfs_super_operations = { diff --git a/include/linux/fs.h b/include/linux/fs.h index 60acab209701..4cd648f51f63 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1616,8 +1616,10 @@ struct super_operations { struct dquot **(*get_dquots)(struct inode *); #endif int (*bdev_try_to_free_page)(struct super_block*, struct page*, gfp_t); - long (*nr_cached_objects)(struct super_block *, int); - long (*free_cached_objects)(struct super_block *, long, int); + long (*nr_cached_objects)(struct super_block *, + struct shrink_control *); + long (*free_cached_objects)(struct super_block *, + struct shrink_control *); }; /* -- cgit v1.2.3 From 202bbe92a605a8314abb27d969943336e50cf6f2 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Fri, 30 Jan 2015 13:11:42 +1100 Subject: vmscan: per memory cgroup slab shrinkers This patch adds SHRINKER_MEMCG_AWARE flag. If a shrinker has this flag set, it will be called per memory cgroup. The memory cgroup to scan objects from is passed in shrink_control->memcg. If the memory cgroup is NULL, a memcg aware shrinker is supposed to scan objects from the global list. Unaware shrinkers are only called on global pressure with memcg=NULL. 
Signed-off-by: Vladimir Davydov Cc: Dave Chinner Cc: Johannes Weiner Cc: Michal Hocko Cc: Greg Thelen Cc: Glauber Costa Cc: Alexander Viro Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Tejun Heo Signed-off-by: Andrew Morton --- fs/drop_caches.c | 14 -------- include/linux/memcontrol.h | 7 ++++ include/linux/mm.h | 5 ++- include/linux/shrinker.h | 6 +++- mm/memcontrol.c | 2 +- mm/memory-failure.c | 11 ++---- mm/vmscan.c | 86 +++++++++++++++++++++++++++++++++------------- 7 files changed, 80 insertions(+), 51 deletions(-) (limited to 'include') diff --git a/fs/drop_caches.c b/fs/drop_caches.c index 2bc2c87f35e7..5718cb9f7273 100644 --- a/fs/drop_caches.c +++ b/fs/drop_caches.c @@ -37,20 +37,6 @@ static void drop_pagecache_sb(struct super_block *sb, void *unused) iput(toput_inode); } -static void drop_slab(void) -{ - int nr_objects; - - do { - int nid; - - nr_objects = 0; - for_each_online_node(nid) - nr_objects += shrink_node_slabs(GFP_KERNEL, nid, - 1000, 1000); - } while (nr_objects > 10); -} - int drop_caches_sysctl_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 76b4084b8d08..d555d6533bd0 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -375,6 +375,8 @@ static inline bool memcg_kmem_enabled(void) return static_key_false(&memcg_kmem_enabled_key); } +bool memcg_kmem_is_active(struct mem_cgroup *memcg); + /* * In general, we'll do everything in our power to not incur in any overhead * for non-memcg users for the kmem functions. Not even a function call, if we @@ -504,6 +506,11 @@ static inline bool memcg_kmem_enabled(void) return false; } +static inline bool memcg_kmem_is_active(struct mem_cgroup *memcg) +{ + return false; +} + static inline bool memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **memcg, int order) { diff --git a/include/linux/mm.h b/include/linux/mm.h index f189197363d2..a229a17fb79c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2122,9 +2122,8 @@ int drop_caches_sysctl_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *); #endif -unsigned long shrink_node_slabs(gfp_t gfp_mask, int nid, - unsigned long nr_scanned, - unsigned long nr_eligible); +void drop_slab(void); +void drop_slab_node(int nid); #ifndef CONFIG_MMU #define randomize_va_space 0 diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h index f4aee75f00b1..4fcacd915d45 100644 --- a/include/linux/shrinker.h +++ b/include/linux/shrinker.h @@ -20,6 +20,9 @@ struct shrink_control { /* current node being shrunk (for NUMA aware shrinkers) */ int nid; + + /* current memcg being shrunk (for memcg aware shrinkers) */ + struct mem_cgroup *memcg; }; #define SHRINK_STOP (~0UL) @@ -61,7 +64,8 @@ struct shrinker { #define DEFAULT_SEEKS 2 /* A good number if you don't know better. 
*/ /* Flags */ -#define SHRINKER_NUMA_AWARE (1 << 0) +#define SHRINKER_NUMA_AWARE (1 << 0) +#define SHRINKER_MEMCG_AWARE (1 << 1) extern int register_shrinker(struct shrinker *); extern void unregister_shrinker(struct shrinker *); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index a6140c0764f4..c63066d0ae12 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -365,7 +365,7 @@ struct mem_cgroup { }; #ifdef CONFIG_MEMCG_KMEM -static bool memcg_kmem_is_active(struct mem_cgroup *memcg) +bool memcg_kmem_is_active(struct mem_cgroup *memcg) { return memcg->kmemcg_id >= 0; } diff --git a/mm/memory-failure.c b/mm/memory-failure.c index feb803bf3443..1a735fad2a13 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -242,15 +242,8 @@ void shake_page(struct page *p, int access) * Only call shrink_node_slabs here (which would also shrink * other caches) if access is not potentially fatal. */ - if (access) { - int nr; - int nid = page_to_nid(p); - do { - nr = shrink_node_slabs(GFP_KERNEL, nid, 1000, 1000); - if (page_count(p) == 1) - break; - } while (nr > 10); - } + if (access) + drop_slab_node(page_to_nid(p)); } EXPORT_SYMBOL_GPL(shake_page); diff --git a/mm/vmscan.c b/mm/vmscan.c index f756a202d5d5..1525d24811d8 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -229,10 +229,10 @@ EXPORT_SYMBOL(unregister_shrinker); #define SHRINK_BATCH 128 -static unsigned long shrink_slabs(struct shrink_control *shrinkctl, - struct shrinker *shrinker, - unsigned long nr_scanned, - unsigned long nr_eligible) +static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, + struct shrinker *shrinker, + unsigned long nr_scanned, + unsigned long nr_eligible) { unsigned long freed = 0; unsigned long long delta; @@ -341,9 +341,10 @@ static unsigned long shrink_slabs(struct shrink_control *shrinkctl, } /** - * shrink_node_slabs - shrink slab caches of a given node + * shrink_slab - shrink slab caches * @gfp_mask: allocation context * @nid: node whose slab caches to target + * @memcg: memory cgroup whose slab caches to target * @nr_scanned: pressure numerator * @nr_eligible: pressure denominator * @@ -352,6 +353,12 @@ static unsigned long shrink_slabs(struct shrink_control *shrinkctl, * @nid is passed along to shrinkers with SHRINKER_NUMA_AWARE set, * unaware shrinkers will receive a node id of 0 instead. * + * @memcg specifies the memory cgroup to target. If it is not NULL, + * only shrinkers with SHRINKER_MEMCG_AWARE set will be called to scan + * objects from the memory cgroup specified. Otherwise all shrinkers + * are called, and memcg aware shrinkers are supposed to scan the + * global list then. + * * @nr_scanned and @nr_eligible form a ratio that indicate how much of * the available objects should be scanned. Page reclaim for example * passes the number of pages scanned and the number of pages on the @@ -362,13 +369,17 @@ static unsigned long shrink_slabs(struct shrink_control *shrinkctl, * * Returns the number of reclaimed slab objects. 
*/ -unsigned long shrink_node_slabs(gfp_t gfp_mask, int nid, - unsigned long nr_scanned, - unsigned long nr_eligible) +static unsigned long shrink_slab(gfp_t gfp_mask, int nid, + struct mem_cgroup *memcg, + unsigned long nr_scanned, + unsigned long nr_eligible) { struct shrinker *shrinker; unsigned long freed = 0; + if (memcg && !memcg_kmem_is_active(memcg)) + return 0; + if (nr_scanned == 0) nr_scanned = SWAP_CLUSTER_MAX; @@ -387,12 +398,16 @@ unsigned long shrink_node_slabs(gfp_t gfp_mask, int nid, struct shrink_control sc = { .gfp_mask = gfp_mask, .nid = nid, + .memcg = memcg, }; + if (memcg && !(shrinker->flags & SHRINKER_MEMCG_AWARE)) + continue; + if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) sc.nid = 0; - freed += shrink_slabs(&sc, shrinker, nr_scanned, nr_eligible); + freed += do_shrink_slab(&sc, shrinker, nr_scanned, nr_eligible); } up_read(&shrinker_rwsem); @@ -401,6 +416,29 @@ out: return freed; } +void drop_slab_node(int nid) +{ + unsigned long freed; + + do { + struct mem_cgroup *memcg = NULL; + + freed = 0; + do { + freed += shrink_slab(GFP_KERNEL, nid, memcg, + 1000, 1000); + } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL); + } while (freed > 10); +} + +void drop_slab(void) +{ + int nid; + + for_each_online_node(nid) + drop_slab_node(nid); +} + static inline int is_page_cache_freeable(struct page *page) { /* @@ -2269,6 +2307,7 @@ static inline bool should_continue_reclaim(struct zone *zone, static bool shrink_zone(struct zone *zone, struct scan_control *sc, bool is_classzone) { + struct reclaim_state *reclaim_state = current->reclaim_state; unsigned long nr_reclaimed, nr_scanned; bool reclaimable = false; @@ -2286,16 +2325,22 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc, memcg = mem_cgroup_iter(root, NULL, &reclaim); do { - unsigned long lru_pages; + unsigned long lru_pages, scanned; struct lruvec *lruvec; int swappiness; lruvec = mem_cgroup_zone_lruvec(zone, memcg); swappiness = mem_cgroup_swappiness(memcg); + scanned = sc->nr_scanned; shrink_lruvec(lruvec, swappiness, sc, &lru_pages); zone_lru_pages += lru_pages; + if (memcg && is_classzone) + shrink_slab(sc->gfp_mask, zone_to_nid(zone), + memcg, sc->nr_scanned - scanned, + lru_pages); + /* * Direct reclaim and kswapd have to scan all memory * cgroups to fulfill the overall scan target for the @@ -2318,19 +2363,14 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc, * Shrink the slab caches in the same proportion that * the eligible LRU pages were scanned. */ - if (global_reclaim(sc) && is_classzone) { - struct reclaim_state *reclaim_state; - - shrink_node_slabs(sc->gfp_mask, zone_to_nid(zone), - sc->nr_scanned - nr_scanned, - zone_lru_pages); - - reclaim_state = current->reclaim_state; - if (reclaim_state) { - sc->nr_reclaimed += - reclaim_state->reclaimed_slab; - reclaim_state->reclaimed_slab = 0; - } + if (global_reclaim(sc) && is_classzone) + shrink_slab(sc->gfp_mask, zone_to_nid(zone), NULL, + sc->nr_scanned - nr_scanned, + zone_lru_pages); + + if (reclaim_state) { + sc->nr_reclaimed += reclaim_state->reclaimed_slab; + reclaim_state->reclaimed_slab = 0; } vmpressure(sc->gfp_mask, sc->target_mem_cgroup, -- cgit v1.2.3 From 61d9b27420d09bfe2a686fb8d2e27474d17a1981 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Fri, 30 Jan 2015 13:11:42 +1100 Subject: memcg: rename some cache id related variables memcg_limited_groups_array_size, which defines the size of memcg_caches arrays, sounds rather cumbersome. 
Also it doesn't point anyhow that it's related to kmem/caches stuff. So let's rename it to memcg_nr_cache_ids. It's concise and points us directly to memcg_cache_id. Also, rename kmem_limited_groups to memcg_cache_ida. Signed-off-by: Vladimir Davydov Cc: Dave Chinner Cc: Johannes Weiner Cc: Michal Hocko Cc: Greg Thelen Cc: Glauber Costa Cc: Alexander Viro Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Tejun Heo Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 4 ++-- mm/memcontrol.c | 19 +++++++++---------- mm/slab_common.c | 4 ++-- 3 files changed, 13 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index d555d6533bd0..b27f183e65cd 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -360,7 +360,7 @@ static inline void sock_release_memcg(struct sock *sk) #ifdef CONFIG_MEMCG_KMEM extern struct static_key memcg_kmem_enabled_key; -extern int memcg_limited_groups_array_size; +extern int memcg_nr_cache_ids; /* * Helper macro to loop through all memcg-specific caches. Callers must still @@ -368,7 +368,7 @@ extern int memcg_limited_groups_array_size; * the slab_mutex must be held when looping through those caches */ #define for_each_memcg_cache_index(_idx) \ - for ((_idx) = 0; (_idx) < memcg_limited_groups_array_size; (_idx)++) + for ((_idx) = 0; (_idx) < memcg_nr_cache_ids; (_idx)++) static inline bool memcg_kmem_enabled(void) { diff --git a/mm/memcontrol.c b/mm/memcontrol.c index c63066d0ae12..3660186eb878 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -564,12 +564,11 @@ static void disarm_sock_keys(struct mem_cgroup *memcg) * memcgs, and none but the 200th is kmem-limited, we'd have to have a * 200 entry array for that. * - * The current size of the caches array is stored in - * memcg_limited_groups_array_size. It will double each time we have to - * increase it. + * The current size of the caches array is stored in memcg_nr_cache_ids. It + * will double each time we have to increase it. 
*/ -static DEFINE_IDA(kmem_limited_groups); -int memcg_limited_groups_array_size; +static DEFINE_IDA(memcg_cache_ida); +int memcg_nr_cache_ids; /* * MIN_SIZE is different than 1, because we would like to avoid going through @@ -2547,12 +2546,12 @@ static int memcg_alloc_cache_id(void) int id, size; int err; - id = ida_simple_get(&kmem_limited_groups, + id = ida_simple_get(&memcg_cache_ida, 0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL); if (id < 0) return id; - if (id < memcg_limited_groups_array_size) + if (id < memcg_nr_cache_ids) return id; /* @@ -2568,7 +2567,7 @@ static int memcg_alloc_cache_id(void) err = memcg_update_all_caches(size); if (err) { - ida_simple_remove(&kmem_limited_groups, id); + ida_simple_remove(&memcg_cache_ida, id); return err; } return id; @@ -2576,7 +2575,7 @@ static int memcg_alloc_cache_id(void) static void memcg_free_cache_id(int id) { - ida_simple_remove(&kmem_limited_groups, id); + ida_simple_remove(&memcg_cache_ida, id); } /* @@ -2586,7 +2585,7 @@ static void memcg_free_cache_id(int id) */ void memcg_update_array_size(int num) { - memcg_limited_groups_array_size = num; + memcg_nr_cache_ids = num; } struct memcg_kmem_cache_create_work { diff --git a/mm/slab_common.c b/mm/slab_common.c index 481cf81eadc3..d6cf88c2739f 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -116,7 +116,7 @@ static int memcg_alloc_cache_params(struct mem_cgroup *memcg, if (!memcg) { size = offsetof(struct memcg_cache_params, memcg_caches); - size += memcg_limited_groups_array_size * sizeof(void *); + size += memcg_nr_cache_ids * sizeof(void *); } else size = sizeof(struct memcg_cache_params); @@ -154,7 +154,7 @@ static int memcg_update_cache_params(struct kmem_cache *s, int num_memcgs) cur_params = s->memcg_params; memcpy(new_params->memcg_caches, cur_params->memcg_caches, - memcg_limited_groups_array_size * sizeof(void *)); + memcg_nr_cache_ids * sizeof(void *)); new_params->is_root_cache = true; -- cgit v1.2.3 From ee3d1b8b39c9897e0c07ba82468e618fa3bbc432 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Fri, 30 Jan 2015 13:11:42 +1100 Subject: memcg: add rwsem to synchronize against memcg_caches arrays relocation We need a stable value of memcg_nr_cache_ids in kmem_cache_create() (memcg_alloc_cache_params() wants it for root caches), where we only hold the slab_mutex and no memcg-related locks. As a result, we have to update memcg_nr_cache_ids under the slab_mutex, which we can only take on the slab's side (see memcg_update_array_size). This looks awkward and will become even worse when per-memcg list_lru is introduced, which also wants stable access to memcg_nr_cache_ids. To get rid of this dependency between the memcg_nr_cache_ids and the slab_mutex, this patch introduces a special rwsem. The rwsem is held for writing during memcg_caches arrays relocation and memcg_nr_cache_ids updates. Therefore one can take it for reading to get a stable access to memcg_caches arrays and/or memcg_nr_cache_ids. Currently the semaphore is taken for reading only from kmem_cache_create, right before taking the slab_mutex, so right now there's no much point in using rwsem instead of mutex. However, once list_lru is made per-memcg it will allow list_lru initializations to proceed concurrently. 
Signed-off-by: Vladimir Davydov Cc: Dave Chinner Cc: Johannes Weiner Cc: Michal Hocko Cc: Greg Thelen Cc: Glauber Costa Cc: Alexander Viro Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Tejun Heo Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 12 ++++++++++-- mm/memcontrol.c | 29 +++++++++++++++++++---------- mm/slab_common.c | 9 ++++----- 3 files changed, 33 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index b27f183e65cd..8dafad6bb248 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -361,6 +361,8 @@ static inline void sock_release_memcg(struct sock *sk) extern struct static_key memcg_kmem_enabled_key; extern int memcg_nr_cache_ids; +extern void memcg_get_cache_ids(void); +extern void memcg_put_cache_ids(void); /* * Helper macro to loop through all memcg-specific caches. Callers must still @@ -396,8 +398,6 @@ void __memcg_kmem_uncharge_pages(struct page *page, int order); int memcg_cache_id(struct mem_cgroup *memcg); -void memcg_update_array_size(int num_groups); - struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep); void __memcg_kmem_put_cache(struct kmem_cache *cachep); @@ -531,6 +531,14 @@ static inline int memcg_cache_id(struct mem_cgroup *memcg) return -1; } +static inline void memcg_get_cache_ids(void) +{ +} + +static inline void memcg_put_cache_ids(void) +{ +} + static inline struct kmem_cache * memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp) { diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 3660186eb878..6a4a9e3d5159 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -570,6 +570,19 @@ static void disarm_sock_keys(struct mem_cgroup *memcg) static DEFINE_IDA(memcg_cache_ida); int memcg_nr_cache_ids; +/* Protects memcg_nr_cache_ids */ +static DECLARE_RWSEM(memcg_cache_ids_sem); + +void memcg_get_cache_ids(void) +{ + down_read(&memcg_cache_ids_sem); +} + +void memcg_put_cache_ids(void) +{ + up_read(&memcg_cache_ids_sem); +} + /* * MIN_SIZE is different than 1, because we would like to avoid going through * the alloc/free process all the time. In a small machine, 4 kmem-limited @@ -2558,6 +2571,7 @@ static int memcg_alloc_cache_id(void) * There's no space for the new id in memcg_caches arrays, * so we have to grow them. */ + down_write(&memcg_cache_ids_sem); size = 2 * (id + 1); if (size < MEMCG_CACHES_MIN_SIZE) @@ -2566,6 +2580,11 @@ static int memcg_alloc_cache_id(void) size = MEMCG_CACHES_MAX_SIZE; err = memcg_update_all_caches(size); + if (!err) + memcg_nr_cache_ids = size; + + up_write(&memcg_cache_ids_sem); + if (err) { ida_simple_remove(&memcg_cache_ida, id); return err; @@ -2578,16 +2597,6 @@ static void memcg_free_cache_id(int id) ida_simple_remove(&memcg_cache_ida, id); } -/* - * We should update the current array size iff all caches updates succeed. This - * can only be done from the slab side. The slab mutex needs to be held when - * calling this. 
- */ -void memcg_update_array_size(int num) -{ - memcg_nr_cache_ids = num; -} - struct memcg_kmem_cache_create_work { struct mem_cgroup *memcg; struct kmem_cache *cachep; diff --git a/mm/slab_common.c b/mm/slab_common.c index d6cf88c2739f..42bb22cb4219 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -169,8 +169,8 @@ int memcg_update_all_caches(int num_memcgs) { struct kmem_cache *s; int ret = 0; - mutex_lock(&slab_mutex); + mutex_lock(&slab_mutex); list_for_each_entry(s, &slab_caches, list) { if (!is_root_cache(s)) continue; @@ -181,11 +181,8 @@ int memcg_update_all_caches(int num_memcgs) * up to this point in an updated state. */ if (ret) - goto out; + break; } - - memcg_update_array_size(num_memcgs); -out: mutex_unlock(&slab_mutex); return ret; } @@ -369,6 +366,7 @@ kmem_cache_create(const char *name, size_t size, size_t align, get_online_cpus(); get_online_mems(); + memcg_get_cache_ids(); mutex_lock(&slab_mutex); @@ -407,6 +405,7 @@ kmem_cache_create(const char *name, size_t size, size_t align, out_unlock: mutex_unlock(&slab_mutex); + memcg_put_cache_ids(); put_online_mems(); put_online_cpus(); -- cgit v1.2.3 From 1373c455ecb1e96eba50a0631cdb882624e6eaae Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Fri, 30 Jan 2015 13:11:42 +1100 Subject: list_lru: get rid of ->active_nodes The active_nodes mask allows us to skip empty nodes when walking over list_lru items from all nodes in list_lru_count/walk. However, these functions are never called from hot paths, so it doesn't seem we need such kind of optimization there. OTOH, removing the mask will make it easier to make list_lru per-memcg. Signed-off-by: Vladimir Davydov Cc: Dave Chinner Cc: Johannes Weiner Cc: Michal Hocko Cc: Greg Thelen Cc: Glauber Costa Cc: Alexander Viro Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Tejun Heo Signed-off-by: Andrew Morton --- include/linux/list_lru.h | 5 ++--- mm/list_lru.c | 10 +++------- 2 files changed, 5 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/linux/list_lru.h b/include/linux/list_lru.h index f500a2e39b13..53c1d6b78270 100644 --- a/include/linux/list_lru.h +++ b/include/linux/list_lru.h @@ -31,7 +31,6 @@ struct list_lru_node { struct list_lru { struct list_lru_node *node; - nodemask_t active_nodes; }; void list_lru_destroy(struct list_lru *lru); @@ -94,7 +93,7 @@ static inline unsigned long list_lru_count(struct list_lru *lru) long count = 0; int nid; - for_each_node_mask(nid, lru->active_nodes) + for_each_node_state(nid, N_NORMAL_MEMORY) count += list_lru_count_node(lru, nid); return count; @@ -142,7 +141,7 @@ list_lru_walk(struct list_lru *lru, list_lru_walk_cb isolate, long isolated = 0; int nid; - for_each_node_mask(nid, lru->active_nodes) { + for_each_node_state(nid, N_NORMAL_MEMORY) { isolated += list_lru_walk_node(lru, nid, isolate, cb_arg, &nr_to_walk); if (nr_to_walk <= 0) diff --git a/mm/list_lru.c b/mm/list_lru.c index f1a0db194173..07e198c77888 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -19,8 +19,7 @@ bool list_lru_add(struct list_lru *lru, struct list_head *item) WARN_ON_ONCE(nlru->nr_items < 0); if (list_empty(item)) { list_add_tail(item, &nlru->list); - if (nlru->nr_items++ == 0) - node_set(nid, lru->active_nodes); + nlru->nr_items++; spin_unlock(&nlru->lock); return true; } @@ -37,8 +36,7 @@ bool list_lru_del(struct list_lru *lru, struct list_head *item) spin_lock(&nlru->lock); if (!list_empty(item)) { list_del_init(item); - if (--nlru->nr_items == 0) - node_clear(nid, lru->active_nodes); + 
nlru->nr_items--; WARN_ON_ONCE(nlru->nr_items < 0); spin_unlock(&nlru->lock); return true; @@ -90,8 +88,7 @@ restart: case LRU_REMOVED_RETRY: assert_spin_locked(&nlru->lock); case LRU_REMOVED: - if (--nlru->nr_items == 0) - node_clear(nid, lru->active_nodes); + nlru->nr_items--; WARN_ON_ONCE(nlru->nr_items < 0); isolated++; /* @@ -133,7 +130,6 @@ int list_lru_init_key(struct list_lru *lru, struct lock_class_key *key) if (!lru->node) return -ENOMEM; - nodes_clear(lru->active_nodes); for (i = 0; i < nr_node_ids; i++) { spin_lock_init(&lru->node[i].lock); if (key) -- cgit v1.2.3 From 1f9e15ec904b11b885e2e38415a6b60bde6089e6 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Fri, 30 Jan 2015 13:11:43 +1100 Subject: list_lru: organize all list_lrus to list To make list_lru memcg aware, we need all list_lrus to be kept on a list protected by a mutex, so that we could sleep while walking over the list. Therefore after this change list_lru_destroy may sleep. Fortunately, there is only one user that calls it from an atomic context - it's put_super - and we can easily fix it by calling list_lru_destroy before put_super in destroy_locked_super - anyway we don't longer need lrus by that time. Another point that should be noted is that list_lru_destroy is allowed to be called on an uninitialized zeroed-out object, in which case it is a no-op. Before this patch this was guaranteed by kfree, but now we need an explicit check there. Signed-off-by: Vladimir Davydov Cc: Dave Chinner Cc: Johannes Weiner Cc: Michal Hocko Cc: Greg Thelen Cc: Glauber Costa Cc: Alexander Viro Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Tejun Heo Signed-off-by: Andrew Morton --- fs/super.c | 8 ++++++++ include/linux/list_lru.h | 3 +++ mm/list_lru.c | 34 ++++++++++++++++++++++++++++++++++ 3 files changed, 45 insertions(+) (limited to 'include') diff --git a/fs/super.c b/fs/super.c index a2b735a42e74..b027849d92d2 100644 --- a/fs/super.c +++ b/fs/super.c @@ -282,6 +282,14 @@ void deactivate_locked_super(struct super_block *s) unregister_shrinker(&s->s_shrink); fs->kill_sb(s); + /* + * Since list_lru_destroy() may sleep, we cannot call it from + * put_super(), where we hold the sb_lock. Therefore we destroy + * the lru lists right now. 
+ */ + list_lru_destroy(&s->s_dentry_lru); + list_lru_destroy(&s->s_inode_lru); + put_filesystem(fs); put_super(s); } else { diff --git a/include/linux/list_lru.h b/include/linux/list_lru.h index 53c1d6b78270..ee9486ac0621 100644 --- a/include/linux/list_lru.h +++ b/include/linux/list_lru.h @@ -31,6 +31,9 @@ struct list_lru_node { struct list_lru { struct list_lru_node *node; +#ifdef CONFIG_MEMCG_KMEM + struct list_head list; +#endif }; void list_lru_destroy(struct list_lru *lru); diff --git a/mm/list_lru.c b/mm/list_lru.c index 07e198c77888..a9021cb3ccde 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -9,6 +9,34 @@ #include #include #include +#include + +#ifdef CONFIG_MEMCG_KMEM +static LIST_HEAD(list_lrus); +static DEFINE_MUTEX(list_lrus_mutex); + +static void list_lru_register(struct list_lru *lru) +{ + mutex_lock(&list_lrus_mutex); + list_add(&lru->list, &list_lrus); + mutex_unlock(&list_lrus_mutex); +} + +static void list_lru_unregister(struct list_lru *lru) +{ + mutex_lock(&list_lrus_mutex); + list_del(&lru->list); + mutex_unlock(&list_lrus_mutex); +} +#else +static void list_lru_register(struct list_lru *lru) +{ +} + +static void list_lru_unregister(struct list_lru *lru) +{ +} +#endif /* CONFIG_MEMCG_KMEM */ bool list_lru_add(struct list_lru *lru, struct list_head *item) { @@ -137,12 +165,18 @@ int list_lru_init_key(struct list_lru *lru, struct lock_class_key *key) INIT_LIST_HEAD(&lru->node[i].list); lru->node[i].nr_items = 0; } + list_lru_register(lru); return 0; } EXPORT_SYMBOL_GPL(list_lru_init_key); void list_lru_destroy(struct list_lru *lru) { + /* Already destroyed or not yet initialized? */ + if (!lru->node) + return; + list_lru_unregister(lru); kfree(lru->node); + lru->node = NULL; } EXPORT_SYMBOL_GPL(list_lru_destroy); -- cgit v1.2.3 From 43750f791d99e9003cc9e2177694b518729f98eb Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Fri, 30 Jan 2015 13:11:43 +1100 Subject: list_lru: introduce per-memcg lists There are several FS shrinkers, including super_block::s_shrink, that keep reclaimable objects in the list_lru structure. Hence to turn them to memcg-aware shrinkers, it is enough to make list_lru per-memcg. This patch does the trick. It adds an array of lru lists to the list_lru_node structure (per-node part of the list_lru), one for each kmem-active memcg, and dispatches every item addition or removal to the list corresponding to the memcg which the item is accounted to. So now the list_lru structure is not just per node, but per node and per memcg. Not all list_lrus need this feature, so this patch also adds a new method, list_lru_init_memcg, which initializes a list_lru as memcg aware. Otherwise (i.e. if initialized with old list_lru_init), the list_lru won't have per memcg lists. Just like per memcg caches arrays, the arrays of per-memcg lists are indexed by memcg_cache_id, so we must grow them whenever memcg_nr_cache_ids is increased. So we introduce a callback, memcg_update_all_list_lrus, invoked by memcg_alloc_cache_id if the id space is full. The locking is implemented in a manner similar to lruvecs, i.e. we have one lock per node that protects all lists (both global and per cgroup) on the node. 
Signed-off-by: Vladimir Davydov Cc: Dave Chinner Cc: Johannes Weiner Cc: Michal Hocko Cc: Greg Thelen Cc: Glauber Costa Cc: Alexander Viro Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Tejun Heo Signed-off-by: Andrew Morton --- include/linux/list_lru.h | 52 +++++-- include/linux/memcontrol.h | 14 ++ mm/list_lru.c | 374 ++++++++++++++++++++++++++++++++++++++++++--- mm/memcontrol.c | 20 +++ 4 files changed, 424 insertions(+), 36 deletions(-) (limited to 'include') diff --git a/include/linux/list_lru.h b/include/linux/list_lru.h index ee9486ac0621..305b598abac2 100644 --- a/include/linux/list_lru.h +++ b/include/linux/list_lru.h @@ -11,6 +11,8 @@ #include #include +struct mem_cgroup; + /* list_lru_walk_cb has to always return one of those */ enum lru_status { LRU_REMOVED, /* item removed from list */ @@ -22,11 +24,26 @@ enum lru_status { internally, but has to return locked. */ }; -struct list_lru_node { - spinlock_t lock; +struct list_lru_one { struct list_head list; /* kept as signed so we can catch imbalance bugs */ long nr_items; +}; + +struct list_lru_memcg { + /* array of per cgroup lists, indexed by memcg_cache_id */ + struct list_lru_one *lru[0]; +}; + +struct list_lru_node { + /* protects all lists on the node, including per cgroup */ + spinlock_t lock; + /* global list, used for the root cgroup in cgroup aware lrus */ + struct list_lru_one lru; +#ifdef CONFIG_MEMCG_KMEM + /* for cgroup aware lrus points to per cgroup lists, otherwise NULL */ + struct list_lru_memcg *memcg_lrus; +#endif } ____cacheline_aligned_in_smp; struct list_lru { @@ -37,11 +54,14 @@ struct list_lru { }; void list_lru_destroy(struct list_lru *lru); -int list_lru_init_key(struct list_lru *lru, struct lock_class_key *key); -static inline int list_lru_init(struct list_lru *lru) -{ - return list_lru_init_key(lru, NULL); -} +int __list_lru_init(struct list_lru *lru, bool memcg_aware, + struct lock_class_key *key); + +#define list_lru_init(lru) __list_lru_init((lru), false, NULL) +#define list_lru_init_key(lru, key) __list_lru_init((lru), false, (key)) +#define list_lru_init_memcg(lru) __list_lru_init((lru), true, NULL) + +int memcg_update_all_list_lrus(int num_memcgs); /** * list_lru_add: add an element to the lru list's tail @@ -75,20 +95,23 @@ bool list_lru_add(struct list_lru *lru, struct list_head *item); bool list_lru_del(struct list_lru *lru, struct list_head *item); /** - * list_lru_count_node: return the number of objects currently held by @lru + * list_lru_count_one: return the number of objects currently held by @lru * @lru: the lru pointer. * @nid: the node id to count from. + * @memcg: the cgroup to count from. * * Always return a non-negative number, 0 for empty lists. There is no * guarantee that the list is not updated while the count is being computed. * Callers that want such a guarantee need to provide an outer lock. 
*/ +unsigned long list_lru_count_one(struct list_lru *lru, + int nid, struct mem_cgroup *memcg); unsigned long list_lru_count_node(struct list_lru *lru, int nid); static inline unsigned long list_lru_shrink_count(struct list_lru *lru, struct shrink_control *sc) { - return list_lru_count_node(lru, sc->nid); + return list_lru_count_one(lru, sc->nid, sc->memcg); } static inline unsigned long list_lru_count(struct list_lru *lru) @@ -105,9 +128,10 @@ static inline unsigned long list_lru_count(struct list_lru *lru) typedef enum lru_status (*list_lru_walk_cb)(struct list_head *item, spinlock_t *lock, void *cb_arg); /** - * list_lru_walk_node: walk a list_lru, isolating and disposing freeable items. + * list_lru_walk_one: walk a list_lru, isolating and disposing freeable items. * @lru: the lru pointer. * @nid: the node id to scan from. + * @memcg: the cgroup to scan from. * @isolate: callback function that is resposible for deciding what to do with * the item currently being scanned * @cb_arg: opaque type that will be passed to @isolate @@ -125,6 +149,10 @@ typedef enum lru_status * * Return value: the number of objects effectively removed from the LRU. */ +unsigned long list_lru_walk_one(struct list_lru *lru, + int nid, struct mem_cgroup *memcg, + list_lru_walk_cb isolate, void *cb_arg, + unsigned long *nr_to_walk); unsigned long list_lru_walk_node(struct list_lru *lru, int nid, list_lru_walk_cb isolate, void *cb_arg, unsigned long *nr_to_walk); @@ -133,8 +161,8 @@ static inline unsigned long list_lru_shrink_walk(struct list_lru *lru, struct shrink_control *sc, list_lru_walk_cb isolate, void *cb_arg) { - return list_lru_walk_node(lru, sc->nid, isolate, cb_arg, - &sc->nr_to_scan); + return list_lru_walk_one(lru, sc->nid, sc->memcg, isolate, cb_arg, + &sc->nr_to_scan); } static inline unsigned long diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 8dafad6bb248..22bb13afa399 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -401,6 +401,8 @@ int memcg_cache_id(struct mem_cgroup *memcg); struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep); void __memcg_kmem_put_cache(struct kmem_cache *cachep); +struct mem_cgroup *__mem_cgroup_from_kmem(void *ptr); + int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, unsigned long nr_pages); void memcg_uncharge_kmem(struct mem_cgroup *memcg, unsigned long nr_pages); @@ -497,6 +499,13 @@ static __always_inline void memcg_kmem_put_cache(struct kmem_cache *cachep) if (memcg_kmem_enabled()) __memcg_kmem_put_cache(cachep); } + +static __always_inline struct mem_cgroup *mem_cgroup_from_kmem(void *ptr) +{ + if (!memcg_kmem_enabled()) + return NULL; + return __mem_cgroup_from_kmem(ptr); +} #else #define for_each_memcg_cache_index(_idx) \ for (; NULL; ) @@ -548,6 +557,11 @@ memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp) static inline void memcg_kmem_put_cache(struct kmem_cache *cachep) { } + +static inline struct mem_cgroup *mem_cgroup_from_kmem(void *ptr) +{ + return NULL; +} #endif /* CONFIG_MEMCG_KMEM */ #endif /* _LINUX_MEMCONTROL_H */ diff --git a/mm/list_lru.c b/mm/list_lru.c index a9021cb3ccde..79aee70c3b9d 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -10,6 +10,7 @@ #include #include #include +#include #ifdef CONFIG_MEMCG_KMEM static LIST_HEAD(list_lrus); @@ -38,16 +39,71 @@ static void list_lru_unregister(struct list_lru *lru) } #endif /* CONFIG_MEMCG_KMEM */ +#ifdef CONFIG_MEMCG_KMEM +static inline bool list_lru_memcg_aware(struct list_lru *lru) +{ + return 
!!lru->node[0].memcg_lrus; +} + +static inline struct list_lru_one * +list_lru_from_memcg_idx(struct list_lru_node *nlru, int idx) +{ + /* + * The lock protects the array of per cgroup lists from relocation + * (see memcg_update_list_lru_node). + */ + lockdep_assert_held(&nlru->lock); + if (nlru->memcg_lrus && idx >= 0) + return nlru->memcg_lrus->lru[idx]; + + return &nlru->lru; +} + +static inline struct list_lru_one * +list_lru_from_kmem(struct list_lru_node *nlru, void *ptr) +{ + struct mem_cgroup *memcg; + + if (!nlru->memcg_lrus) + return &nlru->lru; + + memcg = mem_cgroup_from_kmem(ptr); + if (!memcg) + return &nlru->lru; + + return list_lru_from_memcg_idx(nlru, memcg_cache_id(memcg)); +} +#else +static inline bool list_lru_memcg_aware(struct list_lru *lru) +{ + return false; +} + +static inline struct list_lru_one * +list_lru_from_memcg_idx(struct list_lru_node *nlru, int idx) +{ + return &nlru->lru; +} + +static inline struct list_lru_one * +list_lru_from_kmem(struct list_lru_node *nlru, void *ptr) +{ + return &nlru->lru; +} +#endif /* CONFIG_MEMCG_KMEM */ + bool list_lru_add(struct list_lru *lru, struct list_head *item) { int nid = page_to_nid(virt_to_page(item)); struct list_lru_node *nlru = &lru->node[nid]; + struct list_lru_one *l; spin_lock(&nlru->lock); - WARN_ON_ONCE(nlru->nr_items < 0); + l = list_lru_from_kmem(nlru, item); + WARN_ON_ONCE(l->nr_items < 0); if (list_empty(item)) { - list_add_tail(item, &nlru->list); - nlru->nr_items++; + list_add_tail(item, &l->list); + l->nr_items++; spin_unlock(&nlru->lock); return true; } @@ -60,12 +116,14 @@ bool list_lru_del(struct list_lru *lru, struct list_head *item) { int nid = page_to_nid(virt_to_page(item)); struct list_lru_node *nlru = &lru->node[nid]; + struct list_lru_one *l; spin_lock(&nlru->lock); + l = list_lru_from_kmem(nlru, item); if (!list_empty(item)) { list_del_init(item); - nlru->nr_items--; - WARN_ON_ONCE(nlru->nr_items < 0); + l->nr_items--; + WARN_ON_ONCE(l->nr_items < 0); spin_unlock(&nlru->lock); return true; } @@ -74,33 +132,58 @@ bool list_lru_del(struct list_lru *lru, struct list_head *item) } EXPORT_SYMBOL_GPL(list_lru_del); -unsigned long -list_lru_count_node(struct list_lru *lru, int nid) +static unsigned long __list_lru_count_one(struct list_lru *lru, + int nid, int memcg_idx) { - unsigned long count = 0; struct list_lru_node *nlru = &lru->node[nid]; + struct list_lru_one *l; + unsigned long count; spin_lock(&nlru->lock); - WARN_ON_ONCE(nlru->nr_items < 0); - count += nlru->nr_items; + l = list_lru_from_memcg_idx(nlru, memcg_idx); + WARN_ON_ONCE(l->nr_items < 0); + count = l->nr_items; spin_unlock(&nlru->lock); return count; } + +unsigned long list_lru_count_one(struct list_lru *lru, + int nid, struct mem_cgroup *memcg) +{ + return __list_lru_count_one(lru, nid, memcg_cache_id(memcg)); +} +EXPORT_SYMBOL_GPL(list_lru_count_one); + +unsigned long list_lru_count_node(struct list_lru *lru, int nid) +{ + long count = 0; + int memcg_idx; + + count += __list_lru_count_one(lru, nid, -1); + if (list_lru_memcg_aware(lru)) { + for_each_memcg_cache_index(memcg_idx) + count += __list_lru_count_one(lru, nid, memcg_idx); + } + return count; +} EXPORT_SYMBOL_GPL(list_lru_count_node); -unsigned long -list_lru_walk_node(struct list_lru *lru, int nid, list_lru_walk_cb isolate, - void *cb_arg, unsigned long *nr_to_walk) +static unsigned long +__list_lru_walk_one(struct list_lru *lru, int nid, int memcg_idx, + list_lru_walk_cb isolate, void *cb_arg, + unsigned long *nr_to_walk) { - struct list_lru_node *nlru = &lru->node[nid]; 
+ struct list_lru_node *nlru = &lru->node[nid]; + struct list_lru_one *l; struct list_head *item, *n; unsigned long isolated = 0; spin_lock(&nlru->lock); + l = list_lru_from_memcg_idx(nlru, memcg_idx); restart: - list_for_each_safe(item, n, &nlru->list) { + list_for_each_safe(item, n, &l->list) { enum lru_status ret; /* @@ -116,8 +199,8 @@ restart: case LRU_REMOVED_RETRY: assert_spin_locked(&nlru->lock); case LRU_REMOVED: - nlru->nr_items--; - WARN_ON_ONCE(nlru->nr_items < 0); + l->nr_items--; + WARN_ON_ONCE(l->nr_items < 0); isolated++; /* * If the lru lock has been dropped, our list @@ -128,7 +211,7 @@ restart: goto restart; break; case LRU_ROTATE: - list_move_tail(item, &nlru->list); + list_move_tail(item, &l->list); break; case LRU_SKIP: break; @@ -147,36 +230,279 @@ restart: spin_unlock(&nlru->lock); return isolated; } + +unsigned long +list_lru_walk_one(struct list_lru *lru, int nid, struct mem_cgroup *memcg, + list_lru_walk_cb isolate, void *cb_arg, + unsigned long *nr_to_walk) +{ + return __list_lru_walk_one(lru, nid, memcg_cache_id(memcg), + isolate, cb_arg, nr_to_walk); +} +EXPORT_SYMBOL_GPL(list_lru_walk_one); + +unsigned long list_lru_walk_node(struct list_lru *lru, int nid, + list_lru_walk_cb isolate, void *cb_arg, + unsigned long *nr_to_walk) +{ + long isolated = 0; + int memcg_idx; + + isolated += __list_lru_walk_one(lru, nid, -1, isolate, cb_arg, + nr_to_walk); + if (*nr_to_walk > 0 && list_lru_memcg_aware(lru)) { + for_each_memcg_cache_index(memcg_idx) { + isolated += __list_lru_walk_one(lru, nid, memcg_idx, + isolate, cb_arg, nr_to_walk); + if (*nr_to_walk <= 0) + break; + } + } + return isolated; +} EXPORT_SYMBOL_GPL(list_lru_walk_node); -int list_lru_init_key(struct list_lru *lru, struct lock_class_key *key) +static void init_one_lru(struct list_lru_one *l) +{ + INIT_LIST_HEAD(&l->list); + l->nr_items = 0; +} + +#ifdef CONFIG_MEMCG_KMEM +static void __memcg_destroy_list_lru_node(struct list_lru_memcg *memcg_lrus, + int begin, int end) +{ + int i; + + for (i = begin; i < end; i++) + kfree(memcg_lrus->lru[i]); +} + +static int __memcg_init_list_lru_node(struct list_lru_memcg *memcg_lrus, + int begin, int end) +{ + int i; + + for (i = begin; i < end; i++) { + struct list_lru_one *l; + + l = kmalloc(sizeof(struct list_lru_one), GFP_KERNEL); + if (!l) + goto fail; + + init_one_lru(l); + memcg_lrus->lru[i] = l; + } + return 0; +fail: + __memcg_destroy_list_lru_node(memcg_lrus, begin, i - 1); + return -ENOMEM; +} + +static int memcg_init_list_lru_node(struct list_lru_node *nlru) +{ + int size = memcg_nr_cache_ids; + + nlru->memcg_lrus = kmalloc(size * sizeof(void *), GFP_KERNEL); + if (!nlru->memcg_lrus) + return -ENOMEM; + + if (__memcg_init_list_lru_node(nlru->memcg_lrus, 0, size)) { + kfree(nlru->memcg_lrus); + return -ENOMEM; + } + + return 0; +} + +static void memcg_destroy_list_lru_node(struct list_lru_node *nlru) +{ + __memcg_destroy_list_lru_node(nlru->memcg_lrus, 0, memcg_nr_cache_ids); + kfree(nlru->memcg_lrus); +} + +static int memcg_update_list_lru_node(struct list_lru_node *nlru, + int old_size, int new_size) +{ + struct list_lru_memcg *old, *new; + + BUG_ON(old_size > new_size); + + old = nlru->memcg_lrus; + new = kmalloc(new_size * sizeof(void *), GFP_KERNEL); + if (!new) + return -ENOMEM; + + if (__memcg_init_list_lru_node(new, old_size, new_size)) { + kfree(new); + return -ENOMEM; + } + + memcpy(new, old, old_size * sizeof(void *)); + + /* + * The lock guarantees that we won't race with a reader + * (see list_lru_from_memcg_idx). 
+ * + * Since list_lru_{add,del} may be called under an IRQ-safe lock, + * we have to use IRQ-safe primitives here to avoid deadlock. + */ + spin_lock_irq(&nlru->lock); + nlru->memcg_lrus = new; + spin_unlock_irq(&nlru->lock); + + kfree(old); + return 0; +} + +static void memcg_cancel_update_list_lru_node(struct list_lru_node *nlru, + int old_size, int new_size) +{ + /* do not bother shrinking the array back to the old size, because we + * cannot handle allocation failures here */ + __memcg_destroy_list_lru_node(nlru->memcg_lrus, old_size, new_size); +} + +static int memcg_init_list_lru(struct list_lru *lru, bool memcg_aware) +{ + int i; + + for (i = 0; i < nr_node_ids; i++) { + if (!memcg_aware) + lru->node[i].memcg_lrus = NULL; + else if (memcg_init_list_lru_node(&lru->node[i])) + goto fail; + } + return 0; +fail: + for (i = i - 1; i >= 0; i--) + memcg_destroy_list_lru_node(&lru->node[i]); + return -ENOMEM; +} + +static void memcg_destroy_list_lru(struct list_lru *lru) +{ + int i; + + if (!list_lru_memcg_aware(lru)) + return; + + for (i = 0; i < nr_node_ids; i++) + memcg_destroy_list_lru_node(&lru->node[i]); +} + +static int memcg_update_list_lru(struct list_lru *lru, + int old_size, int new_size) +{ + int i; + + if (!list_lru_memcg_aware(lru)) + return 0; + + for (i = 0; i < nr_node_ids; i++) { + if (memcg_update_list_lru_node(&lru->node[i], + old_size, new_size)) + goto fail; + } + return 0; +fail: + for (i = i - 1; i >= 0; i--) + memcg_cancel_update_list_lru_node(&lru->node[i], + old_size, new_size); + return -ENOMEM; +} + +static void memcg_cancel_update_list_lru(struct list_lru *lru, + int old_size, int new_size) +{ + int i; + + if (!list_lru_memcg_aware(lru)) + return; + + for (i = 0; i < nr_node_ids; i++) + memcg_cancel_update_list_lru_node(&lru->node[i], + old_size, new_size); +} + +int memcg_update_all_list_lrus(int new_size) +{ + int ret = 0; + struct list_lru *lru; + int old_size = memcg_nr_cache_ids; + + mutex_lock(&list_lrus_mutex); + list_for_each_entry(lru, &list_lrus, list) { + ret = memcg_update_list_lru(lru, old_size, new_size); + if (ret) + goto fail; + } +out: + mutex_unlock(&list_lrus_mutex); + return ret; +fail: + list_for_each_entry_continue_reverse(lru, &list_lrus, list) + memcg_cancel_update_list_lru(lru, old_size, new_size); + goto out; +} +#else +static int memcg_init_list_lru(struct list_lru *lru, bool memcg_aware) +{ + return 0; +} + +static void memcg_destroy_list_lru(struct list_lru *lru) +{ +} +#endif /* CONFIG_MEMCG_KMEM */ + +int __list_lru_init(struct list_lru *lru, bool memcg_aware, + struct lock_class_key *key) { int i; size_t size = sizeof(*lru->node) * nr_node_ids; + int err = -ENOMEM; + + memcg_get_cache_ids(); lru->node = kzalloc(size, GFP_KERNEL); if (!lru->node) - return -ENOMEM; + goto out; for (i = 0; i < nr_node_ids; i++) { spin_lock_init(&lru->node[i].lock); if (key) lockdep_set_class(&lru->node[i].lock, key); - INIT_LIST_HEAD(&lru->node[i].list); - lru->node[i].nr_items = 0; + init_one_lru(&lru->node[i].lru); + } + + err = memcg_init_list_lru(lru, memcg_aware); + if (err) { + kfree(lru->node); + goto out; } + list_lru_register(lru); - return 0; +out: + memcg_put_cache_ids(); + return err; } -EXPORT_SYMBOL_GPL(list_lru_init_key); +EXPORT_SYMBOL_GPL(__list_lru_init); void list_lru_destroy(struct list_lru *lru) { /* Already destroyed or not yet initialized? 
*/ if (!lru->node) return; + + memcg_get_cache_ids(); + list_lru_unregister(lru); + + memcg_destroy_list_lru(lru); kfree(lru->node); lru->node = NULL; + + memcg_put_cache_ids(); } EXPORT_SYMBOL_GPL(list_lru_destroy); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 6a4a9e3d5159..f3fdbbb92a31 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2580,6 +2580,8 @@ static int memcg_alloc_cache_id(void) size = MEMCG_CACHES_MAX_SIZE; err = memcg_update_all_caches(size); + if (!err) + err = memcg_update_all_list_lrus(size); if (!err) memcg_nr_cache_ids = size; @@ -2774,6 +2776,24 @@ void __memcg_kmem_uncharge_pages(struct page *page, int order) memcg_uncharge_kmem(memcg, 1 << order); page->mem_cgroup = NULL; } + +struct mem_cgroup *__mem_cgroup_from_kmem(void *ptr) +{ + struct mem_cgroup *memcg = NULL; + struct kmem_cache *cachep; + struct page *page; + + page = virt_to_head_page(ptr); + if (PageSlab(page)) { + cachep = page->slab_cache; + if (!is_root_cache(cachep)) + memcg = cachep->memcg_params->memcg; + } else + /* page allocated by alloc_kmem_pages */ + memcg = page->mem_cgroup; + + return memcg; +} #endif /* CONFIG_MEMCG_KMEM */ #ifdef CONFIG_TRANSPARENT_HUGEPAGE -- cgit v1.2.3 From a2f5e6e536fe625f97849804c7fad09bd9f8bbba Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Fri, 30 Jan 2015 13:11:44 +1100 Subject: vmscan: force scan offline memory cgroups Since commit b2052564e66d ("mm: memcontrol: continue cache reclaim from offlined groups") pages charged to a memory cgroup are not reparented when the cgroup is removed. Instead, they are supposed to be reclaimed in a regular way, along with pages accounted to online memory cgroups. However, an lruvec of an offline memory cgroup will sooner or later get so small that it will be scanned only at low scan priorities (see get_scan_count()). Therefore, if there are enough reclaimable pages in big lruvecs, pages accounted to offline memory cgroups will never be scanned at all, wasting memory. Fix this by unconditionally forcing scanning dead lruvecs from kswapd. Signed-off-by: Vladimir Davydov Acked-by: Michal Hocko Acked-by: Johannes Weiner Cc: Tejun Heo Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 6 ++++++ mm/memcontrol.c | 14 ++++++++++++++ mm/vmscan.c | 8 ++++++-- 3 files changed, 26 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 22bb13afa399..b73ba82cfd64 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -102,6 +102,7 @@ void mem_cgroup_iter_break(struct mem_cgroup *, struct mem_cgroup *); * For memory reclaim. 
*/ int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec); +bool mem_cgroup_lruvec_online(struct lruvec *lruvec); int mem_cgroup_select_victim_node(struct mem_cgroup *memcg); unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list); void mem_cgroup_update_lru_size(struct lruvec *, enum lru_list, int); @@ -266,6 +267,11 @@ mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec) return 1; } +bool mem_cgroup_lruvec_online(struct lruvec *lruvec) +{ + return true; +} + static inline unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru) { diff --git a/mm/memcontrol.c b/mm/memcontrol.c index f3fdbbb92a31..ae48c9fa4636 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1379,6 +1379,20 @@ int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec) return inactive * inactive_ratio < active; } +bool mem_cgroup_lruvec_online(struct lruvec *lruvec) +{ + struct mem_cgroup_per_zone *mz; + struct mem_cgroup *memcg; + + if (mem_cgroup_disabled()) + return true; + + mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec); + memcg = mz->memcg; + + return !!(memcg->css.flags & CSS_ONLINE); +} + #define mem_cgroup_from_counter(counter, member) \ container_of(counter, struct mem_cgroup, member) diff --git a/mm/vmscan.c b/mm/vmscan.c index 1525d24811d8..b89097185f46 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1941,8 +1941,12 @@ static void get_scan_count(struct lruvec *lruvec, int swappiness, * latencies, so it's better to scan a minimum amount there as * well. */ - if (current_is_kswapd() && !zone_reclaimable(zone)) - force_scan = true; + if (current_is_kswapd()) { + if (!zone_reclaimable(zone)) + force_scan = true; + if (!mem_cgroup_lruvec_online(lruvec)) + force_scan = true; + } if (!global_reclaim(sc)) force_scan = true; -- cgit v1.2.3 From ea62744f0e76f23387d83b33fe471d571b1f861a Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 30 Jan 2015 13:11:45 +1100 Subject: vmscan-force-scan-offline-memory-cgroups-fix thou shalt memorize Documentation/SubmitChecklist Cc: Johannes Weiner Cc: Michal Hocko Cc: Tejun Heo Cc: Vladimir Davydov Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index b73ba82cfd64..76f489fad640 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -267,7 +267,7 @@ mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec) return 1; } -bool mem_cgroup_lruvec_online(struct lruvec *lruvec) +static inline bool mem_cgroup_lruvec_online(struct lruvec *lruvec) { return true; } -- cgit v1.2.3 From 8cd02fa863a619d9fe6fb851595620e2c8286cbd Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Fri, 30 Jan 2015 13:11:46 +1100 Subject: mm: page_counter: pull "-1" handling out of page_counter_memparse() The unified hierarchy interface for memory cgroups will no longer use "-1" to mean maximum possible resource value. In preparation for this, make the string an argument and let the caller supply it. 
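Purely as an illustration of the new calling convention (a sketch, not part of the patch): the "-1" token is what the converted legacy files below keep passing, while "infinity" is the token the follow-up default-hierarchy patch supplies; buf stands for the user-supplied string from the write handler.

	unsigned long nr_pages;
	int err;

	/* Legacy cgroup interface: "-1" still means "no limit". */
	err = page_counter_memparse(buf, "-1", &nr_pages);

	/* Unified hierarchy: the same parser accepts "infinity" instead. */
	err = page_counter_memparse(buf, "infinity", &nr_pages);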
Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Cc: Vladimir Davydov Cc: Greg Thelen Signed-off-by: Andrew Morton --- include/linux/page_counter.h | 3 ++- mm/hugetlb_cgroup.c | 2 +- mm/memcontrol.c | 4 ++-- mm/page_counter.c | 7 ++++--- net/ipv4/tcp_memcontrol.c | 2 +- 5 files changed, 10 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/page_counter.h b/include/linux/page_counter.h index 955421575d16..17fa4f8de3a6 100644 --- a/include/linux/page_counter.h +++ b/include/linux/page_counter.h @@ -41,7 +41,8 @@ int page_counter_try_charge(struct page_counter *counter, struct page_counter **fail); void page_counter_uncharge(struct page_counter *counter, unsigned long nr_pages); int page_counter_limit(struct page_counter *counter, unsigned long limit); -int page_counter_memparse(const char *buf, unsigned long *nr_pages); +int page_counter_memparse(const char *buf, const char *max, + unsigned long *nr_pages); static inline void page_counter_reset_watermark(struct page_counter *counter) { diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index 037e1c00a5b7..6e0057439a46 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c @@ -279,7 +279,7 @@ static ssize_t hugetlb_cgroup_write(struct kernfs_open_file *of, return -EINVAL; buf = strstrip(buf); - ret = page_counter_memparse(buf, &nr_pages); + ret = page_counter_memparse(buf, "-1", &nr_pages); if (ret) return ret; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 12202c8e0f95..707ae130928b 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3442,7 +3442,7 @@ static ssize_t mem_cgroup_write(struct kernfs_open_file *of, int ret; buf = strstrip(buf); - ret = page_counter_memparse(buf, &nr_pages); + ret = page_counter_memparse(buf, "-1", &nr_pages); if (ret) return ret; @@ -3814,7 +3814,7 @@ static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg, unsigned long usage; int i, size, ret; - ret = page_counter_memparse(args, &threshold); + ret = page_counter_memparse(args, "-1", &threshold); if (ret) return ret; diff --git a/mm/page_counter.c b/mm/page_counter.c index a009574fbba9..11b4beda14ba 100644 --- a/mm/page_counter.c +++ b/mm/page_counter.c @@ -166,18 +166,19 @@ int page_counter_limit(struct page_counter *counter, unsigned long limit) /** * page_counter_memparse - memparse() for page counter limits * @buf: string to parse + * @max: string meaning maximum possible value * @nr_pages: returns the result in number of pages * * Returns -EINVAL, or 0 and @nr_pages on success. @nr_pages will be * limited to %PAGE_COUNTER_MAX. 
*/ -int page_counter_memparse(const char *buf, unsigned long *nr_pages) +int page_counter_memparse(const char *buf, const char *max, + unsigned long *nr_pages) { - char unlimited[] = "-1"; char *end; u64 bytes; - if (!strncmp(buf, unlimited, sizeof(unlimited))) { + if (!strcmp(buf, max)) { *nr_pages = PAGE_COUNTER_MAX; return 0; } diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c index 272327134a1b..c2a75c6957a1 100644 --- a/net/ipv4/tcp_memcontrol.c +++ b/net/ipv4/tcp_memcontrol.c @@ -120,7 +120,7 @@ static ssize_t tcp_cgroup_write(struct kernfs_open_file *of, switch (of_cft(of)->private) { case RES_LIMIT: /* see memcontrol.c */ - ret = page_counter_memparse(buf, &nr_pages); + ret = page_counter_memparse(buf, "-1", &nr_pages); if (ret) break; mutex_lock(&tcp_limit_mutex); -- cgit v1.2.3 From f67ef343500f30f3b1d47a10baf09b95a497f51f Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Fri, 30 Jan 2015 13:11:46 +1100 Subject: mm: memcontrol: default hierarchy interface for memory Introduce the basic control files to account, partition, and limit memory using cgroups in default hierarchy mode. This interface versioning allows us to address fundamental design issues in the existing memory cgroup interface, further explained below. The old interface will be maintained indefinitely, but a clearer model and improved workload performance should encourage existing users to switch over to the new one eventually. The control files are thus: - memory.current shows the current consumption of the cgroup and its descendants, in bytes. - memory.low configures the lower end of the cgroup's expected memory consumption range. The kernel considers memory below that boundary to be a reserve - the minimum that the workload needs in order to make forward progress - and generally avoids reclaiming it, unless there is an imminent risk of entering an OOM situation. - memory.high configures the upper end of the cgroup's expected memory consumption range. A cgroup whose consumption grows beyond this threshold is forced into direct reclaim, to work off the excess and to throttle new allocations heavily, but is generally allowed to continue and the OOM killer is not invoked. - memory.max configures the hard maximum amount of memory that the cgroup is allowed to consume before the OOM killer is invoked. - memory.events shows event counters that indicate how often the cgroup was reclaimed while below memory.low, how often it was forced to reclaim excess beyond memory.high, how often it hit memory.max, and how often it entered OOM due to memory.max. This allows users to identify configuration problems when observing a degradation in workload performance. An overcommitted system will have an increased rate of low boundary breaches, whereas increased rates of high limit breaches, maximum hits, or even OOM situations will indicate internally overcommitted cgroups. For existing users of memory cgroups, the following deviations from the current interface are worth pointing out and explaining: - The original lower boundary, the soft limit, is defined as a limit that is per default unset. As a result, the set of cgroups that global reclaim prefers is opt-in, rather than opt-out. The costs for optimizing these mostly negative lookups are so high that the implementation, despite its enormous size, does not even provide the basic desirable behavior. First off, the soft limit has no hierarchical meaning. 
All configured groups are organized in a global rbtree and treated like equal peers, regardless where they are located in the hierarchy. This makes subtree delegation impossible. Second, the soft limit reclaim pass is so aggressive that it not just introduces high allocation latencies into the system, but also impacts system performance due to overreclaim, to the point where the feature becomes self-defeating. The memory.low boundary on the other hand is a top-down allocated reserve. A cgroup enjoys reclaim protection when it and all its ancestors are below their low boundaries, which makes delegation of subtrees possible. Secondly, new cgroups have no reserve per default and in the common case most cgroups are eligible for the preferred reclaim pass. This allows the new low boundary to be efficiently implemented with just a minor addition to the generic reclaim code, without the need for out-of-band data structures and reclaim passes. Because the generic reclaim code considers all cgroups except for the ones running low in the preferred first reclaim pass, overreclaim of individual groups is eliminated as well, resulting in much better overall workload performance. - The original high boundary, the hard limit, is defined as a strict limit that can not budge, even if the OOM killer has to be called. But this generally goes against the goal of making the most out of the available memory. The memory consumption of workloads varies during runtime, and that requires users to overcommit. But doing that with a strict upper limit requires either a fairly accurate prediction of the working set size or adding slack to the limit. Since working set size estimation is hard and error prone, and getting it wrong results in OOM kills, most users tend to err on the side of a looser limit and end up wasting precious resources. The memory.high boundary on the other hand can be set much more conservatively. When hit, it throttles allocations by forcing them into direct reclaim to work off the excess, but it never invokes the OOM killer. As a result, a high boundary that is chosen too aggressively will not terminate the processes, but instead it will lead to gradual performance degradation. The user can monitor this and make corrections until the minimal memory footprint that still gives acceptable performance is found. In extreme cases, with many concurrent allocations and a complete breakdown of reclaim progress within the group, the high boundary can be exceeded. But even then it's mostly better to satisfy the allocation from the slack available in other groups or the rest of the system than killing the group. Otherwise, memory.max is there to limit this type of spillover and ultimately contain buggy or even malicious applications. - The original control file names are unwieldy and inconsistent in many different ways. For example, the upper boundary hit count is exported in the memory.failcnt file, but an OOM event count has to be manually counted by listening to memory.oom_control events, and lower boundary / soft limit events have to be counted by first setting a threshold for that value and then counting those events. Also, usage and limit files encode their units in the filename. That makes the filenames very long, even though this is not information that a user needs to be reminded of every time they type out those names. 
To address these naming issues, as well as to signal clearly that the new interface carries a new configuration model, the naming conventions in it necessarily differ from the old interface. - The original limit files indicate the state of an unset limit with a very high number, and a configured limit can be unset by echoing -1 into those files. But that very high number is implementation and architecture dependent and not very descriptive. And while -1 can be understood as an underflow into the highest possible value, -2 or -10M etc. do not work, so it's not inconsistent. memory.low, memory.high, and memory.max will use the string "infinity" to indicate and set the highest possible value. [akpm@linux-foundation.org: use seq_puts() for basic strings] Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Cc: Vladimir Davydov Cc: Greg Thelen Signed-off-by: Andrew Morton --- Documentation/cgroups/unified-hierarchy.txt | 79 ++++++++++ include/linux/memcontrol.h | 32 ++++ mm/memcontrol.c | 229 ++++++++++++++++++++++++++-- mm/vmscan.c | 22 ++- 4 files changed, 348 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/Documentation/cgroups/unified-hierarchy.txt b/Documentation/cgroups/unified-hierarchy.txt index 4f4563277864..71daa35ec2d9 100644 --- a/Documentation/cgroups/unified-hierarchy.txt +++ b/Documentation/cgroups/unified-hierarchy.txt @@ -327,6 +327,85 @@ supported and the interface files "release_agent" and - use_hierarchy is on by default and the cgroup file for the flag is not created. +- The original lower boundary, the soft limit, is defined as a limit + that is per default unset. As a result, the set of cgroups that + global reclaim prefers is opt-in, rather than opt-out. The costs + for optimizing these mostly negative lookups are so high that the + implementation, despite its enormous size, does not even provide the + basic desirable behavior. First off, the soft limit has no + hierarchical meaning. All configured groups are organized in a + global rbtree and treated like equal peers, regardless where they + are located in the hierarchy. This makes subtree delegation + impossible. Second, the soft limit reclaim pass is so aggressive + that it not just introduces high allocation latencies into the + system, but also impacts system performance due to overreclaim, to + the point where the feature becomes self-defeating. + + The memory.low boundary on the other hand is a top-down allocated + reserve. A cgroup enjoys reclaim protection when it and all its + ancestors are below their low boundaries, which makes delegation of + subtrees possible. Secondly, new cgroups have no reserve per + default and in the common case most cgroups are eligible for the + preferred reclaim pass. This allows the new low boundary to be + efficiently implemented with just a minor addition to the generic + reclaim code, without the need for out-of-band data structures and + reclaim passes. Because the generic reclaim code considers all + cgroups except for the ones running low in the preferred first + reclaim pass, overreclaim of individual groups is eliminated as + well, resulting in much better overall workload performance. + +- The original high boundary, the hard limit, is defined as a strict + limit that can not budge, even if the OOM killer has to be called. + But this generally goes against the goal of making the most out of + the available memory. The memory consumption of workloads varies + during runtime, and that requires users to overcommit. 
But doing + that with a strict upper limit requires either a fairly accurate + prediction of the working set size or adding slack to the limit. + Since working set size estimation is hard and error prone, and + getting it wrong results in OOM kills, most users tend to err on the + side of a looser limit and end up wasting precious resources. + + The memory.high boundary on the other hand can be set much more + conservatively. When hit, it throttles allocations by forcing them + into direct reclaim to work off the excess, but it never invokes the + OOM killer. As a result, a high boundary that is chosen too + aggressively will not terminate the processes, but instead it will + lead to gradual performance degradation. The user can monitor this + and make corrections until the minimal memory footprint that still + gives acceptable performance is found. + + In extreme cases, with many concurrent allocations and a complete + breakdown of reclaim progress within the group, the high boundary + can be exceeded. But even then it's mostly better to satisfy the + allocation from the slack available in other groups or the rest of + the system than killing the group. Otherwise, memory.max is there + to limit this type of spillover and ultimately contain buggy or even + malicious applications. + +- The original control file names are unwieldy and inconsistent in + many different ways. For example, the upper boundary hit count is + exported in the memory.failcnt file, but an OOM event count has to + be manually counted by listening to memory.oom_control events, and + lower boundary / soft limit events have to be counted by first + setting a threshold for that value and then counting those events. + Also, usage and limit files encode their units in the filename. + That makes the filenames very long, even though this is not + information that a user needs to be reminded of every time they type + out those names. + + To address these naming issues, as well as to signal clearly that + the new interface carries a new configuration model, the naming + conventions in it necessarily differ from the old interface. + +- The original limit files indicate the state of an unset limit with a + Very High Number, and a configured limit can be unset by echoing -1 + into those files. But that very high number is implementation and + architecture dependent and not very descriptive. And while -1 can + be understood as an underflow into the highest possible value, -2 or + -10M etc. do not work, so it's not consistent. + + memory.low, memory.high, and memory.max will use the string + "infinity" to indicate and set the highest possible value. 5. 
Planned Changes diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 76f489fad640..72dff5fb0d0c 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -52,7 +52,27 @@ struct mem_cgroup_reclaim_cookie { unsigned int generation; }; +enum mem_cgroup_events_index { + MEM_CGROUP_EVENTS_PGPGIN, /* # of pages paged in */ + MEM_CGROUP_EVENTS_PGPGOUT, /* # of pages paged out */ + MEM_CGROUP_EVENTS_PGFAULT, /* # of page-faults */ + MEM_CGROUP_EVENTS_PGMAJFAULT, /* # of major page-faults */ + MEM_CGROUP_EVENTS_NSTATS, + /* default hierarchy events */ + MEMCG_LOW = MEM_CGROUP_EVENTS_NSTATS, + MEMCG_HIGH, + MEMCG_MAX, + MEMCG_OOM, + MEMCG_NR_EVENTS, +}; + #ifdef CONFIG_MEMCG +void mem_cgroup_events(struct mem_cgroup *memcg, + enum mem_cgroup_events_index idx, + unsigned int nr); + +bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg); + int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask, struct mem_cgroup **memcgp); void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg, @@ -175,6 +195,18 @@ void mem_cgroup_split_huge_fixup(struct page *head); #else /* CONFIG_MEMCG */ struct mem_cgroup; +static inline void mem_cgroup_events(struct mem_cgroup *memcg, + enum mem_cgroup_events_index idx, + unsigned int nr) +{ +} + +static inline bool mem_cgroup_low(struct mem_cgroup *root, + struct mem_cgroup *memcg) +{ + return false; +} + static inline int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask, struct mem_cgroup **memcgp) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 707ae130928b..d17f390ee106 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -97,14 +97,6 @@ static const char * const mem_cgroup_stat_names[] = { "swap", }; -enum mem_cgroup_events_index { - MEM_CGROUP_EVENTS_PGPGIN, /* # of pages paged in */ - MEM_CGROUP_EVENTS_PGPGOUT, /* # of pages paged out */ - MEM_CGROUP_EVENTS_PGFAULT, /* # of page-faults */ - MEM_CGROUP_EVENTS_PGMAJFAULT, /* # of major page-faults */ - MEM_CGROUP_EVENTS_NSTATS, -}; - static const char * const mem_cgroup_events_names[] = { "pgpgin", "pgpgout", @@ -138,7 +130,7 @@ enum mem_cgroup_events_target { struct mem_cgroup_stat_cpu { long count[MEM_CGROUP_STAT_NSTATS]; - unsigned long events[MEM_CGROUP_EVENTS_NSTATS]; + unsigned long events[MEMCG_NR_EVENTS]; unsigned long nr_page_events; unsigned long targets[MEM_CGROUP_NTARGETS]; }; @@ -284,6 +276,10 @@ struct mem_cgroup { struct page_counter memsw; struct page_counter kmem; + /* Normal memory consumption range */ + unsigned long low; + unsigned long high; + unsigned long soft_limit; /* vmpressure notifications */ @@ -2327,6 +2323,8 @@ retry: if (!(gfp_mask & __GFP_WAIT)) goto nomem; + mem_cgroup_events(mem_over_limit, MEMCG_MAX, 1); + nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages, gfp_mask, may_swap); @@ -2368,6 +2366,8 @@ retry: if (fatal_signal_pending(current)) goto bypass; + mem_cgroup_events(mem_over_limit, MEMCG_OOM, 1); + mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(nr_pages)); nomem: if (!(gfp_mask & __GFP_NOFAIL)) @@ -2379,6 +2379,16 @@ done_restock: css_get_many(&memcg->css, batch); if (batch > nr_pages) refill_stock(memcg, batch - nr_pages); + /* + * If the hierarchy is above the normal consumption range, + * make the charging task trim their excess contribution. 
+ */ + do { + if (page_counter_read(&memcg->memory) <= memcg->high) + continue; + mem_cgroup_events(memcg, MEMCG_HIGH, 1); + try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, true); + } while ((memcg = parent_mem_cgroup(memcg))); done: return ret; } @@ -4304,7 +4314,7 @@ out_kfree: return ret; } -static struct cftype mem_cgroup_files[] = { +static struct cftype mem_cgroup_legacy_files[] = { { .name = "usage_in_bytes", .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), @@ -4580,6 +4590,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) if (parent_css == NULL) { root_mem_cgroup = memcg; page_counter_init(&memcg->memory, NULL); + memcg->high = PAGE_COUNTER_MAX; memcg->soft_limit = PAGE_COUNTER_MAX; page_counter_init(&memcg->memsw, NULL); page_counter_init(&memcg->kmem, NULL); @@ -4625,6 +4636,7 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css) if (parent->use_hierarchy) { page_counter_init(&memcg->memory, &parent->memory); + memcg->high = PAGE_COUNTER_MAX; memcg->soft_limit = PAGE_COUNTER_MAX; page_counter_init(&memcg->memsw, &parent->memsw); page_counter_init(&memcg->kmem, &parent->kmem); @@ -4635,6 +4647,7 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css) */ } else { page_counter_init(&memcg->memory, NULL); + memcg->high = PAGE_COUNTER_MAX; memcg->soft_limit = PAGE_COUNTER_MAX; page_counter_init(&memcg->memsw, NULL); page_counter_init(&memcg->kmem, NULL); @@ -4710,6 +4723,8 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css) mem_cgroup_resize_limit(memcg, PAGE_COUNTER_MAX); mem_cgroup_resize_memsw_limit(memcg, PAGE_COUNTER_MAX); memcg_update_kmem_limit(memcg, PAGE_COUNTER_MAX); + memcg->low = 0; + memcg->high = PAGE_COUNTER_MAX; memcg->soft_limit = PAGE_COUNTER_MAX; } @@ -5295,6 +5310,147 @@ static void mem_cgroup_bind(struct cgroup_subsys_state *root_css) mem_cgroup_from_css(root_css)->use_hierarchy = true; } +static u64 memory_current_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + return mem_cgroup_usage(mem_cgroup_from_css(css), false); +} + +static int memory_low_show(struct seq_file *m, void *v) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); + unsigned long low = ACCESS_ONCE(memcg->low); + + if (low == PAGE_COUNTER_MAX) + seq_puts(m, "infinity\n"); + else + seq_printf(m, "%llu\n", (u64)low * PAGE_SIZE); + + return 0; +} + +static ssize_t memory_low_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); + unsigned long low; + int err; + + buf = strstrip(buf); + err = page_counter_memparse(buf, "infinity", &low); + if (err) + return err; + + memcg->low = low; + + return nbytes; +} + +static int memory_high_show(struct seq_file *m, void *v) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); + unsigned long high = ACCESS_ONCE(memcg->high); + + if (high == PAGE_COUNTER_MAX) + seq_puts(m, "infinity\n"); + else + seq_printf(m, "%llu\n", (u64)high * PAGE_SIZE); + + return 0; +} + +static ssize_t memory_high_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); + unsigned long high; + int err; + + buf = strstrip(buf); + err = page_counter_memparse(buf, "infinity", &high); + if (err) + return err; + + memcg->high = high; + + return nbytes; +} + +static int memory_max_show(struct seq_file *m, void *v) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); + unsigned long max = ACCESS_ONCE(memcg->memory.limit); + + if (max 
== PAGE_COUNTER_MAX) + seq_puts(m, "infinity\n"); + else + seq_printf(m, "%llu\n", (u64)max * PAGE_SIZE); + + return 0; +} + +static ssize_t memory_max_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); + unsigned long max; + int err; + + buf = strstrip(buf); + err = page_counter_memparse(buf, "infinity", &max); + if (err) + return err; + + err = mem_cgroup_resize_limit(memcg, max); + if (err) + return err; + + return nbytes; +} + +static int memory_events_show(struct seq_file *m, void *v) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); + + seq_printf(m, "low %lu\n", mem_cgroup_read_events(memcg, MEMCG_LOW)); + seq_printf(m, "high %lu\n", mem_cgroup_read_events(memcg, MEMCG_HIGH)); + seq_printf(m, "max %lu\n", mem_cgroup_read_events(memcg, MEMCG_MAX)); + seq_printf(m, "oom %lu\n", mem_cgroup_read_events(memcg, MEMCG_OOM)); + + return 0; +} + +static struct cftype memory_files[] = { + { + .name = "current", + .read_u64 = memory_current_read, + }, + { + .name = "low", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = memory_low_show, + .write = memory_low_write, + }, + { + .name = "high", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = memory_high_show, + .write = memory_high_write, + }, + { + .name = "max", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = memory_max_show, + .write = memory_max_write, + }, + { + .name = "events", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = memory_events_show, + }, + { } /* terminate */ +}; + struct cgroup_subsys memory_cgrp_subsys = { .css_alloc = mem_cgroup_css_alloc, .css_online = mem_cgroup_css_online, @@ -5305,7 +5461,8 @@ struct cgroup_subsys memory_cgrp_subsys = { .cancel_attach = mem_cgroup_cancel_attach, .attach = mem_cgroup_move_task, .bind = mem_cgroup_bind, - .legacy_cftypes = mem_cgroup_files, + .dfl_cftypes = memory_files, + .legacy_cftypes = mem_cgroup_legacy_files, .early_init = 0, }; @@ -5340,6 +5497,56 @@ static void __init enable_swap_cgroup(void) } #endif +/** + * mem_cgroup_events - count memory events against a cgroup + * @memcg: the memory cgroup + * @idx: the event index + * @nr: the number of events to account for + */ +void mem_cgroup_events(struct mem_cgroup *memcg, + enum mem_cgroup_events_index idx, + unsigned int nr) +{ + this_cpu_add(memcg->stat->events[idx], nr); +} + +/** + * mem_cgroup_low - check if memory consumption is below the normal range + * @root: the highest ancestor to consider + * @memcg: the memory cgroup to check + * + * Returns %true if memory consumption of @memcg, and that of all + * configurable ancestors up to @root, is below the normal range. + */ +bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg) +{ + if (mem_cgroup_disabled()) + return false; + + /* + * The toplevel group doesn't have a configurable range, so + * it's never low when looked at directly, and it is not + * considered an ancestor when assessing the hierarchy. 
+ */ + + if (memcg == root_mem_cgroup) + return false; + + if (page_counter_read(&memcg->memory) > memcg->low) + return false; + + while (memcg != root) { + memcg = parent_mem_cgroup(memcg); + + if (memcg == root_mem_cgroup) + break; + + if (page_counter_read(&memcg->memory) > memcg->low) + return false; + } + return true; +} + #ifdef CONFIG_MEMCG_SWAP /** * mem_cgroup_swapout - transfer a memsw charge to swap diff --git a/mm/vmscan.c b/mm/vmscan.c index b89097185f46..f62ec654d4c5 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -91,6 +91,9 @@ struct scan_control { /* Can pages be swapped as part of reclaim? */ unsigned int may_swap:1; + /* Can cgroups be reclaimed below their normal consumption range? */ + unsigned int may_thrash:1; + unsigned int hibernation_mode:1; /* One of the zones is ready for compaction */ @@ -2333,6 +2336,12 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc, struct lruvec *lruvec; int swappiness; + if (mem_cgroup_low(root, memcg)) { + if (!sc->may_thrash) + continue; + mem_cgroup_events(memcg, MEMCG_LOW, 1); + } + lruvec = mem_cgroup_zone_lruvec(zone, memcg); swappiness = mem_cgroup_swappiness(memcg); scanned = sc->nr_scanned; @@ -2360,8 +2369,7 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc, mem_cgroup_iter_break(root, memcg); break; } - memcg = mem_cgroup_iter(root, memcg, &reclaim); - } while (memcg); + } while ((memcg = mem_cgroup_iter(root, memcg, &reclaim))); /* * Shrink the slab caches in the same proportion that @@ -2559,10 +2567,11 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) static unsigned long do_try_to_free_pages(struct zonelist *zonelist, struct scan_control *sc) { + int initial_priority = sc->priority; unsigned long total_scanned = 0; unsigned long writeback_threshold; bool zones_reclaimable; - +retry: delayacct_freepages_start(); if (global_reclaim(sc)) @@ -2612,6 +2621,13 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, if (sc->compaction_ready) return 1; + /* Untapped cgroup reserves? Don't OOM, retry. */ + if (!sc->may_thrash) { + sc->priority = initial_priority; + sc->may_thrash = 1; + goto retry; + } + /* Any of the zones still reclaimable? Don't OOM. */ if (zones_reclaimable) return 1; -- cgit v1.2.3 From 5440afa5595db53d18abb1d21d78e7e2a9fc1c0a Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 30 Jan 2015 13:11:47 +1100 Subject: oom: add helpers for setting and clearing TIF_MEMDIE This patchset addresses a race which was described in the changelog for 5695be142e20 ("OOM, PM: OOM killed task shouldn't escape PM suspend"): : PM freezer relies on having all tasks frozen by the time devices are : getting frozen so that no task will touch them while they are getting : frozen. But OOM killer is allowed to kill an already frozen task in order : to handle OOM situtation. In order to protect from late wake ups OOM : killer is disabled after all tasks are frozen. This, however, still keeps : a window open when a killed task didn't manage to die by the time : freeze_processes finishes. The original patch hasn't closed the race window completely because that would require a more complex solution as it can be seen by this patchset. The primary motivation was to close the race condition between OOM killer and PM freezer _completely_. As Tejun pointed out, even though the race condition is unlikely the harder it would be to debug weird bugs deep in the PM freezer when the debugging options are reduced considerably. 
I can only speculate what might happen when a task is still runnable unexpectedly. On a plus side and as a side effect the oom enable/disable has a better (full barrier) semantic without polluting hot paths. I have tested the series in KVM with 100M RAM: - many small tasks (20M anon mmap) which are triggering OOM continually - s2ram which resumes automatically is triggered in a loop echo processors > /sys/power/pm_test while true do echo mem > /sys/power/state sleep 1s done - simple module which allocates and frees 20M in 8K chunks. If it sees freezing(current) then it tries another round of allocation before calling try_to_freeze - debugging messages of PM stages and OOM killer enable/disable/fail added and unmark_oom_victim is delayed by 1s after it clears TIF_MEMDIE and before it wakes up waiters. - rebased on top of the current mmotm which means some necessary updates in mm/oom_kill.c. mark_tsk_oom_victim is now called under task_lock but I think this should be OK because __thaw_task shouldn't interfere with any locking down wake_up_process. Oleg? As expected there are no OOM killed tasks after oom is disabled and allocations requested by the kernel thread are failing after all the tasks are frozen and OOM disabled. I wasn't able to catch a race where oom_killer_disable would really have to wait but I kinda expected the race is really unlikely. [ 242.609330] Killed process 2992 (mem_eater) total-vm:24412kB, anon-rss:2164kB, file-rss:4kB [ 243.628071] Unmarking 2992 OOM victim. oom_victims: 1 [ 243.636072] (elapsed 2.837 seconds) done. [ 243.641985] Trying to disable OOM killer [ 243.643032] Waiting for concurent OOM victims [ 243.644342] OOM killer disabled [ 243.645447] Freezing remaining freezable tasks ... (elapsed 0.005 seconds) done. [ 243.652983] Suspending console(s) (use no_console_suspend to debug) [ 243.903299] kmem_eater: page allocation failure: order:1, mode:0x204010 [...] [ 243.992600] PM: suspend of devices complete after 336.667 msecs [ 243.993264] PM: late suspend of devices complete after 0.660 msecs [ 243.994713] PM: noirq suspend of devices complete after 1.446 msecs [ 243.994717] ACPI: Preparing to enter system sleep state S3 [ 243.994795] PM: Saving platform NVS memory [ 243.994796] Disabling non-boot CPUs ... The first 2 patches are simple cleanups for OOM. They should go in regardless the rest IMO. Patches 3 and 4 are trivial printk -> pr_info conversion and they should go in ditto. The main patch is the last one and I would appreciate acks from Tejun and Rafael. I think the OOM part should be OK (except for __thaw_task vs. task_lock where a look from Oleg would appreciated) but I am not so sure I haven't screwed anything in the freezer code. I have found several surprises there. This patch (of 5): This patch is just a preparatory and it doesn't introduce any functional change. Note: I am utterly unhappy about lowmemory killer abusing TIF_MEMDIE just to wait for the oom victim and to prevent from new killing. This is just a side effect of the flag. The primary meaning is to give the oom victim access to the memory reserves and that shouldn't be necessary here. Signed-off-by: Michal Hocko Cc: Tejun Heo Cc: David Rientjes Cc: Johannes Weiner Cc: Oleg Nesterov Cc: Cong Wang Cc: "Rafael J. 
Wysocki" Signed-off-by: Andrew Morton --- drivers/staging/android/lowmemorykiller.c | 7 ++++++- include/linux/oom.h | 4 ++++ kernel/exit.c | 2 +- mm/memcontrol.c | 2 +- mm/oom_kill.c | 23 ++++++++++++++++++++--- 5 files changed, 32 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/drivers/staging/android/lowmemorykiller.c b/drivers/staging/android/lowmemorykiller.c index b545d3d1da3e..feafa172b155 100644 --- a/drivers/staging/android/lowmemorykiller.c +++ b/drivers/staging/android/lowmemorykiller.c @@ -160,7 +160,12 @@ static unsigned long lowmem_scan(struct shrinker *s, struct shrink_control *sc) selected->pid, selected->comm, selected_oom_score_adj, selected_tasksize); lowmem_deathpending_timeout = jiffies + HZ; - set_tsk_thread_flag(selected, TIF_MEMDIE); + /* + * FIXME: lowmemorykiller shouldn't abuse global OOM killer + * infrastructure. There is no real reason why the selected + * task should have access to the memory reserves. + */ + mark_tsk_oom_victim(selected); send_sig(SIGKILL, selected, 0); rem += selected_tasksize; } diff --git a/include/linux/oom.h b/include/linux/oom.h index 76200984d1e2..b42b80f88c3a 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -47,6 +47,10 @@ static inline bool oom_task_origin(const struct task_struct *p) return !!(p->signal->oom_flags & OOM_FLAG_ORIGIN); } +extern void mark_tsk_oom_victim(struct task_struct *tsk); + +extern void unmark_oom_victim(void); + extern unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, const nodemask_t *nodemask, unsigned long totalpages); diff --git a/kernel/exit.c b/kernel/exit.c index 6806c55475ee..02b3d1ab2ec0 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -435,7 +435,7 @@ static void exit_mm(struct task_struct *tsk) task_unlock(tsk); mm_update_next_owner(mm); mmput(mm); - clear_thread_flag(TIF_MEMDIE); + unmark_oom_victim(); } static struct task_struct *find_alive_thread(struct task_struct *p) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 58091af65f0f..5e548bd85613 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1568,7 +1568,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, * quickly exit and free its memory. */ if (fatal_signal_pending(current) || task_will_free_mem(current)) { - set_thread_flag(TIF_MEMDIE); + mark_tsk_oom_victim(current); return; } diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 294493a7ae4b..80b34e285f96 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -416,6 +416,23 @@ void note_oom_kill(void) atomic_inc(&oom_kills); } +/** + * mark_tsk_oom_victim - marks the given taks as OOM victim. + * @tsk: task to mark + */ +void mark_tsk_oom_victim(struct task_struct *tsk) +{ + set_tsk_thread_flag(tsk, TIF_MEMDIE); +} + +/** + * unmark_oom_victim - unmarks the current task as OOM victim. 
+ */ +void unmark_oom_victim(void) +{ + clear_thread_flag(TIF_MEMDIE); +} + #define K(x) ((x) << (PAGE_SHIFT-10)) /* * Must be called while holding a reference to p, which will be released upon @@ -440,7 +457,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, */ task_lock(p); if (p->mm && task_will_free_mem(p)) { - set_tsk_thread_flag(p, TIF_MEMDIE); + mark_tsk_oom_victim(p); task_unlock(p); put_task_struct(p); return; @@ -495,7 +512,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, /* mm cannot safely be dereferenced after task_unlock(victim) */ mm = victim->mm; - set_tsk_thread_flag(victim, TIF_MEMDIE); + mark_tsk_oom_victim(victim); pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB\n", task_pid_nr(victim), victim->comm, K(victim->mm->total_vm), K(get_mm_counter(victim->mm, MM_ANONPAGES)), @@ -652,7 +669,7 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, */ if (current->mm && (fatal_signal_pending(current) || task_will_free_mem(current))) { - set_thread_flag(TIF_MEMDIE); + mark_tsk_oom_victim(current); return; } -- cgit v1.2.3 From 74f94e731db04a57020cd23773092231a6760d22 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 30 Jan 2015 13:11:48 +1100 Subject: oom, PM: make OOM detection in the freezer path raceless 5695be142e20 ("OOM, PM: OOM killed task shouldn't escape PM suspend") has left a race window when OOM killer manages to note_oom_kill after freeze_processes checks the counter. The race window is quite small and really unlikely and partial solution deemed sufficient at the time of submission. Tejun wasn't happy about this partial solution though and insisted on a full solution. That requires the full OOM and freezer's task freezing exclusion, though. This is done by this patch which introduces oom_sem RW lock and turns oom_killer_disable() into a full OOM barrier. oom_killer_disabled check is moved from the allocation path to the OOM level and we take oom_sem for reading for both the check and the whole OOM invocation. oom_killer_disable() takes oom_sem for writing so it waits for all currently running OOM killer invocations. Then it disable all the further OOMs by setting oom_killer_disabled and checks for any oom victims. Victims are counted via mark_tsk_oom_victim resp. unmark_oom_victim. The last victim wakes up all waiters enqueued by oom_killer_disable(). Therefore this function acts as the full OOM barrier. The page fault path is covered now as well although it was assumed to be safe before. As per Tejun, "We used to have freezing points deep in file system code which may be reacheable from page fault." so it would be better and more robust to not rely on freezing points here. Same applies to the memcg OOM killer. out_of_memory tells the caller whether the OOM was allowed to trigger and the callers are supposed to handle the situation. The page allocation path simply fails the allocation same as before. The page fault path will retry the fault (more on that later) and Sysrq OOM trigger will simply complain to the log. Normally there wouldn't be any unfrozen user tasks after try_to_freeze_tasks so the function will not block. But if there was an OOM killer racing with try_to_freeze_tasks and the OOM victim didn't finish yet then we have to wait for it. 
This should complete in a finite time, though, because - the victim cannot loop in the page fault handler (it would die on the way out from the exception) - it cannot loop in the page allocator because all the further allocation would fail and __GFP_NOFAIL allocations are not acceptable at this stage - it shouldn't be blocked on any locks held by frozen tasks (try_to_freeze expects lockless context) and kernel threads and work queues are not frozen yet Signed-off-by: Michal Hocko Suggested-by: Tejun Heo Cc: David Rientjes Cc: Johannes Weiner Cc: Oleg Nesterov Cc: Cong Wang Cc: "Rafael J. Wysocki" Signed-off-by: Andrew Morton --- drivers/tty/sysrq.c | 5 +- include/linux/oom.h | 14 ++---- kernel/exit.c | 3 +- kernel/power/process.c | 50 ++++--------------- mm/memcontrol.c | 2 +- mm/oom_kill.c | 132 +++++++++++++++++++++++++++++++++++++++++-------- mm/page_alloc.c | 17 +------ 7 files changed, 132 insertions(+), 91 deletions(-) (limited to 'include') diff --git a/drivers/tty/sysrq.c b/drivers/tty/sysrq.c index 0071469ecbf1..259a4d5a4e8f 100644 --- a/drivers/tty/sysrq.c +++ b/drivers/tty/sysrq.c @@ -355,8 +355,9 @@ static struct sysrq_key_op sysrq_term_op = { static void moom_callback(struct work_struct *ignored) { - out_of_memory(node_zonelist(first_memory_node, GFP_KERNEL), GFP_KERNEL, - 0, NULL, true); + if (!out_of_memory(node_zonelist(first_memory_node, GFP_KERNEL), + GFP_KERNEL, 0, NULL, true)) + pr_info("OOM request ignored because killer is disabled\n"); } static DECLARE_WORK(moom_work, moom_callback); diff --git a/include/linux/oom.h b/include/linux/oom.h index b42b80f88c3a..d5771bed59c9 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -72,22 +72,14 @@ extern enum oom_scan_t oom_scan_process_thread(struct task_struct *task, unsigned long totalpages, const nodemask_t *nodemask, bool force_kill); -extern void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, +extern bool out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order, nodemask_t *mask, bool force_kill); extern int register_oom_notifier(struct notifier_block *nb); extern int unregister_oom_notifier(struct notifier_block *nb); extern bool oom_killer_disabled; - -static inline void oom_killer_disable(void) -{ - oom_killer_disabled = true; -} - -static inline void oom_killer_enable(void) -{ - oom_killer_disabled = false; -} +extern bool oom_killer_disable(void); +extern void oom_killer_enable(void); extern struct task_struct *find_lock_task_mm(struct task_struct *p); diff --git a/kernel/exit.c b/kernel/exit.c index 02b3d1ab2ec0..feff10bbb307 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -435,7 +435,8 @@ static void exit_mm(struct task_struct *tsk) task_unlock(tsk); mm_update_next_owner(mm); mmput(mm); - unmark_oom_victim(); + if (test_thread_flag(TIF_MEMDIE)) + unmark_oom_victim(); } static struct task_struct *find_alive_thread(struct task_struct *p) diff --git a/kernel/power/process.c b/kernel/power/process.c index 3ac45f192e9f..564f786df470 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -108,30 +108,6 @@ static int try_to_freeze_tasks(bool user_only) return todo ? 
-EBUSY : 0; } -static bool __check_frozen_processes(void) -{ - struct task_struct *g, *p; - - for_each_process_thread(g, p) - if (p != current && !freezer_should_skip(p) && !frozen(p)) - return false; - - return true; -} - -/* - * Returns true if all freezable tasks (except for current) are frozen already - */ -static bool check_frozen_processes(void) -{ - bool ret; - - read_lock(&tasklist_lock); - ret = __check_frozen_processes(); - read_unlock(&tasklist_lock); - return ret; -} - /** * freeze_processes - Signal user space processes to enter the refrigerator. * The current thread will not be frozen. The same process that calls @@ -142,7 +118,6 @@ static bool check_frozen_processes(void) int freeze_processes(void) { int error; - int oom_kills_saved; error = __usermodehelper_disable(UMH_FREEZING); if (error) @@ -157,29 +132,22 @@ int freeze_processes(void) pm_wakeup_clear(); pr_info("Freezing user space processes ... "); pm_freezing = true; - oom_kills_saved = oom_kills_count(); error = try_to_freeze_tasks(true); if (!error) { __usermodehelper_set_disable_depth(UMH_DISABLED); - oom_killer_disable(); - - /* - * There might have been an OOM kill while we were - * freezing tasks and the killed task might be still - * on the way out so we have to double check for race. - */ - if (oom_kills_count() != oom_kills_saved && - !check_frozen_processes()) { - __usermodehelper_set_disable_depth(UMH_ENABLED); - pr_cont("OOM in progress."); - error = -EBUSY; - } else { - pr_cont("done."); - } + pr_cont("done."); } pr_cont("\n"); BUG_ON(in_atomic()); + /* + * Now that the whole userspace is frozen we need to disbale + * the OOM killer to disallow any further interference with + * killable tasks. + */ + if (!error && !oom_killer_disable()) + error = -EBUSY; + if (error) thaw_processes(); return error; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 5e548bd85613..efe5f8e2bbd9 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1942,7 +1942,7 @@ bool mem_cgroup_oom_synchronize(bool handle) if (!memcg) return false; - if (!handle) + if (!handle || oom_killer_disabled) goto cleanup; owait.memcg = memcg; diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 3cbd76b8c13b..b8df76ee2be3 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -398,30 +398,27 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order, } /* - * Number of OOM killer invocations (including memcg OOM killer). - * Primarily used by PM freezer to check for potential races with - * OOM killed frozen task. + * Number of OOM victims in flight */ -static atomic_t oom_kills = ATOMIC_INIT(0); +static atomic_t oom_victims = ATOMIC_INIT(0); +static DECLARE_WAIT_QUEUE_HEAD(oom_victims_wait); -int oom_kills_count(void) -{ - return atomic_read(&oom_kills); -} - -void note_oom_kill(void) -{ - atomic_inc(&oom_kills); -} +bool oom_killer_disabled __read_mostly; +static DECLARE_RWSEM(oom_sem); /** * mark_tsk_oom_victim - marks the given taks as OOM victim. * @tsk: task to mark + * + * Has to be called with oom_sem taken for read and never after + * oom has been disabled already. 
*/ void mark_tsk_oom_victim(struct task_struct *tsk) { - set_tsk_thread_flag(tsk, TIF_MEMDIE); - + WARN_ON(oom_killer_disabled); + /* OOM killer might race with memcg OOM */ + if (test_and_set_tsk_thread_flag(tsk, TIF_MEMDIE)) + return; /* * Make sure that the task is woken up from uninterruptible sleep * if it is frozen because OOM killer wouldn't be able to free @@ -429,14 +426,70 @@ void mark_tsk_oom_victim(struct task_struct *tsk) * that TIF_MEMDIE tasks should be ignored. */ __thaw_task(tsk); + atomic_inc(&oom_victims); } /** * unmark_oom_victim - unmarks the current task as OOM victim. + * + * Wakes up all waiters in oom_killer_disable() */ void unmark_oom_victim(void) { - clear_thread_flag(TIF_MEMDIE); + if (!test_and_clear_thread_flag(TIF_MEMDIE)) + return; + + down_read(&oom_sem); + /* + * There is no need to signal the lasst oom_victim if there + * is nobody who cares. + */ + if (!atomic_dec_return(&oom_victims) && oom_killer_disabled) + wake_up_all(&oom_victims_wait); + up_read(&oom_sem); +} + +/** + * oom_killer_disable - disable OOM killer + * + * Forces all page allocations to fail rather than trigger OOM killer. + * Will block and wait until all OOM victims are killed. + * + * The function cannot be called when there are runnable user tasks because + * the userspace would see unexpected allocation failures as a result. Any + * new usage of this function should be consulted with MM people. + * + * Returns true if successful and false if the OOM killer cannot be + * disabled. + */ +bool oom_killer_disable(void) +{ + /* + * Make sure to not race with an ongoing OOM killer + * and that the current is not the victim. + */ + down_write(&oom_sem); + if (test_thread_flag(TIF_MEMDIE)) { + up_write(&oom_sem); + return false; + } + + oom_killer_disabled = true; + up_write(&oom_sem); + + wait_event(oom_victims_wait, !atomic_read(&oom_victims)); + + return true; +} + +/** + * oom_killer_enable - enable OOM killer + */ +void oom_killer_enable(void) +{ + down_write(&oom_sem); + oom_killer_disabled = false; + up_write(&oom_sem); } #define K(x) ((x) << (PAGE_SHIFT-10)) @@ -637,7 +690,7 @@ void oom_zonelist_unlock(struct zonelist *zonelist, gfp_t gfp_mask) } /** - * out_of_memory - kill the "best" process when we run out of memory + * __out_of_memory - kill the "best" process when we run out of memory * @zonelist: zonelist pointer * @gfp_mask: memory allocation flags * @order: amount of memory being requested as a power of 2 @@ -649,7 +702,7 @@ void oom_zonelist_unlock(struct zonelist *zonelist, gfp_t gfp_mask) * OR try to be smart about which process to kill. Note that we * don't have to be perfect here, we just have to be good. */ -void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, +static void __out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order, nodemask_t *nodemask, bool force_kill) { const nodemask_t *mpol_mask; @@ -718,6 +771,32 @@ out: schedule_timeout_killable(1); } +/** + * out_of_memory - tries to invoke OOM killer. + * @zonelist: zonelist pointer + * @gfp_mask: memory allocation flags + * @order: amount of memory being requested as a power of 2 + * @nodemask: nodemask passed to page allocator + * @force_kill: true if a task must be killed, even if others are exiting + * + * invokes __out_of_memory if the OOM is not disabled by oom_killer_disable() + * when it returns false. Otherwise returns true. 
+ */ +bool out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, + int order, nodemask_t *nodemask, bool force_kill) +{ + bool ret = false; + + down_read(&oom_sem); + if (!oom_killer_disabled) { + __out_of_memory(zonelist, gfp_mask, order, nodemask, force_kill); + ret = true; + } + up_read(&oom_sem); + + return ret; +} + /* * The pagefault handler calls here because it is out of memory, so kill a * memory-hogging task. If any populated zone has ZONE_OOM_LOCKED set, a @@ -727,12 +806,25 @@ void pagefault_out_of_memory(void) { struct zonelist *zonelist; + down_read(&oom_sem); if (mem_cgroup_oom_synchronize(true)) - return; + goto unlock; zonelist = node_zonelist(first_memory_node, GFP_KERNEL); if (oom_zonelist_trylock(zonelist, GFP_KERNEL)) { - out_of_memory(NULL, 0, 0, NULL, false); + if (!oom_killer_disabled) + __out_of_memory(NULL, 0, 0, NULL, false); + else + /* + * There shouldn't be any user tasks runable while the + * OOM killer is disabled so the current task has to + * be a racing OOM victim for which oom_killer_disable() + * is waiting for. + */ + WARN_ON(test_thread_flag(TIF_MEMDIE)); + oom_zonelist_unlock(zonelist, GFP_KERNEL); } +unlock: + up_read(&oom_sem); } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 641d5a9a8617..134e25525044 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -244,8 +244,6 @@ void set_pageblock_migratetype(struct page *page, int migratetype) PB_migrate, PB_migrate_end); } -bool oom_killer_disabled __read_mostly; - #ifdef CONFIG_DEBUG_VM static int page_outside_zone_boundaries(struct zone *zone, struct page *page) { @@ -2317,9 +2315,6 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, *did_some_progress = 0; - if (oom_killer_disabled) - return NULL; - /* * Acquire the per-zone oom lock for each zone. If that * fails, somebody else is making progress for us. @@ -2330,14 +2325,6 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, return NULL; } - /* - * PM-freezer should be notified that there might be an OOM killer on - * its way to kill and wake somebody up. This is too early and we might - * end up not killing anything but false positives are acceptable. - * See freeze_processes. - */ - note_oom_kill(); - /* * Go through the zonelist yet one more time, keep very high watermark * here, this is only to catch a parallel oom killing, we must fail if @@ -2372,8 +2359,8 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, goto out; } /* Exhausted what can be done so it's blamo time */ - out_of_memory(ac->zonelist, gfp_mask, order, ac->nodemask, false); - *did_some_progress = 1; + if (out_of_memory(ac->zonelist, gfp_mask, order, ac->nodemask, false)) + *did_some_progress = 1; out: oom_zonelist_unlock(ac->zonelist, gfp_mask); return page; -- cgit v1.2.3 From 6653b2f1c3eb08a05ab2dddaf42a12a0dcd78e5a Mon Sep 17 00:00:00 2001 From: "Kirill A. 
Shutemov" Date: Fri, 30 Jan 2015 13:11:50 +1100 Subject: mm, asm-generic: define PUD_SHIFT in If an architecure uses , build fails if we try to use PUD_SHIFT in generic code: In file included from arch/microblaze/include/asm/bug.h:1:0, from include/linux/bug.h:4, from include/linux/thread_info.h:11, from include/asm-generic/preempt.h:4, from arch/microblaze/include/generated/asm/preempt.h:1, from include/linux/preempt.h:18, from include/linux/spinlock.h:50, from include/linux/mmzone.h:7, from include/linux/gfp.h:5, from include/linux/slab.h:14, from mm/mmap.c:12: mm/mmap.c: In function 'exit_mmap': >> mm/mmap.c:2858:46: error: 'PUD_SHIFT' undeclared (first use in this function) round_up(FIRST_USER_ADDRESS, PUD_SIZE) >> PUD_SHIFT); ^ include/asm-generic/bug.h:86:25: note: in definition of macro 'WARN_ON' int __ret_warn_on = !!(condition); \ ^ mm/mmap.c:2858:46: note: each undeclared identifier is reported only once for each function it appears in round_up(FIRST_USER_ADDRESS, PUD_SIZE) >> PUD_SHIFT); ^ include/asm-generic/bug.h:86:25: note: in definition of macro 'WARN_ON' int __ret_warn_on = !!(condition); \ ^ As with , let's define PUD_SHIFT to PGDIR_SHIFT. Signed-off-by: Kirill A. Shutemov Reported-by: Wu Fengguang Signed-off-by: Andrew Morton --- include/asm-generic/4level-fixup.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/asm-generic/4level-fixup.h b/include/asm-generic/4level-fixup.h index 77ff547730af..5bdab6bffd23 100644 --- a/include/asm-generic/4level-fixup.h +++ b/include/asm-generic/4level-fixup.h @@ -4,6 +4,7 @@ #define __ARCH_HAS_4LEVEL_HACK #define __PAGETABLE_PUD_FOLDED +#define PUD_SHIFT PGDIR_SHIFT #define PUD_SIZE PGDIR_SIZE #define PUD_MASK PGDIR_MASK #define PTRS_PER_PUD 1 -- cgit v1.2.3 From af9f62c1f1513a61a3b1ef5c6b6c6b03a78fdf23 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 30 Jan 2015 13:11:50 +1100 Subject: mm: account pmd page tables to the process Dave noticed that unprivileged process can allocate significant amount of memory -- >500 MiB on x86_64 -- and stay unnoticed by oom-killer and memory cgroup. The trick is to allocate a lot of PMD page tables. Linux kernel doesn't account PMD tables to the process, only PTE. The use-cases below use few tricks to allocate a lot of PMD page tables while keeping VmRSS and VmPTE low. oom_score for the process will be 0. #include #include #include #include #include #include #define PUD_SIZE (1UL << 30) #define PMD_SIZE (1UL << 21) #define NR_PUD 130000 int main(void) { char *addr = NULL; unsigned long i; prctl(PR_SET_THP_DISABLE); for (i = 0; i < NR_PUD ; i++) { addr = mmap(addr + PUD_SIZE, PUD_SIZE, PROT_WRITE|PROT_READ, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); if (addr == MAP_FAILED) { perror("mmap"); break; } *addr = 'x'; munmap(addr, PMD_SIZE); mmap(addr, PMD_SIZE, PROT_WRITE|PROT_READ, MAP_ANONYMOUS|MAP_PRIVATE|MAP_FIXED, -1, 0); if (addr == MAP_FAILED) perror("re-mmap"), exit(1); } printf("PID %d consumed %lu KiB in PMD page tables\n", getpid(), i * 4096 >> 10); return pause(); } The patch addresses the issue by account PMD tables to the process the same way we account PTE. The main place where PMD tables is accounted is __pmd_alloc() and free_pmd_range(). But there're few corner cases: - HugeTLB can share PMD page tables. The patch handles by accounting the table to all processes who share it. - x86 PAE pre-allocates few PMD tables on fork. - Architectures with FIRST_USER_ADDRESS > 0. We need to adjust sanity check on exit(2). 
Accounting only happens on configuration where PMD page table's level is present (PMD is not folded). As with nr_ptes we use per-mm counter. The counter value is used to calculate baseline for badness score by oom-killer. Signed-off-by: Kirill A. Shutemov Reported-by: Dave Hansen Cc: Hugh Dickins Reviewed-by: Cyrill Gorcunov Cc: Pavel Emelyanov Cc: David Rientjes Signed-off-by: Andrew Morton --- Documentation/sysctl/vm.txt | 12 ++++++------ arch/x86/mm/pgtable.c | 13 ++++++++----- fs/proc/task_mmu.c | 9 ++++++--- include/linux/mm.h | 24 ++++++++++++++++++++++++ include/linux/mm_types.h | 5 ++++- kernel/fork.c | 3 +++ mm/debug.c | 3 ++- mm/hugetlb.c | 8 ++++++-- mm/memory.c | 2 ++ mm/mmap.c | 4 +++- mm/oom_kill.c | 9 +++++---- 11 files changed, 69 insertions(+), 23 deletions(-) (limited to 'include') diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index 4415aa915681..e9c706e4627a 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -555,12 +555,12 @@ this is causing problems for your system/application. oom_dump_tasks -Enables a system-wide task dump (excluding kernel threads) to be -produced when the kernel performs an OOM-killing and includes such -information as pid, uid, tgid, vm size, rss, nr_ptes, swapents, -oom_score_adj score, and name. This is helpful to determine why the -OOM killer was invoked, to identify the rogue task that caused it, -and to determine why the OOM killer chose the task it did to kill. +Enables a system-wide task dump (excluding kernel threads) to be produced +when the kernel performs an OOM-killing and includes such information as +pid, uid, tgid, vm size, rss, nr_ptes, nr_pmds, swapents, oom_score_adj +score, and name. This is helpful to determine why the OOM killer was +invoked, to identify the rogue task that caused it, and to determine why +the OOM killer chose the task it did to kill. If this is set to zero, this information is suppressed. 
On very large systems with thousands of tasks it may not be feasible to dump diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 6fb6927f9e76..a7d36de0bd30 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -190,7 +190,7 @@ void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd) #endif /* CONFIG_X86_PAE */ -static void free_pmds(pmd_t *pmds[]) +static void free_pmds(struct mm_struct *mm, pmd_t *pmds[]) { int i; @@ -198,10 +198,11 @@ static void free_pmds(pmd_t *pmds[]) if (pmds[i]) { pgtable_pmd_page_dtor(virt_to_page(pmds[i])); free_page((unsigned long)pmds[i]); + mm_dec_nr_pmds(mm); } } -static int preallocate_pmds(pmd_t *pmds[]) +static int preallocate_pmds(struct mm_struct *mm, pmd_t *pmds[]) { int i; bool failed = false; @@ -215,11 +216,13 @@ static int preallocate_pmds(pmd_t *pmds[]) pmd = NULL; failed = true; } + if (pmd) + mm_inc_nr_pmds(mm); pmds[i] = pmd; } if (failed) { - free_pmds(pmds); + free_pmds(mm, pmds); return -ENOMEM; } @@ -283,7 +286,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm) mm->pgd = pgd; - if (preallocate_pmds(pmds) != 0) + if (preallocate_pmds(mm, pmds) != 0) goto out_free_pgd; if (paravirt_pgd_alloc(mm) != 0) @@ -304,7 +307,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm) return pgd; out_free_pmds: - free_pmds(pmds); + free_pmds(mm, pmds); out_free_pgd: free_page((unsigned long)pgd); out: diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index fa80d0c08a46..f610dc6ac182 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -21,7 +21,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) { - unsigned long data, text, lib, swap; + unsigned long data, text, lib, swap, ptes, pmds; unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss; /* @@ -42,6 +42,8 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10; lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text; swap = get_mm_counter(mm, MM_SWAPENTS); + ptes = PTRS_PER_PTE * sizeof(pte_t) * atomic_long_read(&mm->nr_ptes); + pmds = PTRS_PER_PMD * sizeof(pmd_t) * mm_nr_pmds(mm); seq_printf(m, "VmPeak:\t%8lu kB\n" "VmSize:\t%8lu kB\n" @@ -54,6 +56,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) "VmExe:\t%8lu kB\n" "VmLib:\t%8lu kB\n" "VmPTE:\t%8lu kB\n" + "VmPMD:\t%8lu kB\n" "VmSwap:\t%8lu kB\n", hiwater_vm << (PAGE_SHIFT-10), total_vm << (PAGE_SHIFT-10), @@ -63,8 +66,8 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) total_rss << (PAGE_SHIFT-10), data << (PAGE_SHIFT-10), mm->stack_vm << (PAGE_SHIFT-10), text, lib, - (PTRS_PER_PTE * sizeof(pte_t) * - atomic_long_read(&mm->nr_ptes)) >> 10, + ptes >> 10, + pmds >> 10, swap << (PAGE_SHIFT-10)); } diff --git a/include/linux/mm.h b/include/linux/mm.h index a229a17fb79c..3a504e97f277 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1431,8 +1431,32 @@ static inline int __pmd_alloc(struct mm_struct *mm, pud_t *pud, { return 0; } + +static inline unsigned long mm_nr_pmds(struct mm_struct *mm) +{ + return 0; +} + +static inline void mm_inc_nr_pmds(struct mm_struct *mm) {} +static inline void mm_dec_nr_pmds(struct mm_struct *mm) {} + #else int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address); + +static inline unsigned long mm_nr_pmds(struct mm_struct *mm) +{ + return atomic_long_read(&mm->nr_pmds); +} + +static inline void mm_inc_nr_pmds(struct mm_struct *mm) +{ + atomic_long_inc(&mm->nr_pmds); +} + +static inline void mm_dec_nr_pmds(struct mm_struct *mm) +{ + atomic_long_dec(&mm->nr_pmds); +} #endif int 
__pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 20ff2105b564..79cdf6f5c746 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -363,7 +363,10 @@ struct mm_struct { pgd_t * pgd; atomic_t mm_users; /* How many users with user space? */ atomic_t mm_count; /* How many references to "struct mm_struct" (users count as 1) */ - atomic_long_t nr_ptes; /* Page table pages */ + atomic_long_t nr_ptes; /* PTE page table pages */ +#ifndef __PAGETABLE_PMD_FOLDED + atomic_long_t nr_pmds; /* PMD page table pages */ +#endif int map_count; /* number of VMAs */ spinlock_t page_table_lock; /* Protects page tables and some counters */ diff --git a/kernel/fork.c b/kernel/fork.c index b379d9abddc7..c99098c52641 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -555,6 +555,9 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p) INIT_LIST_HEAD(&mm->mmlist); mm->core_state = NULL; atomic_long_set(&mm->nr_ptes, 0); +#ifndef __PAGETABLE_PMD_FOLDED + atomic_long_set(&mm->nr_pmds, 0); +#endif mm->map_count = 0; mm->locked_vm = 0; mm->pinned_vm = 0; diff --git a/mm/debug.c b/mm/debug.c index d69cb5a7ba9a..3eb3ac2fcee7 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -173,7 +173,7 @@ void dump_mm(const struct mm_struct *mm) "get_unmapped_area %p\n" #endif "mmap_base %lu mmap_legacy_base %lu highest_vm_end %lu\n" - "pgd %p mm_users %d mm_count %d nr_ptes %lu map_count %d\n" + "pgd %p mm_users %d mm_count %d nr_ptes %lu nr_pmds %lu map_count %d\n" "hiwater_rss %lx hiwater_vm %lx total_vm %lx locked_vm %lx\n" "pinned_vm %lx shared_vm %lx exec_vm %lx stack_vm %lx\n" "start_code %lx end_code %lx start_data %lx end_data %lx\n" @@ -206,6 +206,7 @@ void dump_mm(const struct mm_struct *mm) mm->pgd, atomic_read(&mm->mm_users), atomic_read(&mm->mm_count), atomic_long_read((atomic_long_t *)&mm->nr_ptes), + mm_nr_pmds((struct mm_struct *)mm), mm->map_count, mm->hiwater_rss, mm->hiwater_vm, mm->total_vm, mm->locked_vm, mm->pinned_vm, mm->shared_vm, mm->exec_vm, mm->stack_vm, diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 4381d667e5de..282a27969bc1 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3582,6 +3582,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) if (saddr) { spte = huge_pte_offset(svma->vm_mm, saddr); if (spte) { + mm_inc_nr_pmds(mm); get_page(virt_to_page(spte)); break; } @@ -3593,11 +3594,13 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) ptl = huge_pte_lockptr(hstate_vma(vma), mm, spte); spin_lock(ptl); - if (pud_none(*pud)) + if (pud_none(*pud)) { pud_populate(mm, pud, (pmd_t *)((unsigned long)spte & PAGE_MASK)); - else + } else { put_page(virt_to_page(spte)); + mm_inc_nr_pmds(mm); + } spin_unlock(ptl); out: pte = (pte_t *)pmd_alloc(mm, pud, addr); @@ -3628,6 +3631,7 @@ int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep) pud_clear(pud); put_page(virt_to_page(ptep)); + mm_dec_nr_pmds(mm); *addr = ALIGN(*addr, HPAGE_SIZE * PTRS_PER_PTE) - HPAGE_SIZE; return 1; } diff --git a/mm/memory.c b/mm/memory.c index d3d4a644da38..01bcd04fc6dd 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -428,6 +428,7 @@ static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud, pmd = pmd_offset(pud, start); pud_clear(pud); pmd_free_tlb(tlb, pmd, start); + mm_dec_nr_pmds(tlb->mm); } static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd, @@ -3323,6 +3324,7 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, 
unsigned long address) smp_wmb(); /* See comment in __pte_alloc */ spin_lock(&mm->page_table_lock); + mm_inc_nr_pmds(mm); #ifndef __ARCH_HAS_4LEVEL_HACK if (pud_present(*pud)) /* Another has populated it */ pmd_free(mm, new); diff --git a/mm/mmap.c b/mm/mmap.c index 14d84666e8ba..6a7d36d133fb 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2853,7 +2853,9 @@ void exit_mmap(struct mm_struct *mm) vm_unacct_memory(nr_accounted); WARN_ON(atomic_long_read(&mm->nr_ptes) > - (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT); + round_up(FIRST_USER_ADDRESS, PMD_SIZE) >> PMD_SHIFT); + WARN_ON(mm_nr_pmds(mm) > + round_up(FIRST_USER_ADDRESS, PUD_SIZE) >> PUD_SHIFT); } /* Insert vm structure into process list sorted by address diff --git a/mm/oom_kill.c b/mm/oom_kill.c index b8df76ee2be3..642f38cb175a 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -169,8 +169,8 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, * The baseline for the badness score is the proportion of RAM that each * task's rss, pagetable and swap space use. */ - points = get_mm_rss(p->mm) + atomic_long_read(&p->mm->nr_ptes) + - get_mm_counter(p->mm, MM_SWAPENTS); + points = get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS) + + atomic_long_read(&p->mm->nr_ptes) + mm_nr_pmds(p->mm); task_unlock(p); /* @@ -351,7 +351,7 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask) struct task_struct *p; struct task_struct *task; - pr_info("[ pid ] uid tgid total_vm rss nr_ptes swapents oom_score_adj name\n"); + pr_info("[ pid ] uid tgid total_vm rss nr_ptes nr_pmds swapents oom_score_adj name\n"); rcu_read_lock(); for_each_process(p) { if (oom_unkillable_task(p, memcg, nodemask)) @@ -367,10 +367,11 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask) continue; } - pr_info("[%5d] %5d %5d %8lu %8lu %7ld %8lu %5hd %s\n", + pr_info("[%5d] %5d %5d %8lu %8lu %7ld %7ld %8lu %5hd %s\n", task->pid, from_kuid(&init_user_ns, task_uid(task)), task->tgid, task->mm->total_vm, get_mm_rss(task->mm), atomic_long_read(&task->mm->nr_ptes), + mm_nr_pmds(task->mm), get_mm_counter(task->mm, MM_SWAPENTS), task->signal->oom_score_adj, task->comm); task_unlock(task); -- cgit v1.2.3 From 332f4bfa110737d3f2e00ba47875cb070df63c5b Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 30 Jan 2015 13:11:51 +1100 Subject: mm: add nr_pmds into mm_struct unconditionally __PAGETABLE_PMD_FOLDED is defined during which is not included into . And we cannot include it here since many of needs to define struct page. I failed to come up with better solution rather than put nr_pmds into mm_struct unconditionally. One possible solution would be to expose number of page table levels architecture has via Kconfig, but that's ugly and requires changes to all architectures. Signed-off-by: Kirill A. Shutemov Tested-by: Guenter Roeck Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 79cdf6f5c746..199a03aab8dc 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -364,9 +364,7 @@ struct mm_struct { atomic_t mm_users; /* How many users with user space? 
*/ atomic_t mm_count; /* How many references to "struct mm_struct" (users count as 1) */ atomic_long_t nr_ptes; /* PTE page table pages */ -#ifndef __PAGETABLE_PMD_FOLDED atomic_long_t nr_pmds; /* PMD page table pages */ -#endif int map_count; /* number of VMAs */ spinlock_t page_table_lock; /* Protects page tables and some counters */ -- cgit v1.2.3 From b446ee4beda3f334986b38350b93b105075c65d2 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Fri, 30 Jan 2015 13:11:52 +1100 Subject: mm/compaction: change tracepoint format from decimal to hexadecimal To check the range that compaction is working, tracepoint print start/end pfn of zone and start pfn of both scanner with decimal format. Since we manage all pages in order of 2 and it is well represented by hexadecimal, this patch change the tracepoint format from decimal to hexadecimal. This would improve readability. For example, it makes us easily notice whether current scanner try to compact previously attempted pageblock or not. Signed-off-by: Joonsoo Kim Acked-by: Vlastimil Babka Cc: Mel Gorman Cc: David Rientjes Signed-off-by: Andrew Morton --- include/trace/events/compaction.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/trace/events/compaction.h b/include/trace/events/compaction.h index c6814b917bdf..1337d9e01e3d 100644 --- a/include/trace/events/compaction.h +++ b/include/trace/events/compaction.h @@ -104,7 +104,7 @@ TRACE_EVENT(mm_compaction_begin, __entry->zone_end = zone_end; ), - TP_printk("zone_start=%lu migrate_start=%lu free_start=%lu zone_end=%lu", + TP_printk("zone_start=0x%lx migrate_start=0x%lx free_start=0x%lx zone_end=0x%lx", __entry->zone_start, __entry->migrate_start, __entry->free_start, -- cgit v1.2.3 From f0a3d20bc698cb81c7625a1255974daef30d383b Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Fri, 30 Jan 2015 13:11:52 +1100 Subject: mm/compaction: enhance tracepoint output for compaction begin/end We now have tracepoint for begin event of compaction and it prints start position of both scanners, but, tracepoint for end event of compaction doesn't print finish position of both scanners. It'd be also useful to know finish position of both scanners so this patch add it. It will help to find odd behavior or problem on compaction internal logic. And mode is added to both begin/end tracepoint output, since according to mode, compaction behavior is quite different. And lastly, status format is changed to string rather than status number for readability. 
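To make the readability argument concrete, a single mm_compaction_end event that used to be reported as

  mm_compaction_end: status=4

would after this patch look roughly like

  mm_compaction_end: zone_start=0x40000 migrate_pfn=0x42c00 free_pfn=0x7d000 zone_end=0x80000, mode=sync status=complete

The pfn values here are invented purely for illustration; only the two format strings come from the old and new tracepoint definitions. The raw "status=4" requires looking up the COMPACT_* constants (4 is COMPACT_COMPLETE), while the string form, together with the scanner positions and the sync/async mode, can be read directly from the trace.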
Signed-off-by: Joonsoo Kim Acked-by: Vlastimil Babka Cc: Mel Gorman Cc: David Rientjes Signed-off-by: Andrew Morton --- include/linux/compaction.h | 2 ++ include/trace/events/compaction.h | 49 ++++++++++++++++++++++++++++----------- mm/compaction.c | 14 +++++++++-- 3 files changed, 49 insertions(+), 16 deletions(-) (limited to 'include') diff --git a/include/linux/compaction.h b/include/linux/compaction.h index f2efda2e6ac6..f876f0e08351 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -12,6 +12,7 @@ #define COMPACT_PARTIAL 3 /* The full zone was compacted */ #define COMPACT_COMPLETE 4 +/* When adding new state, please change compaction_status_string, too */ /* Used to signal whether compaction detected need_sched() or lock contention */ /* No contention detected */ @@ -24,6 +25,7 @@ struct alloc_context; /* in mm/internal.h */ #ifdef CONFIG_COMPACTION +extern char *compaction_status_string[]; extern int sysctl_compact_memory; extern int sysctl_compaction_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos); diff --git a/include/trace/events/compaction.h b/include/trace/events/compaction.h index 1337d9e01e3d..839f6fac921a 100644 --- a/include/trace/events/compaction.h +++ b/include/trace/events/compaction.h @@ -85,46 +85,67 @@ TRACE_EVENT(mm_compaction_migratepages, ); TRACE_EVENT(mm_compaction_begin, - TP_PROTO(unsigned long zone_start, unsigned long migrate_start, - unsigned long free_start, unsigned long zone_end), + TP_PROTO(unsigned long zone_start, unsigned long migrate_pfn, + unsigned long free_pfn, unsigned long zone_end, bool sync), - TP_ARGS(zone_start, migrate_start, free_start, zone_end), + TP_ARGS(zone_start, migrate_pfn, free_pfn, zone_end, sync), TP_STRUCT__entry( __field(unsigned long, zone_start) - __field(unsigned long, migrate_start) - __field(unsigned long, free_start) + __field(unsigned long, migrate_pfn) + __field(unsigned long, free_pfn) __field(unsigned long, zone_end) + __field(bool, sync) ), TP_fast_assign( __entry->zone_start = zone_start; - __entry->migrate_start = migrate_start; - __entry->free_start = free_start; + __entry->migrate_pfn = migrate_pfn; + __entry->free_pfn = free_pfn; __entry->zone_end = zone_end; + __entry->sync = sync; ), - TP_printk("zone_start=0x%lx migrate_start=0x%lx free_start=0x%lx zone_end=0x%lx", + TP_printk("zone_start=0x%lx migrate_pfn=0x%lx free_pfn=0x%lx zone_end=0x%lx, mode=%s", __entry->zone_start, - __entry->migrate_start, - __entry->free_start, - __entry->zone_end) + __entry->migrate_pfn, + __entry->free_pfn, + __entry->zone_end, + __entry->sync ? 
"sync" : "async") ); TRACE_EVENT(mm_compaction_end, - TP_PROTO(int status), + TP_PROTO(unsigned long zone_start, unsigned long migrate_pfn, + unsigned long free_pfn, unsigned long zone_end, bool sync, + int status), - TP_ARGS(status), + TP_ARGS(zone_start, migrate_pfn, free_pfn, zone_end, sync, status), TP_STRUCT__entry( + __field(unsigned long, zone_start) + __field(unsigned long, migrate_pfn) + __field(unsigned long, free_pfn) + __field(unsigned long, zone_end) + __field(bool, sync) __field(int, status) ), TP_fast_assign( + __entry->zone_start = zone_start; + __entry->migrate_pfn = migrate_pfn; + __entry->free_pfn = free_pfn; + __entry->zone_end = zone_end; + __entry->sync = sync; __entry->status = status; ), - TP_printk("status=%d", __entry->status) + TP_printk("zone_start=0x%lx migrate_pfn=0x%lx free_pfn=0x%lx zone_end=0x%lx, mode=%s status=%s", + __entry->zone_start, + __entry->migrate_pfn, + __entry->free_pfn, + __entry->zone_end, + __entry->sync ? "sync" : "async", + compaction_status_string[__entry->status]) ); #endif /* _TRACE_COMPACTION_H */ diff --git a/mm/compaction.c b/mm/compaction.c index 9c7e6909dd29..7bfe49ad7833 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -19,6 +19,14 @@ #include "internal.h" #ifdef CONFIG_COMPACTION +char *compaction_status_string[] = { + "deferred", + "skipped", + "continue", + "partial", + "complete", +}; + static inline void count_compact_event(enum vm_event_item item) { count_vm_event(item); @@ -1197,7 +1205,8 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn; } - trace_mm_compaction_begin(start_pfn, cc->migrate_pfn, cc->free_pfn, end_pfn); + trace_mm_compaction_begin(start_pfn, cc->migrate_pfn, + cc->free_pfn, end_pfn, sync); migrate_prep_local(); @@ -1299,7 +1308,8 @@ out: zone->compact_cached_free_pfn = free_pfn; } - trace_mm_compaction_end(ret); + trace_mm_compaction_end(start_pfn, cc->migrate_pfn, + cc->free_pfn, end_pfn, sync, ret); return ret; } -- cgit v1.2.3 From 651cb7dbe803c370e3a3b7fecb6aa0e66043d23b Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Fri, 30 Jan 2015 13:11:52 +1100 Subject: mm-compaction-enhance-tracepoint-output-for-compaction-begin-end-v4 Changes from v3: Build fix for !CONFIG_COMPACTION, !CONFIG_TRACEPOINTS Signed-off-by: Joonsoo Kim Acked-by: Vlastimil Babka Cc: Mel Gorman Cc: David Rientjes Signed-off-by: Andrew Morton --- include/linux/compaction.h | 1 - mm/compaction.c | 17 +++++++++-------- 2 files changed, 9 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/compaction.h b/include/linux/compaction.h index f876f0e08351..db64cae06530 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -25,7 +25,6 @@ struct alloc_context; /* in mm/internal.h */ #ifdef CONFIG_COMPACTION -extern char *compaction_status_string[]; extern int sysctl_compact_memory; extern int sysctl_compaction_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos); diff --git a/mm/compaction.c b/mm/compaction.c index 7bfe49ad7833..9e3c4504fc09 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -19,14 +19,6 @@ #include "internal.h" #ifdef CONFIG_COMPACTION -char *compaction_status_string[] = { - "deferred", - "skipped", - "continue", - "partial", - "complete", -}; - static inline void count_compact_event(enum vm_event_item item) { count_vm_event(item); @@ -42,6 +34,15 @@ static inline void count_compact_events(enum vm_event_item item, long delta) #endif #if defined 
CONFIG_COMPACTION || defined CONFIG_CMA +#ifdef CONFIG_TRACEPOINTS +static const char const *compaction_status_string[] = { + "deferred", + "skipped", + "continue", + "partial", + "complete", +}; +#endif #define CREATE_TRACE_POINTS #include -- cgit v1.2.3 From 1347a2f68c54a65c24a7a112f4b697878b865371 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Fri, 30 Jan 2015 13:11:53 +1100 Subject: mm/compaction: print current range where compaction work It'd be useful to know current range where compaction work for detailed analysis. With it, we can know pageblock where we actually scan and isolate, and, how much pages we try in that pageblock and can guess why it doesn't become freepage with pageblock order roughly. Signed-off-by: Joonsoo Kim Acked-by: Vlastimil Babka Cc: Mel Gorman Cc: David Rientjes Signed-off-by: Andrew Morton --- include/trace/events/compaction.h | 30 +++++++++++++++++++++++------- mm/compaction.c | 9 ++++++--- 2 files changed, 29 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/trace/events/compaction.h b/include/trace/events/compaction.h index 839f6fac921a..139020b55612 100644 --- a/include/trace/events/compaction.h +++ b/include/trace/events/compaction.h @@ -11,39 +11,55 @@ DECLARE_EVENT_CLASS(mm_compaction_isolate_template, - TP_PROTO(unsigned long nr_scanned, + TP_PROTO( + unsigned long start_pfn, + unsigned long end_pfn, + unsigned long nr_scanned, unsigned long nr_taken), - TP_ARGS(nr_scanned, nr_taken), + TP_ARGS(start_pfn, end_pfn, nr_scanned, nr_taken), TP_STRUCT__entry( + __field(unsigned long, start_pfn) + __field(unsigned long, end_pfn) __field(unsigned long, nr_scanned) __field(unsigned long, nr_taken) ), TP_fast_assign( + __entry->start_pfn = start_pfn; + __entry->end_pfn = end_pfn; __entry->nr_scanned = nr_scanned; __entry->nr_taken = nr_taken; ), - TP_printk("nr_scanned=%lu nr_taken=%lu", + TP_printk("range=(0x%lx ~ 0x%lx) nr_scanned=%lu nr_taken=%lu", + __entry->start_pfn, + __entry->end_pfn, __entry->nr_scanned, __entry->nr_taken) ); DEFINE_EVENT(mm_compaction_isolate_template, mm_compaction_isolate_migratepages, - TP_PROTO(unsigned long nr_scanned, + TP_PROTO( + unsigned long start_pfn, + unsigned long end_pfn, + unsigned long nr_scanned, unsigned long nr_taken), - TP_ARGS(nr_scanned, nr_taken) + TP_ARGS(start_pfn, end_pfn, nr_scanned, nr_taken) ); DEFINE_EVENT(mm_compaction_isolate_template, mm_compaction_isolate_freepages, - TP_PROTO(unsigned long nr_scanned, + + TP_PROTO( + unsigned long start_pfn, + unsigned long end_pfn, + unsigned long nr_scanned, unsigned long nr_taken), - TP_ARGS(nr_scanned, nr_taken) + TP_ARGS(start_pfn, end_pfn, nr_scanned, nr_taken) ); TRACE_EVENT(mm_compaction_migratepages, diff --git a/mm/compaction.c b/mm/compaction.c index 66f7c365e888..b12df9fe10b4 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -430,11 +430,12 @@ isolate_fail: } + trace_mm_compaction_isolate_freepages(*start_pfn, blockpfn, + nr_scanned, total_isolated); + /* Record how far we have got within the block */ *start_pfn = blockpfn; - trace_mm_compaction_isolate_freepages(nr_scanned, total_isolated); - /* * If strict isolation is requested by CMA then check that all the * pages requested were isolated. 
If there were any failures, 0 is @@ -590,6 +591,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, unsigned long flags = 0; bool locked = false; struct page *page = NULL, *valid_page = NULL; + unsigned long start_pfn = low_pfn; /* * Ensure that there are not too many pages isolated from the LRU @@ -750,7 +752,8 @@ isolate_success: if (low_pfn == end_pfn) update_pageblock_skip(cc, valid_page, nr_isolated, true); - trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated); + trace_mm_compaction_isolate_migratepages(start_pfn, low_pfn, + nr_scanned, nr_isolated); count_compact_events(COMPACTMIGRATE_SCANNED, nr_scanned); if (nr_isolated) -- cgit v1.2.3 From 647c2eb4368c08fdf5dcfcbd2076f18c46bf01b4 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Fri, 30 Jan 2015 13:11:53 +1100 Subject: mm/compaction: more trace to understand when/why compaction start/finish It is not well analyzed that when/why compaction start/finish or not. With these new tracepoints, we can know much more about start/finish reason of compaction. I can find following bug with these tracepoint. http://www.spinics.net/lists/linux-mm/msg81582.html Signed-off-by: Joonsoo Kim Cc: Vlastimil Babka Cc: Mel Gorman Cc: David Rientjes Signed-off-by: Andrew Morton --- include/linux/compaction.h | 3 ++ include/trace/events/compaction.h | 74 +++++++++++++++++++++++++++++++++++++++ mm/compaction.c | 38 +++++++++++++++++--- 3 files changed, 111 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/compaction.h b/include/linux/compaction.h index db64cae06530..501d7513aac1 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -12,6 +12,9 @@ #define COMPACT_PARTIAL 3 /* The full zone was compacted */ #define COMPACT_COMPLETE 4 +/* For more detailed tracepoint output */ +#define COMPACT_NO_SUITABLE_PAGE 5 +#define COMPACT_NOT_SUITABLE_ZONE 6 /* When adding new state, please change compaction_status_string, too */ /* Used to signal whether compaction detected need_sched() or lock contention */ diff --git a/include/trace/events/compaction.h b/include/trace/events/compaction.h index 139020b55612..d46535801f63 100644 --- a/include/trace/events/compaction.h +++ b/include/trace/events/compaction.h @@ -164,6 +164,80 @@ TRACE_EVENT(mm_compaction_end, compaction_status_string[__entry->status]) ); +TRACE_EVENT(mm_compaction_try_to_compact_pages, + + TP_PROTO( + int order, + gfp_t gfp_mask, + enum migrate_mode mode), + + TP_ARGS(order, gfp_mask, mode), + + TP_STRUCT__entry( + __field(int, order) + __field(gfp_t, gfp_mask) + __field(enum migrate_mode, mode) + ), + + TP_fast_assign( + __entry->order = order; + __entry->gfp_mask = gfp_mask; + __entry->mode = mode; + ), + + TP_printk("order=%d gfp_mask=0x%x mode=%d", + __entry->order, + __entry->gfp_mask, + (int)__entry->mode) +); + +DECLARE_EVENT_CLASS(mm_compaction_suitable_template, + + TP_PROTO(struct zone *zone, + int order, + int ret), + + TP_ARGS(zone, order, ret), + + TP_STRUCT__entry( + __field(int, nid) + __field(char *, name) + __field(int, order) + __field(int, ret) + ), + + TP_fast_assign( + __entry->nid = zone_to_nid(zone); + __entry->name = (char *)zone->name; + __entry->order = order; + __entry->ret = ret; + ), + + TP_printk("node=%d zone=%-8s order=%d ret=%s", + __entry->nid, + __entry->name, + __entry->order, + compaction_status_string[__entry->ret]) +); + +DEFINE_EVENT(mm_compaction_suitable_template, mm_compaction_finished, + + TP_PROTO(struct zone *zone, + int order, + int ret), + + TP_ARGS(zone, 
order, ret) +); + +DEFINE_EVENT(mm_compaction_suitable_template, mm_compaction_suitable, + + TP_PROTO(struct zone *zone, + int order, + int ret), + + TP_ARGS(zone, order, ret) +); + #endif /* _TRACE_COMPACTION_H */ /* This part must be outside protection */ diff --git a/mm/compaction.c b/mm/compaction.c index b12df9fe10b4..b6ede459c1bb 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -41,6 +41,8 @@ static const char *const compaction_status_string[] = { "continue", "partial", "complete", + "no_suitable_page", + "not_suitable_zone", }; #endif @@ -1049,7 +1051,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, return cc->nr_migratepages ? ISOLATE_SUCCESS : ISOLATE_NONE; } -static int compact_finished(struct zone *zone, struct compact_control *cc, +static int __compact_finished(struct zone *zone, struct compact_control *cc, const int migratetype) { unsigned int order; @@ -1104,7 +1106,20 @@ static int compact_finished(struct zone *zone, struct compact_control *cc, return COMPACT_PARTIAL; } - return COMPACT_CONTINUE; + return COMPACT_NO_SUITABLE_PAGE; +} + +static int compact_finished(struct zone *zone, struct compact_control *cc, + const int migratetype) +{ + int ret; + + ret = __compact_finished(zone, cc, migratetype); + trace_mm_compaction_finished(zone, cc->order, ret); + if (ret == COMPACT_NO_SUITABLE_PAGE) + ret = COMPACT_CONTINUE; + + return ret; } /* @@ -1114,7 +1129,7 @@ static int compact_finished(struct zone *zone, struct compact_control *cc, * COMPACT_PARTIAL - If the allocation would succeed without compaction * COMPACT_CONTINUE - If compaction should run now */ -unsigned long compaction_suitable(struct zone *zone, int order, +static unsigned long __compaction_suitable(struct zone *zone, int order, int alloc_flags, int classzone_idx) { int fragindex; @@ -1158,11 +1173,24 @@ unsigned long compaction_suitable(struct zone *zone, int order, */ fragindex = fragmentation_index(zone, order); if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold) - return COMPACT_SKIPPED; + return COMPACT_NOT_SUITABLE_ZONE; return COMPACT_CONTINUE; } +unsigned long compaction_suitable(struct zone *zone, int order, + int alloc_flags, int classzone_idx) +{ + unsigned long ret; + + ret = __compaction_suitable(zone, order, alloc_flags, classzone_idx); + trace_mm_compaction_suitable(zone, order, ret); + if (ret == COMPACT_NOT_SUITABLE_ZONE) + ret = COMPACT_SKIPPED; + + return ret; +} + static int compact_zone(struct zone *zone, struct compact_control *cc) { int ret; @@ -1376,6 +1404,8 @@ unsigned long try_to_compact_pages(gfp_t gfp_mask, unsigned int order, if (!order || !may_enter_fs || !may_perform_io) return COMPACT_SKIPPED; + trace_mm_compaction_try_to_compact_pages(order, gfp_mask, mode); + /* Compact each zone in the list */ for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx, ac->nodemask) { -- cgit v1.2.3 From a0d5e2cc5841378f503b1e5ee3f7f76a6e66cb71 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Fri, 30 Jan 2015 13:11:53 +1100 Subject: mm/compaction: add tracepoint to observe behaviour of compaction defer Compaction deferring logic is heavy hammer that block the way to the compaction. It doesn't consider overall system state, so it could prevent user from doing compaction falsely. In other words, even if system has enough range of memory to compact, compaction would be skipped due to compaction deferring logic. This patch add new tracepoint to understand work of deferring logic. This will also help to check compaction success and fail. 
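As a rough worked example of how heavy this hammer can get (the numbers simply follow the existing defer logic, which this patch only moves out of line and instruments): each compaction failure for a given order bumps compact_defer_shift, so the limit on skipped compaction attempts before the next real try grows as

  1 << 1 = 2, 1 << 2 = 4, 1 << 3 = 8, ... up to 1 << COMPACT_MAX_DEFER_SHIFT = 64

and only compaction_defer_reset() on a successful allocation zeroes the counters again. The new mm_compaction_deferred / mm_compaction_defer_compaction / mm_compaction_defer_reset events print exactly this state (order, order_failed, consider and the current limit), so such a stretch of falsely skipped compactions can now be observed directly in the trace.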
Signed-off-by: Joonsoo Kim Cc: Vlastimil Babka Cc: Mel Gorman Cc: David Rientjes Signed-off-by: Andrew Morton --- include/linux/compaction.h | 65 +++-------------------------------- include/trace/events/compaction.h | 54 +++++++++++++++++++++++++++++ mm/compaction.c | 72 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 131 insertions(+), 60 deletions(-) (limited to 'include') diff --git a/include/linux/compaction.h b/include/linux/compaction.h index 501d7513aac1..a014559e4a49 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -44,66 +44,11 @@ extern void reset_isolation_suitable(pg_data_t *pgdat); extern unsigned long compaction_suitable(struct zone *zone, int order, int alloc_flags, int classzone_idx); -/* Do not skip compaction more than 64 times */ -#define COMPACT_MAX_DEFER_SHIFT 6 - -/* - * Compaction is deferred when compaction fails to result in a page - * allocation success. 1 << compact_defer_limit compactions are skipped up - * to a limit of 1 << COMPACT_MAX_DEFER_SHIFT - */ -static inline void defer_compaction(struct zone *zone, int order) -{ - zone->compact_considered = 0; - zone->compact_defer_shift++; - - if (order < zone->compact_order_failed) - zone->compact_order_failed = order; - - if (zone->compact_defer_shift > COMPACT_MAX_DEFER_SHIFT) - zone->compact_defer_shift = COMPACT_MAX_DEFER_SHIFT; -} - -/* Returns true if compaction should be skipped this time */ -static inline bool compaction_deferred(struct zone *zone, int order) -{ - unsigned long defer_limit = 1UL << zone->compact_defer_shift; - - if (order < zone->compact_order_failed) - return false; - - /* Avoid possible overflow */ - if (++zone->compact_considered > defer_limit) - zone->compact_considered = defer_limit; - - return zone->compact_considered < defer_limit; -} - -/* - * Update defer tracking counters after successful compaction of given order, - * which means an allocation either succeeded (alloc_success == true) or is - * expected to succeed. 
- */ -static inline void compaction_defer_reset(struct zone *zone, int order, - bool alloc_success) -{ - if (alloc_success) { - zone->compact_considered = 0; - zone->compact_defer_shift = 0; - } - if (order >= zone->compact_order_failed) - zone->compact_order_failed = order + 1; -} - -/* Returns true if restarting compaction after many failures */ -static inline bool compaction_restarting(struct zone *zone, int order) -{ - if (order < zone->compact_order_failed) - return false; - - return zone->compact_defer_shift == COMPACT_MAX_DEFER_SHIFT && - zone->compact_considered >= 1UL << zone->compact_defer_shift; -} +extern void defer_compaction(struct zone *zone, int order); +extern bool compaction_deferred(struct zone *zone, int order); +extern void compaction_defer_reset(struct zone *zone, int order, + bool alloc_success); +extern bool compaction_restarting(struct zone *zone, int order); #else static inline unsigned long try_to_compact_pages(gfp_t gfp_mask, diff --git a/include/trace/events/compaction.h b/include/trace/events/compaction.h index d46535801f63..a770166a6b63 100644 --- a/include/trace/events/compaction.h +++ b/include/trace/events/compaction.h @@ -238,6 +238,60 @@ DEFINE_EVENT(mm_compaction_suitable_template, mm_compaction_suitable, TP_ARGS(zone, order, ret) ); +DECLARE_EVENT_CLASS(mm_compaction_defer_template, + + TP_PROTO(struct zone *zone, int order), + + TP_ARGS(zone, order), + + TP_STRUCT__entry( + __field(int, nid) + __field(char *, name) + __field(int, order) + __field(unsigned int, considered) + __field(unsigned int, defer_shift) + __field(int, order_failed) + ), + + TP_fast_assign( + __entry->nid = zone_to_nid(zone); + __entry->name = (char *)zone->name; + __entry->order = order; + __entry->considered = zone->compact_considered; + __entry->defer_shift = zone->compact_defer_shift; + __entry->order_failed = zone->compact_order_failed; + ), + + TP_printk("node=%d zone=%-8s order=%d order_failed=%d consider=%u limit=%lu", + __entry->nid, + __entry->name, + __entry->order, + __entry->order_failed, + __entry->considered, + 1UL << __entry->defer_shift) +); + +DEFINE_EVENT(mm_compaction_defer_template, mm_compaction_deferred, + + TP_PROTO(struct zone *zone, int order), + + TP_ARGS(zone, order) +); + +DEFINE_EVENT(mm_compaction_defer_template, mm_compaction_defer_compaction, + + TP_PROTO(struct zone *zone, int order), + + TP_ARGS(zone, order) +); + +DEFINE_EVENT(mm_compaction_defer_template, mm_compaction_defer_reset, + + TP_PROTO(struct zone *zone, int order), + + TP_ARGS(zone, order) +); + #endif /* _TRACE_COMPACTION_H */ /* This part must be outside protection */ diff --git a/mm/compaction.c b/mm/compaction.c index b6ede459c1bb..8cb5e997b080 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -124,6 +124,77 @@ static struct page *pageblock_pfn_to_page(unsigned long start_pfn, } #ifdef CONFIG_COMPACTION + +/* Do not skip compaction more than 64 times */ +#define COMPACT_MAX_DEFER_SHIFT 6 + +/* + * Compaction is deferred when compaction fails to result in a page + * allocation success. 
1 << compact_defer_limit compactions are skipped up + * to a limit of 1 << COMPACT_MAX_DEFER_SHIFT + */ +void defer_compaction(struct zone *zone, int order) +{ + zone->compact_considered = 0; + zone->compact_defer_shift++; + + if (order < zone->compact_order_failed) + zone->compact_order_failed = order; + + if (zone->compact_defer_shift > COMPACT_MAX_DEFER_SHIFT) + zone->compact_defer_shift = COMPACT_MAX_DEFER_SHIFT; + + trace_mm_compaction_defer_compaction(zone, order); +} + +/* Returns true if compaction should be skipped this time */ +bool compaction_deferred(struct zone *zone, int order) +{ + unsigned long defer_limit = 1UL << zone->compact_defer_shift; + + if (order < zone->compact_order_failed) + return false; + + /* Avoid possible overflow */ + if (++zone->compact_considered > defer_limit) + zone->compact_considered = defer_limit; + + if (zone->compact_considered >= defer_limit) + return false; + + trace_mm_compaction_deferred(zone, order); + + return true; +} + +/* + * Update defer tracking counters after successful compaction of given order, + * which means an allocation either succeeded (alloc_success == true) or is + * expected to succeed. + */ +void compaction_defer_reset(struct zone *zone, int order, + bool alloc_success) +{ + if (alloc_success) { + zone->compact_considered = 0; + zone->compact_defer_shift = 0; + } + if (order >= zone->compact_order_failed) + zone->compact_order_failed = order + 1; + + trace_mm_compaction_defer_reset(zone, order); +} + +/* Returns true if restarting compaction after many failures */ +bool compaction_restarting(struct zone *zone, int order) +{ + if (order < zone->compact_order_failed) + return false; + + return zone->compact_defer_shift == COMPACT_MAX_DEFER_SHIFT && + zone->compact_considered >= 1UL << zone->compact_defer_shift; +} + /* Returns true if the pageblock should be scanned for pages to isolate. */ static inline bool isolation_suitable(struct compact_control *cc, struct page *page) @@ -1435,6 +1506,7 @@ unsigned long try_to_compact_pages(gfp_t gfp_mask, unsigned int order, * succeeds in this zone. 
*/ compaction_defer_reset(zone, order, false); + /* * It is possible that async compaction aborted due to * need_resched() and the watermarks were ok thanks to -- cgit v1.2.3 From 7678b8f4261aec040b116fb470e9212d6bdf2343 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Fri, 30 Jan 2015 13:11:53 +1100 Subject: mm-compaction-add-tracepoint-to-observe-behaviour-of-compaction-defer-v4 Changes from v3: Build fix for !CONFIG_COMPACTION Signed-off-by: Joonsoo Kim Cc: Vlastimil Babka Cc: Mel Gorman Cc: David Rientjes Signed-off-by: Andrew Morton --- include/trace/events/compaction.h | 2 ++ mm/compaction.c | 1 - 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/trace/events/compaction.h b/include/trace/events/compaction.h index a770166a6b63..9a6a3fe0fb51 100644 --- a/include/trace/events/compaction.h +++ b/include/trace/events/compaction.h @@ -238,6 +238,7 @@ DEFINE_EVENT(mm_compaction_suitable_template, mm_compaction_suitable, TP_ARGS(zone, order, ret) ); +#ifdef CONFIG_COMPACTION DECLARE_EVENT_CLASS(mm_compaction_defer_template, TP_PROTO(struct zone *zone, int order), @@ -291,6 +292,7 @@ DEFINE_EVENT(mm_compaction_defer_template, mm_compaction_defer_reset, TP_ARGS(zone, order) ); +#endif #endif /* _TRACE_COMPACTION_H */ diff --git a/mm/compaction.c b/mm/compaction.c index 8cb5e997b080..b68736c8a1ce 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1506,7 +1506,6 @@ unsigned long try_to_compact_pages(gfp_t gfp_mask, unsigned int order, * succeeds in this zone. */ compaction_defer_reset(zone, order, false); - /* * It is possible that async compaction aborted due to * need_resched() and the watermarks were ok thanks to -- cgit v1.2.3 From a4056b57e9a5c886331a9465219d1627edf3f107 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Fri, 30 Jan 2015 13:11:54 +1100 Subject: mm/thp: allocate transparent hugepages on local node This make sure that we try to allocate hugepages from local node if allowed by mempolicy. If we can't, we fallback to small page allocation based on mempolicy. This is based on the observation that allocating pages on local node is more beneficial than allocating hugepages on remote node. With this patch applied we may find transparent huge page allocation failures if the current node doesn't have enough freee hugepages. Before this patch such failures result in us retrying the allocation on other nodes in the numa node mask. Signed-off-by: Aneesh Kumar K.V Acked-by: Kirill A. 
Shutemov Acked-by: Vlastimil Babka Cc: David Rientjes Cc: Andrea Arcangeli Signed-off-by: Andrew Morton --- include/linux/gfp.h | 4 +++ mm/huge_memory.c | 24 +++++++----------- mm/mempolicy.c | 70 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 83 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index b840e3b2770d..60110e06419d 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -335,11 +335,15 @@ alloc_pages(gfp_t gfp_mask, unsigned int order) extern struct page *alloc_pages_vma(gfp_t gfp_mask, int order, struct vm_area_struct *vma, unsigned long addr, int node); +extern struct page *alloc_hugepage_vma(gfp_t gfp, struct vm_area_struct *vma, + unsigned long addr, int order); #else #define alloc_pages(gfp_mask, order) \ alloc_pages_node(numa_node_id(), gfp_mask, order) #define alloc_pages_vma(gfp_mask, order, vma, addr, node) \ alloc_pages(gfp_mask, order) +#define alloc_hugepage_vma(gfp_mask, vma, addr, order) \ + alloc_pages(gfp_mask, order) #endif #define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0) #define alloc_page_vma(gfp_mask, vma, addr) \ diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 5e4807431427..c02ebe02a5c4 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -761,15 +761,6 @@ static inline gfp_t alloc_hugepage_gfpmask(int defrag, gfp_t extra_gfp) return (GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_WAIT)) | extra_gfp; } -static inline struct page *alloc_hugepage_vma(int defrag, - struct vm_area_struct *vma, - unsigned long haddr, int nd, - gfp_t extra_gfp) -{ - return alloc_pages_vma(alloc_hugepage_gfpmask(defrag, extra_gfp), - HPAGE_PMD_ORDER, vma, haddr, nd); -} - /* Caller must hold page table lock. */ static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd, @@ -790,6 +781,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pmd_t *pmd, unsigned int flags) { + gfp_t gfp; struct page *page; unsigned long haddr = address & HPAGE_PMD_MASK; @@ -824,8 +816,8 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, } return 0; } - page = alloc_hugepage_vma(transparent_hugepage_defrag(vma), - vma, haddr, numa_node_id(), 0); + gfp = alloc_hugepage_gfpmask(transparent_hugepage_defrag(vma), 0); + page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER); if (unlikely(!page)) { count_vm_event(THP_FAULT_FALLBACK); return VM_FAULT_FALLBACK; @@ -1113,10 +1105,12 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, spin_unlock(ptl); alloc: if (transparent_hugepage_enabled(vma) && - !transparent_hugepage_debug_cow()) - new_page = alloc_hugepage_vma(transparent_hugepage_defrag(vma), - vma, haddr, numa_node_id(), 0); - else + !transparent_hugepage_debug_cow()) { + gfp_t gfp; + + gfp = alloc_hugepage_gfpmask(transparent_hugepage_defrag(vma), 0); + new_page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER); + } else new_page = NULL; if (unlikely(!new_page)) { diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 4fcbf12deaa1..39cd075cf144 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2030,6 +2030,76 @@ retry_cpuset: return page; } +/** + * alloc_hugepage_vma: Allocate a hugepage for a VMA + * @gfp: + * %GFP_USER user allocation. + * %GFP_KERNEL kernel allocations, + * %GFP_HIGHMEM highmem/user allocations, + * %GFP_FS allocation should not call back into a file system. + * %GFP_ATOMIC don't sleep. 
+ * + * @vma: Pointer to VMA or NULL if not available. + * @addr: Virtual Address of the allocation. Must be inside the VMA. + * @order: Order of the hugepage for gfp allocation. + * + * This functions allocate a huge page from the kernel page pool and applies + * a NUMA policy associated with the VMA or the current process. + * For policy other than %MPOL_INTERLEAVE, we make sure we allocate hugepage + * only from the current node if the current node is part of the node mask. + * If we can't allocate a hugepage we fail the allocation and don' try to fallback + * to other nodes in the node mask. If the current node is not part of node mask + * or if the NUMA policy is MPOL_INTERLEAVE we use the allocator that can + * fallback to nodes in the policy node mask. + * + * When VMA is not NULL caller must hold down_read on the mmap_sem of the + * mm_struct of the VMA to prevent it from going away. Should be used for + * all allocations for pages that will be mapped into + * user space. Returns NULL when no page can be allocated. + * + * Should be called with the mm_sem of the vma hold. + * + */ +struct page *alloc_hugepage_vma(gfp_t gfp, struct vm_area_struct *vma, + unsigned long addr, int order) +{ + struct page *page; + nodemask_t *nmask; + struct mempolicy *pol; + int node = numa_node_id(); + unsigned int cpuset_mems_cookie; + +retry_cpuset: + pol = get_vma_policy(vma, addr); + cpuset_mems_cookie = read_mems_allowed_begin(); + /* + * For interleave policy, we don't worry about + * current node. Otherwise if current node is + * in nodemask, try to allocate hugepage from + * the current node. Don't fall back to other nodes + * for THP. + */ + if (unlikely(pol->mode == MPOL_INTERLEAVE)) + goto alloc_with_fallback; + nmask = policy_nodemask(gfp, pol); + if (!nmask || node_isset(node, *nmask)) { + mpol_cond_put(pol); + page = alloc_pages_exact_node(node, gfp, order); + if (unlikely(!page && + read_mems_allowed_retry(cpuset_mems_cookie))) + goto retry_cpuset; + return page; + } +alloc_with_fallback: + mpol_cond_put(pol); + /* + * if current node is not part of node mask, try + * the allocation from any node, and we can do retry + * in that case. + */ + return alloc_pages_vma(gfp, order, vma, addr, node); +} + /** * alloc_pages_current - Allocate pages. * -- cgit v1.2.3 From 3a50999526df64b32c2cbd98c8c5363fdc796cd4 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Fri, 30 Jan 2015 13:11:54 +1100 Subject: mm: gup: add get_user_pages_locked and get_user_pages_unlocked FAULT_FOLL_ALLOW_RETRY allows the page fault to drop the mmap_sem for reading to reduce the mmap_sem contention (for writing), like while waiting for I/O completion. The problem is that right now practically no get_user_pages call uses FAULT_FOLL_ALLOW_RETRY, so we're not leveraging that nifty feature. Andres fixed it for the KVM page fault. However get_user_pages_fast remains uncovered, and 99% of other get_user_pages aren't using it either (the only exception being FOLL_NOWAIT in KVM which is really nonblocking and in fact it doesn't even release the mmap_sem). So this patchsets extends the optimization Andres did in the KVM page fault to the whole kernel. It makes most important places (including gup_fast) to use FAULT_FOLL_ALLOW_RETRY to reduce the mmap_sem hold times during I/O. The only few places that remains uncovered are drivers like v4l and other exceptions that tends to work on their own memory and they're not working on random user memory (for example like O_DIRECT that uses gup_fast and is fully covered by this patch). 
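(Schematically, the retry protocol that FAULT_FLAG_ALLOW_RETRY enables looks like the fragment below. This is only an illustrative sketch of that era's fault-path calling convention, not code taken from this patch; the real loop is the __get_user_pages_locked() helper added further down.)

	down_read(&mm->mmap_sem);
	ret = handle_mm_fault(mm, vma, address, FAULT_FLAG_ALLOW_RETRY);
	if (ret & VM_FAULT_RETRY) {
		/*
		 * The fault handler dropped mmap_sem before blocking
		 * (e.g. while waiting for I/O), so mmap_sem writers were
		 * not stalled in the meantime.  Retake the lock and retry
		 * once, this time without allowing another retry.
		 */
		down_read(&mm->mmap_sem);
		ret = handle_mm_fault(mm, vma, address, FAULT_FLAG_TRIED);
	}
	up_read(&mm->mmap_sem);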
A follow up patch should probably also add a printk_once warning to get_user_pages that should go obsolete and be phased out eventually. The "vmas" parameter of get_user_pages makes it fundamentally incompatible with FAULT_FOLL_ALLOW_RETRY (vmas array becomes meaningless the moment the mmap_sem is released). While this is just an optimization, this becomes an absolute requirement for the userfaultfd feature http://lwn.net/Articles/615086/ . The userfaultfd allows to block the page fault, and in order to do so I need to drop the mmap_sem first. So this patch also ensures that all memory where userfaultfd could be registered by KVM, the very first fault (no matter if it is a regular page fault, or a get_user_pages) always has FAULT_FOLL_ALLOW_RETRY set. Then the userfaultfd blocks and it is waken only when the pagetable is already mapped. The second fault attempt after the wakeup doesn't need FAULT_FOLL_ALLOW_RETRY, so it's ok to retry without it. This patch (of 5): We can leverage the VM_FAULT_RETRY functionality in the page fault paths better by using either get_user_pages_locked or get_user_pages_unlocked. The former allows conversion of get_user_pages invocations that will have to pass a "&locked" parameter to know if the mmap_sem was dropped during the call. Example from: down_read(&mm->mmap_sem); do_something() get_user_pages(tsk, mm, ..., pages, NULL); up_read(&mm->mmap_sem); to: int locked = 1; down_read(&mm->mmap_sem); do_something() get_user_pages_locked(tsk, mm, ..., pages, &locked); if (locked) up_read(&mm->mmap_sem); The latter is suitable only as a drop in replacement of the form: down_read(&mm->mmap_sem); get_user_pages(tsk, mm, ..., pages, NULL); up_read(&mm->mmap_sem); into: get_user_pages_unlocked(tsk, mm, ..., pages); Where tsk, mm, the intermediate "..." paramters and "pages" can be any value as before. Just the last parameter of get_user_pages (vmas) must be NULL for get_user_pages_locked|unlocked to be usable (the latter original form wouldn't have been safe anyway if vmas wasn't null, for the former we just make it explicit by dropping the parameter). If vmas is not NULL these two methods cannot be used. Signed-off-by: Andrea Arcangeli Reviewed-by: Andres Lagar-Cavilla Reviewed-by: Peter Feiner Reviewed-by: Kirill A. 
Shutemov Signed-off-by: Andrew Morton --- include/linux/mm.h | 7 +++ mm/gup.c | 177 +++++++++++++++++++++++++++++++++++++++++++++++++---- mm/nommu.c | 23 +++++++ 3 files changed, 196 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 3a504e97f277..467081587edf 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1254,6 +1254,13 @@ long get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, unsigned long nr_pages, int write, int force, struct page **pages, struct vm_area_struct **vmas); +long get_user_pages_locked(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, unsigned long nr_pages, + int write, int force, struct page **pages, + int *locked); +long get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, unsigned long nr_pages, + int write, int force, struct page **pages); int get_user_pages_fast(unsigned long start, int nr_pages, int write, struct page **pages); struct kvec; diff --git a/mm/gup.c b/mm/gup.c index 779b2f3a69fe..47e070ab9e4a 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -575,6 +575,165 @@ int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, return 0; } +static __always_inline long __get_user_pages_locked(struct task_struct *tsk, + struct mm_struct *mm, + unsigned long start, + unsigned long nr_pages, + int write, int force, + struct page **pages, + struct vm_area_struct **vmas, + int *locked, bool notify_drop) +{ + int flags = FOLL_TOUCH; + long ret, pages_done; + bool lock_dropped; + + if (locked) { + /* if VM_FAULT_RETRY can be returned, vmas become invalid */ + BUG_ON(vmas); + /* check caller initialized locked */ + BUG_ON(*locked != 1); + } + + if (pages) + flags |= FOLL_GET; + if (write) + flags |= FOLL_WRITE; + if (force) + flags |= FOLL_FORCE; + + pages_done = 0; + lock_dropped = false; + for (;;) { + ret = __get_user_pages(tsk, mm, start, nr_pages, flags, pages, + vmas, locked); + if (!locked) + /* VM_FAULT_RETRY couldn't trigger, bypass */ + return ret; + + /* VM_FAULT_RETRY cannot return errors */ + if (!*locked) { + BUG_ON(ret < 0); + BUG_ON(ret >= nr_pages); + } + + if (!pages) + /* If it's a prefault don't insist harder */ + return ret; + + if (ret > 0) { + nr_pages -= ret; + pages_done += ret; + if (!nr_pages) + break; + } + if (*locked) { + /* VM_FAULT_RETRY didn't trigger */ + if (!pages_done) + pages_done = ret; + break; + } + /* VM_FAULT_RETRY triggered, so seek to the faulting offset */ + pages += ret; + start += ret << PAGE_SHIFT; + + /* + * Repeat on the address that fired VM_FAULT_RETRY + * without FAULT_FLAG_ALLOW_RETRY but with + * FAULT_FLAG_TRIED. + */ + *locked = 1; + lock_dropped = true; + down_read(&mm->mmap_sem); + ret = __get_user_pages(tsk, mm, start, 1, flags | FOLL_TRIED, + pages, NULL, NULL); + if (ret != 1) { + BUG_ON(ret > 1); + if (!pages_done) + pages_done = ret; + break; + } + nr_pages--; + pages_done++; + if (!nr_pages) + break; + pages++; + start += PAGE_SIZE; + } + if (notify_drop && lock_dropped && *locked) { + /* + * We must let the caller know we temporarily dropped the lock + * and so the critical section protected by it was lost. + */ + up_read(&mm->mmap_sem); + *locked = 0; + } + return pages_done; +} + +/* + * We can leverage the VM_FAULT_RETRY functionality in the page fault + * paths better by using either get_user_pages_locked() or + * get_user_pages_unlocked(). 
+ * + * get_user_pages_locked() is suitable to replace the form: + * + * down_read(&mm->mmap_sem); + * do_something() + * get_user_pages(tsk, mm, ..., pages, NULL); + * up_read(&mm->mmap_sem); + * + * to: + * + * int locked = 1; + * down_read(&mm->mmap_sem); + * do_something() + * get_user_pages_locked(tsk, mm, ..., pages, &locked); + * if (locked) + * up_read(&mm->mmap_sem); + */ +long get_user_pages_locked(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, unsigned long nr_pages, + int write, int force, struct page **pages, + int *locked) +{ + return __get_user_pages_locked(tsk, mm, start, nr_pages, write, force, + pages, NULL, locked, true); +} +EXPORT_SYMBOL(get_user_pages_locked); + +/* + * get_user_pages_unlocked() is suitable to replace the form: + * + * down_read(&mm->mmap_sem); + * get_user_pages(tsk, mm, ..., pages, NULL); + * up_read(&mm->mmap_sem); + * + * with: + * + * get_user_pages_unlocked(tsk, mm, ..., pages); + * + * It is functionally equivalent to get_user_pages_fast so + * get_user_pages_fast should be used instead, if the two parameters + * "tsk" and "mm" are respectively equal to current and current->mm, + * or if "force" shall be set to 1 (get_user_pages_fast misses the + * "force" parameter). + */ +long get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, unsigned long nr_pages, + int write, int force, struct page **pages) +{ + long ret; + int locked = 1; + down_read(&mm->mmap_sem); + ret = __get_user_pages_locked(tsk, mm, start, nr_pages, write, force, + pages, NULL, &locked, false); + if (locked) + up_read(&mm->mmap_sem); + return ret; +} +EXPORT_SYMBOL(get_user_pages_unlocked); + /* * get_user_pages() - pin user pages in memory * @tsk: the task_struct to use for page fault accounting, or @@ -624,22 +783,18 @@ int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, * use the correct cache flushing APIs. * * See also get_user_pages_fast, for performance critical applications. + * + * get_user_pages should be phased out in favor of + * get_user_pages_locked|unlocked or get_user_pages_fast. Nothing + * should use get_user_pages because it cannot pass + * FAULT_FLAG_ALLOW_RETRY to handle_mm_fault. 
*/ long get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, unsigned long nr_pages, int write, int force, struct page **pages, struct vm_area_struct **vmas) { - int flags = FOLL_TOUCH; - - if (pages) - flags |= FOLL_GET; - if (write) - flags |= FOLL_WRITE; - if (force) - flags |= FOLL_FORCE; - - return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas, - NULL); + return __get_user_pages_locked(tsk, mm, start, nr_pages, write, force, + pages, vmas, NULL, false); } EXPORT_SYMBOL(get_user_pages); diff --git a/mm/nommu.c b/mm/nommu.c index 541bed64e348..bfb690b0f986 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -214,6 +214,29 @@ long get_user_pages(struct task_struct *tsk, struct mm_struct *mm, } EXPORT_SYMBOL(get_user_pages); +long get_user_pages_locked(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, unsigned long nr_pages, + int write, int force, struct page **pages, + int *locked) +{ + return get_user_pages(tsk, mm, start, nr_pages, write, force, + pages, NULL); +} +EXPORT_SYMBOL(get_user_pages_locked); + +long get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, unsigned long nr_pages, + int write, int force, struct page **pages) +{ + long ret; + down_read(&mm->mmap_sem); + ret = get_user_pages(tsk, mm, start, nr_pages, write, force, + pages, NULL); + up_read(&mm->mmap_sem); + return ret; +} +EXPORT_SYMBOL(get_user_pages_unlocked); + /** * follow_pfn - look up PFN at a user virtual address * @vma: memory mapping -- cgit v1.2.3 From 5a229d73b42e30bdc6da6368dc8bcdfd6c8bef6d Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Fri, 30 Jan 2015 13:11:55 +1100 Subject: mm: gup: add __get_user_pages_unlocked to customize gup_flags Some callers (like KVM) may want to set the gup_flags like FOLL_HWPOSION to get a proper -EHWPOSION retval instead of -EFAULT to take a more appropriate action if get_user_pages runs into a memory failure. Signed-off-by: Andrea Arcangeli Reviewed-by: Kirill A. 
Shutemov Cc: Andres Lagar-Cavilla Cc: Peter Feiner Signed-off-by: Andrew Morton --- include/linux/mm.h | 4 ++++ mm/gup.c | 44 ++++++++++++++++++++++++++++++++------------ mm/nommu.c | 16 +++++++++++++--- 3 files changed, 49 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 467081587edf..b3599e3bc2dc 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1258,6 +1258,10 @@ long get_user_pages_locked(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, unsigned long nr_pages, int write, int force, struct page **pages, int *locked); +long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, unsigned long nr_pages, + int write, int force, struct page **pages, + unsigned int gup_flags); long get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, unsigned long nr_pages, int write, int force, struct page **pages); diff --git a/mm/gup.c b/mm/gup.c index 47e070ab9e4a..8b60daf9af93 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -582,9 +582,9 @@ static __always_inline long __get_user_pages_locked(struct task_struct *tsk, int write, int force, struct page **pages, struct vm_area_struct **vmas, - int *locked, bool notify_drop) + int *locked, bool notify_drop, + unsigned int flags) { - int flags = FOLL_TOUCH; long ret, pages_done; bool lock_dropped; @@ -698,10 +698,36 @@ long get_user_pages_locked(struct task_struct *tsk, struct mm_struct *mm, int *locked) { return __get_user_pages_locked(tsk, mm, start, nr_pages, write, force, - pages, NULL, locked, true); + pages, NULL, locked, true, FOLL_TOUCH); } EXPORT_SYMBOL(get_user_pages_locked); +/* + * Same as get_user_pages_unlocked(...., FOLL_TOUCH) but it allows to + * pass additional gup_flags as last parameter (like FOLL_HWPOISON). + * + * NOTE: here FOLL_TOUCH is not set implicitly and must be set by the + * caller if required (just like with __get_user_pages). "FOLL_GET", + * "FOLL_WRITE" and "FOLL_FORCE" are set implicitly as needed + * according to the parameters "pages", "write", "force" + * respectively. 
+ */ +__always_inline long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, unsigned long nr_pages, + int write, int force, struct page **pages, + unsigned int gup_flags) +{ + long ret; + int locked = 1; + down_read(&mm->mmap_sem); + ret = __get_user_pages_locked(tsk, mm, start, nr_pages, write, force, + pages, NULL, &locked, false, gup_flags); + if (locked) + up_read(&mm->mmap_sem); + return ret; +} +EXPORT_SYMBOL(__get_user_pages_unlocked); + /* * get_user_pages_unlocked() is suitable to replace the form: * @@ -723,14 +749,8 @@ long get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, unsigned long nr_pages, int write, int force, struct page **pages) { - long ret; - int locked = 1; - down_read(&mm->mmap_sem); - ret = __get_user_pages_locked(tsk, mm, start, nr_pages, write, force, - pages, NULL, &locked, false); - if (locked) - up_read(&mm->mmap_sem); - return ret; + return __get_user_pages_unlocked(tsk, mm, start, nr_pages, write, + force, pages, FOLL_TOUCH); } EXPORT_SYMBOL(get_user_pages_unlocked); @@ -794,7 +814,7 @@ long get_user_pages(struct task_struct *tsk, struct mm_struct *mm, int force, struct page **pages, struct vm_area_struct **vmas) { return __get_user_pages_locked(tsk, mm, start, nr_pages, write, force, - pages, vmas, NULL, false); + pages, vmas, NULL, false, FOLL_TOUCH); } EXPORT_SYMBOL(get_user_pages); diff --git a/mm/nommu.c b/mm/nommu.c index bfb690b0f986..4d1b8a199867 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -224,9 +224,10 @@ long get_user_pages_locked(struct task_struct *tsk, struct mm_struct *mm, } EXPORT_SYMBOL(get_user_pages_locked); -long get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, - unsigned long start, unsigned long nr_pages, - int write, int force, struct page **pages) +long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, unsigned long nr_pages, + int write, int force, struct page **pages, + unsigned int gup_flags) { long ret; down_read(&mm->mmap_sem); @@ -235,6 +236,15 @@ long get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, up_read(&mm->mmap_sem); return ret; } +EXPORT_SYMBOL(__get_user_pages_unlocked); + +long get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, unsigned long nr_pages, + int write, int force, struct page **pages) +{ + return __get_user_pages_unlocked(tsk, mm, start, nr_pages, write, + force, pages, 0); +} EXPORT_SYMBOL(get_user_pages_unlocked); /** -- cgit v1.2.3 From 776f9ae9e76fbf96387ed4f3e132565d6d5721d1 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Fri, 30 Jan 2015 13:11:55 +1100 Subject: mm: gup: kvm use get_user_pages_unlocked Use the more generic get_user_pages_unlocked which has the additional benefit of passing FAULT_FLAG_ALLOW_RETRY at the very first page fault (which allows the first page fault in an unmapped area to be always able to block indefinitely by being allowed to release the mmap_sem). Signed-off-by: Andrea Arcangeli Reviewed-by: Andres Lagar-Cavilla Reviewed-by: Kirill A. 
Shutemov Cc: Peter Feiner Signed-off-by: Andrew Morton --- include/linux/kvm_host.h | 11 ----------- virt/kvm/async_pf.c | 2 +- virt/kvm/kvm_main.c | 50 ++++-------------------------------------------- 3 files changed, 5 insertions(+), 58 deletions(-) (limited to 'include') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 26f106022c88..d189ee098aa2 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -200,17 +200,6 @@ int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, unsigned long hva, int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu); #endif -/* - * Carry out a gup that requires IO. Allow the mm to relinquish the mmap - * semaphore if the filemap/swap has to wait on a page lock. pagep == NULL - * controls whether we retry the gup one more time to completion in that case. - * Typically this is called after a FAULT_FLAG_RETRY_NOWAIT in the main tdp - * handler. - */ -int kvm_get_user_page_io(struct task_struct *tsk, struct mm_struct *mm, - unsigned long addr, bool write_fault, - struct page **pagep); - enum { OUTSIDE_GUEST_MODE, IN_GUEST_MODE, diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c index 5ff7f7f2689a..44660aee335f 100644 --- a/virt/kvm/async_pf.c +++ b/virt/kvm/async_pf.c @@ -80,7 +80,7 @@ static void async_pf_execute(struct work_struct *work) might_sleep(); - kvm_get_user_page_io(NULL, mm, addr, 1, NULL); + get_user_pages_unlocked(NULL, mm, addr, 1, 1, 0, NULL); kvm_async_page_present_sync(vcpu, apf); spin_lock(&vcpu->async_pf.lock); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 1cc6e2e19982..458b9b14b15c 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1128,43 +1128,6 @@ static int get_user_page_nowait(struct task_struct *tsk, struct mm_struct *mm, return __get_user_pages(tsk, mm, start, 1, flags, page, NULL, NULL); } -int kvm_get_user_page_io(struct task_struct *tsk, struct mm_struct *mm, - unsigned long addr, bool write_fault, - struct page **pagep) -{ - int npages; - int locked = 1; - int flags = FOLL_TOUCH | FOLL_HWPOISON | - (pagep ? FOLL_GET : 0) | - (write_fault ? FOLL_WRITE : 0); - - /* - * If retrying the fault, we get here *not* having allowed the filemap - * to wait on the page lock. We should now allow waiting on the IO with - * the mmap semaphore released. - */ - down_read(&mm->mmap_sem); - npages = __get_user_pages(tsk, mm, addr, 1, flags, pagep, NULL, - &locked); - if (!locked) { - VM_BUG_ON(npages); - - if (!pagep) - return 0; - - /* - * The previous call has now waited on the IO. Now we can - * retry and complete. Pass TRIED to ensure we do not re - * schedule async IO (see e.g. filemap_fault). - */ - down_read(&mm->mmap_sem); - npages = __get_user_pages(tsk, mm, addr, 1, flags | FOLL_TRIED, - pagep, NULL, NULL); - } - up_read(&mm->mmap_sem); - return npages; -} - static inline int check_user_page_hwpoison(unsigned long addr) { int rc, flags = FOLL_TOUCH | FOLL_HWPOISON | FOLL_WRITE; @@ -1227,15 +1190,10 @@ static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault, npages = get_user_page_nowait(current, current->mm, addr, write_fault, page); up_read(¤t->mm->mmap_sem); - } else { - /* - * By now we have tried gup_fast, and possibly async_pf, and we - * are certainly not atomic. Time to retry the gup, allowing - * mmap semaphore to be relinquished in the case of IO. 
- */ - npages = kvm_get_user_page_io(current, current->mm, addr, - write_fault, page); - } + } else + npages = __get_user_pages_unlocked(current, current->mm, addr, 1, + write_fault, 0, page, + FOLL_TOUCH|FOLL_HWPOISON); if (npages != 1) return npages; -- cgit v1.2.3 From 31631a0c6463b46751390dafb6e54b7985e4f373 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Fri, 30 Jan 2015 13:11:56 +1100 Subject: mm/pagewalk: remove pgd_entry() and pud_entry() Currently no user of page table walker sets ->pgd_entry() or ->pud_entry(), so checking their existence in each loop is just wasting CPU cycle. So let's remove it to reduce overhead. Signed-off-by: Naoya Horiguchi Acked-by: Kirill A. Shutemov Cc: Andrea Arcangeli Cc: Cyrill Gorcunov Cc: Dave Hansen Cc: Kirill A. Shutemov Cc: Pavel Emelyanov Cc: Benjamin Herrenschmidt Signed-off-by: Andrew Morton --- include/linux/mm.h | 6 ------ mm/pagewalk.c | 9 ++------- 2 files changed, 2 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index b3599e3bc2dc..d62ff38f4891 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1157,8 +1157,6 @@ void unmap_vmas(struct mmu_gather *tlb, struct vm_area_struct *start_vma, /** * mm_walk - callbacks for walk_page_range - * @pgd_entry: if set, called for each non-empty PGD (top-level) entry - * @pud_entry: if set, called for each non-empty PUD (2nd-level) entry * @pmd_entry: if set, called for each non-empty PMD (3rd-level) entry * this handler is required to be able to handle * pmd_trans_huge() pmds. They may simply choose to @@ -1172,10 +1170,6 @@ void unmap_vmas(struct mmu_gather *tlb, struct vm_area_struct *start_vma, * (see walk_page_range for more details) */ struct mm_walk { - int (*pgd_entry)(pgd_t *pgd, unsigned long addr, - unsigned long next, struct mm_walk *walk); - int (*pud_entry)(pud_t *pud, unsigned long addr, - unsigned long next, struct mm_walk *walk); int (*pmd_entry)(pmd_t *pmd, unsigned long addr, unsigned long next, struct mm_walk *walk); int (*pte_entry)(pte_t *pte, unsigned long addr, diff --git a/mm/pagewalk.c b/mm/pagewalk.c index b264bda46e1b..b793ef149da2 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -86,9 +86,7 @@ static int walk_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end, break; continue; } - if (walk->pud_entry) - err = walk->pud_entry(pud, addr, next, walk); - if (!err && (walk->pmd_entry || walk->pte_entry)) + if (walk->pmd_entry || walk->pte_entry) err = walk_pmd_range(pud, addr, next, walk); if (err) break; @@ -237,10 +235,7 @@ int walk_page_range(unsigned long addr, unsigned long end, pgd++; continue; } - if (walk->pgd_entry) - err = walk->pgd_entry(pgd, addr, next, walk); - if (!err && - (walk->pud_entry || walk->pmd_entry || walk->pte_entry)) + if (walk->pmd_entry || walk->pte_entry) err = walk_pud_range(pgd, addr, next, walk); if (err) break; -- cgit v1.2.3 From 86a0a1bdd01251d3a54950af9f2906f85a1555b2 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Fri, 30 Jan 2015 13:11:56 +1100 Subject: pagewalk: improve vma handling Current implementation of page table walker has a fundamental problem in vma handling, which started when we tried to handle vma(VM_HUGETLB). Because it's done in pgd loop, considering vma boundary makes code complicated and bug-prone. From the users viewpoint, some user checks some vma-related condition to determine whether the user really does page walk over the vma. 
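(For instance, a hypothetical walker that only cares about anonymous memory currently has to reject unsuitable vmas from inside its pmd/pte callbacks; with the per-vma hook introduced below it can express the filter once. A rough sketch, not part of the patch, using the return convention implemented in walk_page_test() below (0 walks the vma, 1 skips it, negative aborts):)

	/* hypothetical: walk only anonymous vmas */
	static int anon_test_walk(unsigned long start, unsigned long end,
				  struct mm_walk *walk)
	{
		struct vm_area_struct *vma = walk->vma;

		if (vma->vm_file || (vma->vm_flags & VM_PFNMAP))
			return 1;	/* skip this vma */
		return 0;		/* walk this vma */
	}

	/* caller holds mm->mmap_sem for read */
	struct mm_walk walk = {
		.pmd_entry = my_pmd_entry,	/* hypothetical entry callback */
		.test_walk = anon_test_walk,
		.mm	   = mm,
	};

	walk_page_range(start, end, &walk);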
In order to solve these, this patch moves vma check outside pgd loop and introduce a new callback ->test_walk(). Signed-off-by: Naoya Horiguchi Acked-by: Kirill A. Shutemov Cc: "Kirill A. Shutemov" Cc: Andrea Arcangeli Cc: Cyrill Gorcunov Cc: Dave Hansen Cc: Pavel Emelyanov Cc: Benjamin Herrenschmidt Signed-off-by: Andrew Morton --- include/linux/mm.h | 15 +++- mm/pagewalk.c | 206 ++++++++++++++++++++++++++++++----------------------- 2 files changed, 129 insertions(+), 92 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index d62ff38f4891..68804915da21 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1164,10 +1164,16 @@ void unmap_vmas(struct mmu_gather *tlb, struct vm_area_struct *start_vma, * @pte_entry: if set, called for each non-empty PTE (4th-level) entry * @pte_hole: if set, called for each hole at all levels * @hugetlb_entry: if set, called for each hugetlb entry - * *Caution*: The caller must hold mmap_sem() if @hugetlb_entry - * is used. + * @test_walk: caller specific callback function to determine whether + * we walk over the current vma or not. A positive returned + * value means "do page table walk over the current vma," + * and a negative one means "abort current page table walk + * right now." 0 means "skip the current vma." + * @mm: mm_struct representing the target process of page table walk + * @vma: vma currently walked (NULL if walking outside vmas) + * @private: private data for callbacks' usage * - * (see walk_page_range for more details) + * (see the comment on walk_page_range() for more details) */ struct mm_walk { int (*pmd_entry)(pmd_t *pmd, unsigned long addr, @@ -1179,7 +1185,10 @@ struct mm_walk { int (*hugetlb_entry)(pte_t *pte, unsigned long hmask, unsigned long addr, unsigned long next, struct mm_walk *walk); + int (*test_walk)(unsigned long addr, unsigned long next, + struct mm_walk *walk); struct mm_struct *mm; + struct vm_area_struct *vma; void *private; }; diff --git a/mm/pagewalk.c b/mm/pagewalk.c index b793ef149da2..d9cc3caae802 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -59,7 +59,7 @@ again: continue; split_huge_page_pmd_mm(walk->mm, addr, pmd); - if (pmd_none_or_trans_huge_or_clear_bad(pmd)) + if (pmd_trans_unstable(pmd)) goto again; err = walk_pte_range(pmd, addr, next, walk); if (err) @@ -95,6 +95,32 @@ static int walk_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end, return err; } +static int walk_pgd_range(unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + pgd_t *pgd; + unsigned long next; + int err = 0; + + pgd = pgd_offset(walk->mm, addr); + do { + next = pgd_addr_end(addr, end); + if (pgd_none_or_clear_bad(pgd)) { + if (walk->pte_hole) + err = walk->pte_hole(addr, next, walk); + if (err) + break; + continue; + } + if (walk->pmd_entry || walk->pte_entry) + err = walk_pud_range(pgd, addr, next, walk); + if (err) + break; + } while (pgd++, addr = next, addr != end); + + return err; +} + #ifdef CONFIG_HUGETLB_PAGE static unsigned long hugetlb_entry_end(struct hstate *h, unsigned long addr, unsigned long end) @@ -103,10 +129,10 @@ static unsigned long hugetlb_entry_end(struct hstate *h, unsigned long addr, return boundary < end ? 
boundary : end; } -static int walk_hugetlb_range(struct vm_area_struct *vma, - unsigned long addr, unsigned long end, +static int walk_hugetlb_range(unsigned long addr, unsigned long end, struct mm_walk *walk) { + struct vm_area_struct *vma = walk->vma; struct hstate *h = hstate_vma(vma); unsigned long next; unsigned long hmask = huge_page_mask(h); @@ -119,15 +145,14 @@ static int walk_hugetlb_range(struct vm_area_struct *vma, if (pte && walk->hugetlb_entry) err = walk->hugetlb_entry(pte, hmask, addr, next, walk); if (err) - return err; + break; } while (addr = next, addr != end); - return 0; + return err; } #else /* CONFIG_HUGETLB_PAGE */ -static int walk_hugetlb_range(struct vm_area_struct *vma, - unsigned long addr, unsigned long end, +static int walk_hugetlb_range(unsigned long addr, unsigned long end, struct mm_walk *walk) { return 0; @@ -135,112 +160,115 @@ static int walk_hugetlb_range(struct vm_area_struct *vma, #endif /* CONFIG_HUGETLB_PAGE */ +/* + * Decide whether we really walk over the current vma on [@start, @end) + * or skip it via the returned value. Return 0 if we do walk over the + * current vma, and return 1 if we skip the vma. Negative values means + * error, where we abort the current walk. + * + * Default check (only VM_PFNMAP check for now) is used when the caller + * doesn't define test_walk() callback. + */ +static int walk_page_test(unsigned long start, unsigned long end, + struct mm_walk *walk) +{ + struct vm_area_struct *vma = walk->vma; + if (walk->test_walk) + return walk->test_walk(start, end, walk); + + /* + * Do not walk over vma(VM_PFNMAP), because we have no valid struct + * page backing a VM_PFNMAP range. See also commit a9ff785e4437. + */ + if (vma->vm_flags & VM_PFNMAP) + return 1; + return 0; +} + +static int __walk_page_range(unsigned long start, unsigned long end, + struct mm_walk *walk) +{ + int err = 0; + struct vm_area_struct *vma = walk->vma; + + if (vma && is_vm_hugetlb_page(vma)) { + if (walk->hugetlb_entry) + err = walk_hugetlb_range(start, end, walk); + } else + err = walk_pgd_range(start, end, walk); + + return err; +} /** - * walk_page_range - walk a memory map's page tables with a callback - * @addr: starting address - * @end: ending address - * @walk: set of callbacks to invoke for each level of the tree - * - * Recursively walk the page table for the memory area in a VMA, - * calling supplied callbacks. Callbacks are called in-order (first - * PGD, first PUD, first PMD, first PTE, second PTE... second PMD, - * etc.). If lower-level callbacks are omitted, walking depth is reduced. + * walk_page_range - walk page table with caller specific callbacks * - * Each callback receives an entry pointer and the start and end of the - * associated range, and a copy of the original mm_walk for access to - * the ->private or ->mm fields. + * Recursively walk the page table tree of the process represented by @walk->mm + * within the virtual address range [@start, @end). During walking, we can do + * some caller-specific works for each entry, by setting up pmd_entry(), + * pte_entry(), and/or hugetlb_entry(). If you don't set up for some of these + * callbacks, the associated entries/pages are just ignored. + * The return values of these callbacks are commonly defined like below: + * - 0 : succeeded to handle the current entry, and if you don't reach the + * end address yet, continue to walk. + * - >0 : succeeded to handle the current entry, and return to the caller + * with caller specific value. 
+ * - <0 : failed to handle the current entry, and return to the caller + * with error code. * - * Usually no locks are taken, but splitting transparent huge page may - * take page table lock. And the bottom level iterator will map PTE - * directories from highmem if necessary. + * Before starting to walk page table, some callers want to check whether + * they really want to walk over the current vma, typically by checking + * its vm_flags. walk_page_test() and @walk->test_walk() are used for this + * purpose. * - * If any callback returns a non-zero value, the walk is aborted and - * the return value is propagated back to the caller. Otherwise 0 is returned. + * struct mm_walk keeps current values of some common data like vma and pmd, + * which are useful for the access from callbacks. If you want to pass some + * caller-specific data to callbacks, @walk->private should be helpful. * - * walk->mm->mmap_sem must be held for at least read if walk->hugetlb_entry - * is !NULL. + * Locking: + * Callers of walk_page_range() and walk_page_vma() should hold + * @walk->mm->mmap_sem, because these function traverse vma list and/or + * access to vma's data. */ -int walk_page_range(unsigned long addr, unsigned long end, +int walk_page_range(unsigned long start, unsigned long end, struct mm_walk *walk) { - pgd_t *pgd; - unsigned long next; int err = 0; + unsigned long next; + struct vm_area_struct *vma; - if (addr >= end) - return err; + if (start >= end) + return -EINVAL; if (!walk->mm) return -EINVAL; VM_BUG_ON_MM(!rwsem_is_locked(&walk->mm->mmap_sem), walk->mm); - pgd = pgd_offset(walk->mm, addr); + vma = find_vma(walk->mm, start); do { - struct vm_area_struct *vma = NULL; + if (!vma) { /* after the last vma */ + walk->vma = NULL; + next = end; + } else if (start < vma->vm_start) { /* outside vma */ + walk->vma = NULL; + next = min(end, vma->vm_start); + } else { /* inside vma */ + walk->vma = vma; + next = min(end, vma->vm_end); + vma = vma->vm_next; - next = pgd_addr_end(addr, end); - - /* - * This function was not intended to be vma based. - * But there are vma special cases to be handled: - * - hugetlb vma's - * - VM_PFNMAP vma's - */ - vma = find_vma(walk->mm, addr); - if (vma) { - /* - * There are no page structures backing a VM_PFNMAP - * range, so do not allow split_huge_page_pmd(). - */ - if ((vma->vm_start <= addr) && - (vma->vm_flags & VM_PFNMAP)) { - if (walk->pte_hole) - err = walk->pte_hole(addr, next, walk); - if (err) - break; - pgd = pgd_offset(walk->mm, next); + err = walk_page_test(start, next, walk); + if (err > 0) continue; - } - /* - * Handle hugetlb vma individually because pagetable - * walk for the hugetlb page is dependent on the - * architecture and we can't handled it in the same - * manner as non-huge pages. - */ - if (walk->hugetlb_entry && (vma->vm_start <= addr) && - is_vm_hugetlb_page(vma)) { - if (vma->vm_end < next) - next = vma->vm_end; - /* - * Hugepage is very tightly coupled with vma, - * so walk through hugetlb entries within a - * given vma. 
- */ - err = walk_hugetlb_range(vma, addr, next, walk); - if (err) - break; - pgd = pgd_offset(walk->mm, next); - continue; - } - } - - if (pgd_none_or_clear_bad(pgd)) { - if (walk->pte_hole) - err = walk->pte_hole(addr, next, walk); - if (err) + if (err < 0) break; - pgd++; - continue; } - if (walk->pmd_entry || walk->pte_entry) - err = walk_pud_range(pgd, addr, next, walk); + if (walk->vma || walk->pte_hole) + err = __walk_page_range(start, next, walk); if (err) break; - pgd++; - } while (addr = next, addr < end); - + } while (start = next, start < end); return err; } -- cgit v1.2.3 From 8fd835760e36c1d010c7cce98492ccfd1e679eb1 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Fri, 30 Jan 2015 13:11:56 +1100 Subject: pagewalk: add walk_page_vma() Introduce walk_page_vma(), which is useful for the callers which want to walk over a given vma. It's used by later patches. Signed-off-by: Naoya Horiguchi Acked-by: Kirill A. Shutemov Cc: "Kirill A. Shutemov" Cc: Andrea Arcangeli Cc: Cyrill Gorcunov Cc: Dave Hansen Cc: Pavel Emelyanov Cc: Benjamin Herrenschmidt Signed-off-by: Andrew Morton --- include/linux/mm.h | 1 + mm/pagewalk.c | 18 ++++++++++++++++++ 2 files changed, 19 insertions(+) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 68804915da21..39c7f4f09555 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1194,6 +1194,7 @@ struct mm_walk { int walk_page_range(unsigned long addr, unsigned long end, struct mm_walk *walk); +int walk_page_vma(struct vm_area_struct *vma, struct mm_walk *walk); void free_pgd_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling); int copy_page_range(struct mm_struct *dst, struct mm_struct *src, diff --git a/mm/pagewalk.c b/mm/pagewalk.c index d9cc3caae802..4c9a653ba563 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -272,3 +272,21 @@ int walk_page_range(unsigned long start, unsigned long end, } while (start = next, start < end); return err; } + +int walk_page_vma(struct vm_area_struct *vma, struct mm_walk *walk) +{ + int err; + + if (!walk->mm) + return -EINVAL; + + VM_BUG_ON(!rwsem_is_locked(&walk->mm->mmap_sem)); + VM_BUG_ON(!vma); + walk->vma = vma; + err = walk_page_test(vma->vm_start, vma->vm_end, walk); + if (err > 0) + return 0; + if (err < 0) + return err; + return __walk_page_range(vma->vm_start, vma->vm_end, walk); +} -- cgit v1.2.3 From bbbdf35d82357eb95fd42f7e4a7a023efaf68b84 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Fri, 30 Jan 2015 13:11:59 +1100 Subject: slab: embed memcg_cache_params to kmem_cache Currently, kmem_cache stores a pointer to struct memcg_cache_params instead of embedding it. The rationale is to save memory when kmem accounting is disabled. However, the memcg_cache_params has shrivelled drastically since it was first introduced: * Initially: struct memcg_cache_params { bool is_root_cache; union { struct kmem_cache *memcg_caches[0]; struct { struct mem_cgroup *memcg; struct list_head list; struct kmem_cache *root_cache; bool dead; atomic_t nr_pages; struct work_struct destroy; }; }; }; * Now: struct memcg_cache_params { bool is_root_cache; union { struct { struct rcu_head rcu_head; struct kmem_cache *memcg_caches[0]; }; struct { struct mem_cgroup *memcg; struct kmem_cache *root_cache; }; }; }; So the memory saving does not seem to be a clear win anymore. OTOH, keeping a pointer to memcg_cache_params struct instead of embedding it results in touching one more cache line on kmem alloc/free hot paths. 
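(The extra cache line is easiest to see on a helper that sits on every kmem alloc/free path; the mm/slab.h hunk below turns the first form of is_root_cache() into the second:)

	/* before: chase the memcg_params pointer, possibly into another cache line */
	static inline bool is_root_cache(struct kmem_cache *s)
	{
		return !s->memcg_params || s->memcg_params->is_root_cache;
	}

	/* after: the flag lives inside struct kmem_cache itself */
	static inline bool is_root_cache(struct kmem_cache *s)
	{
		return s->memcg_params.is_root_cache;
	}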
Besides, it makes linking kmem caches in a list chained by a field of struct memcg_cache_params really painful due to a level of indirection, while I want to make them linked in the following patch. That said, let us embed it. Signed-off-by: Vladimir Davydov Cc: Johannes Weiner Cc: Michal Hocko Cc: Tejun Heo Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Dave Chinner Signed-off-by: Andrew Morton --- include/linux/slab.h | 17 +++---- include/linux/slab_def.h | 2 +- include/linux/slub_def.h | 2 +- mm/memcontrol.c | 11 ++--- mm/slab.h | 48 +++++++++--------- mm/slab_common.c | 126 ++++++++++++++++++++++++++--------------------- mm/slub.c | 5 +- 7 files changed, 109 insertions(+), 102 deletions(-) (limited to 'include') diff --git a/include/linux/slab.h b/include/linux/slab.h index 2e3b448cfa2d..1e03c11bbfbd 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -473,14 +473,14 @@ static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node) #ifndef ARCH_SLAB_MINALIGN #define ARCH_SLAB_MINALIGN __alignof__(unsigned long long) #endif + +struct memcg_cache_array { + struct rcu_head rcu; + struct kmem_cache *entries[0]; +}; + /* * This is the main placeholder for memcg-related information in kmem caches. - * struct kmem_cache will hold a pointer to it, so the memory cost while - * disabled is 1 pointer. The runtime cost while enabled, gets bigger than it - * would otherwise be if that would be bundled in kmem_cache: we'll need an - * extra pointer chase. But the trade off clearly lays in favor of not - * penalizing non-users. - * * Both the root cache and the child caches will have it. For the root cache, * this will hold a dynamically allocated array large enough to hold * information about the currently limited memcgs in the system. 
To allow the @@ -495,10 +495,7 @@ static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node) struct memcg_cache_params { bool is_root_cache; union { - struct { - struct rcu_head rcu_head; - struct kmem_cache *memcg_caches[0]; - }; + struct memcg_cache_array __rcu *memcg_caches; struct { struct mem_cgroup *memcg; struct kmem_cache *root_cache; diff --git a/include/linux/slab_def.h b/include/linux/slab_def.h index b869d1662ba3..33d049066c3d 100644 --- a/include/linux/slab_def.h +++ b/include/linux/slab_def.h @@ -70,7 +70,7 @@ struct kmem_cache { int obj_offset; #endif /* CONFIG_DEBUG_SLAB */ #ifdef CONFIG_MEMCG_KMEM - struct memcg_cache_params *memcg_params; + struct memcg_cache_params memcg_params; #endif struct kmem_cache_node *node[MAX_NUMNODES]; diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index d82abd40a3c0..9abf04ed0999 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -85,7 +85,7 @@ struct kmem_cache { struct kobject kobj; /* For sysfs */ #endif #ifdef CONFIG_MEMCG_KMEM - struct memcg_cache_params *memcg_params; + struct memcg_cache_params memcg_params; int max_attr_size; /* for propagation, maximum size of a stored attr */ #ifdef CONFIG_SYSFS struct kset *memcg_kset; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index a2894013c97d..7ad37627dc6b 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -332,7 +332,7 @@ struct mem_cgroup { struct cg_proto tcp_mem; #endif #if defined(CONFIG_MEMCG_KMEM) - /* Index in the kmem_cache->memcg_params->memcg_caches array */ + /* Index in the kmem_cache->memcg_params.memcg_caches array */ int kmemcg_id; #endif @@ -531,7 +531,7 @@ static void disarm_sock_keys(struct mem_cgroup *memcg) #ifdef CONFIG_MEMCG_KMEM /* - * This will be the memcg's index in each cache's ->memcg_params->memcg_caches. + * This will be the memcg's index in each cache's ->memcg_params.memcg_caches. * The main reason for not using cgroup id for this: * this works better in sparse environments, where we have a lot of memcgs, * but only a few kmem-limited. 
Or also, if we have, for instance, 200 @@ -2667,8 +2667,7 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep) struct mem_cgroup *memcg; struct kmem_cache *memcg_cachep; - VM_BUG_ON(!cachep->memcg_params); - VM_BUG_ON(!cachep->memcg_params->is_root_cache); + VM_BUG_ON(!is_root_cache(cachep)); if (current->memcg_kmem_skip_account) return cachep; @@ -2702,7 +2701,7 @@ out: void __memcg_kmem_put_cache(struct kmem_cache *cachep) { if (!is_root_cache(cachep)) - css_put(&cachep->memcg_params->memcg->css); + css_put(&cachep->memcg_params.memcg->css); } /* @@ -2778,7 +2777,7 @@ struct mem_cgroup *__mem_cgroup_from_kmem(void *ptr) if (PageSlab(page)) { cachep = page->slab_cache; if (!is_root_cache(cachep)) - memcg = cachep->memcg_params->memcg; + memcg = cachep->memcg_params.memcg; } else /* page allocated by alloc_kmem_pages */ memcg = page->mem_cgroup; diff --git a/mm/slab.h b/mm/slab.h index 90430d6f665e..53a623f85931 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -86,8 +86,6 @@ extern struct kmem_cache *create_kmalloc_cache(const char *name, size_t size, extern void create_boot_cache(struct kmem_cache *, const char *name, size_t size, unsigned long flags); -struct mem_cgroup; - int slab_unmergeable(struct kmem_cache *s); struct kmem_cache *find_mergeable(size_t size, size_t align, unsigned long flags, const char *name, void (*ctor)(void *)); @@ -167,14 +165,13 @@ ssize_t slabinfo_write(struct file *file, const char __user *buffer, #ifdef CONFIG_MEMCG_KMEM static inline bool is_root_cache(struct kmem_cache *s) { - return !s->memcg_params || s->memcg_params->is_root_cache; + return s->memcg_params.is_root_cache; } static inline bool slab_equal_or_root(struct kmem_cache *s, - struct kmem_cache *p) + struct kmem_cache *p) { - return (p == s) || - (s->memcg_params && (p == s->memcg_params->root_cache)); + return p == s || p == s->memcg_params.root_cache; } /* @@ -185,37 +182,30 @@ static inline bool slab_equal_or_root(struct kmem_cache *s, static inline const char *cache_name(struct kmem_cache *s) { if (!is_root_cache(s)) - return s->memcg_params->root_cache->name; + s = s->memcg_params.root_cache; return s->name; } /* * Note, we protect with RCU only the memcg_caches array, not per-memcg caches. - * That said the caller must assure the memcg's cache won't go away. Since once - * created a memcg's cache is destroyed only along with the root cache, it is - * true if we are going to allocate from the cache or hold a reference to the - * root cache by other means. Otherwise, we should hold either the slab_mutex - * or the memcg's slab_caches_mutex while calling this function and accessing - * the returned value. + * That said the caller must assure the memcg's cache won't go away by either + * taking a css reference to the owner cgroup, or holding the slab_mutex. */ static inline struct kmem_cache * cache_from_memcg_idx(struct kmem_cache *s, int idx) { struct kmem_cache *cachep; - struct memcg_cache_params *params; - - if (!s->memcg_params) - return NULL; + struct memcg_cache_array *arr; rcu_read_lock(); - params = rcu_dereference(s->memcg_params); + arr = rcu_dereference(s->memcg_params.memcg_caches); /* * Make sure we will access the up-to-date value. The code updating * memcg_caches issues a write barrier to match this (see - * memcg_register_cache()). + * memcg_create_kmem_cache()). 
*/ - cachep = lockless_dereference(params->memcg_caches[idx]); + cachep = lockless_dereference(arr->entries[idx]); rcu_read_unlock(); return cachep; @@ -225,7 +215,7 @@ static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s) { if (is_root_cache(s)) return s; - return s->memcg_params->root_cache; + return s->memcg_params.root_cache; } static __always_inline int memcg_charge_slab(struct kmem_cache *s, @@ -235,7 +225,7 @@ static __always_inline int memcg_charge_slab(struct kmem_cache *s, return 0; if (is_root_cache(s)) return 0; - return memcg_charge_kmem(s->memcg_params->memcg, gfp, 1 << order); + return memcg_charge_kmem(s->memcg_params.memcg, gfp, 1 << order); } static __always_inline void memcg_uncharge_slab(struct kmem_cache *s, int order) @@ -244,9 +234,13 @@ static __always_inline void memcg_uncharge_slab(struct kmem_cache *s, int order) return; if (is_root_cache(s)) return; - memcg_uncharge_kmem(s->memcg_params->memcg, 1 << order); + memcg_uncharge_kmem(s->memcg_params.memcg, 1 << order); } -#else + +extern void slab_init_memcg_params(struct kmem_cache *); + +#else /* !CONFIG_MEMCG_KMEM */ + static inline bool is_root_cache(struct kmem_cache *s) { return true; @@ -282,7 +276,11 @@ static inline int memcg_charge_slab(struct kmem_cache *s, gfp_t gfp, int order) static inline void memcg_uncharge_slab(struct kmem_cache *s, int order) { } -#endif + +static inline void slab_init_memcg_params(struct kmem_cache *s) +{ +} +#endif /* CONFIG_MEMCG_KMEM */ static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) { diff --git a/mm/slab_common.c b/mm/slab_common.c index 42bb22cb4219..4f1492a9e2da 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -106,62 +106,65 @@ static inline int kmem_cache_sanity_check(const char *name, size_t size) #endif #ifdef CONFIG_MEMCG_KMEM -static int memcg_alloc_cache_params(struct mem_cgroup *memcg, - struct kmem_cache *s, struct kmem_cache *root_cache) +void slab_init_memcg_params(struct kmem_cache *s) { - size_t size; + s->memcg_params.is_root_cache = true; + RCU_INIT_POINTER(s->memcg_params.memcg_caches, NULL); +} + +static int init_memcg_params(struct kmem_cache *s, + struct mem_cgroup *memcg, struct kmem_cache *root_cache) +{ + struct memcg_cache_array *arr; - if (!memcg_kmem_enabled()) + if (memcg) { + s->memcg_params.is_root_cache = false; + s->memcg_params.memcg = memcg; + s->memcg_params.root_cache = root_cache; return 0; + } - if (!memcg) { - size = offsetof(struct memcg_cache_params, memcg_caches); - size += memcg_nr_cache_ids * sizeof(void *); - } else - size = sizeof(struct memcg_cache_params); + slab_init_memcg_params(s); - s->memcg_params = kzalloc(size, GFP_KERNEL); - if (!s->memcg_params) - return -ENOMEM; + if (!memcg_nr_cache_ids) + return 0; - if (memcg) { - s->memcg_params->memcg = memcg; - s->memcg_params->root_cache = root_cache; - } else - s->memcg_params->is_root_cache = true; + arr = kzalloc(sizeof(struct memcg_cache_array) + + memcg_nr_cache_ids * sizeof(void *), + GFP_KERNEL); + if (!arr) + return -ENOMEM; + RCU_INIT_POINTER(s->memcg_params.memcg_caches, arr); return 0; } -static void memcg_free_cache_params(struct kmem_cache *s) +static void destroy_memcg_params(struct kmem_cache *s) { - kfree(s->memcg_params); + if (is_root_cache(s)) + kfree(rcu_access_pointer(s->memcg_params.memcg_caches)); } -static int memcg_update_cache_params(struct kmem_cache *s, int num_memcgs) +static int update_memcg_params(struct kmem_cache *s, int new_array_size) { - int size; - struct memcg_cache_params *new_params, 
*cur_params; + struct memcg_cache_array *old, *new; - BUG_ON(!is_root_cache(s)); - - size = offsetof(struct memcg_cache_params, memcg_caches); - size += num_memcgs * sizeof(void *); + if (!is_root_cache(s)) + return 0; - new_params = kzalloc(size, GFP_KERNEL); - if (!new_params) + old = rcu_dereference_protected(s->memcg_params.memcg_caches, + lockdep_is_held(&slab_mutex)); + new = kzalloc(sizeof(struct memcg_cache_array) + + new_array_size * sizeof(void *), GFP_KERNEL); + if (!new) return -ENOMEM; - cur_params = s->memcg_params; - memcpy(new_params->memcg_caches, cur_params->memcg_caches, + memcpy(new->entries, old->entries, memcg_nr_cache_ids * sizeof(void *)); - new_params->is_root_cache = true; - - rcu_assign_pointer(s->memcg_params, new_params); - if (cur_params) - kfree_rcu(cur_params, rcu_head); - + rcu_assign_pointer(s->memcg_params.memcg_caches, new); + if (old) + kfree_rcu(old, rcu); return 0; } @@ -172,10 +175,7 @@ int memcg_update_all_caches(int num_memcgs) mutex_lock(&slab_mutex); list_for_each_entry(s, &slab_caches, list) { - if (!is_root_cache(s)) - continue; - - ret = memcg_update_cache_params(s, num_memcgs); + ret = update_memcg_params(s, num_memcgs); /* * Instead of freeing the memory, we'll just leave the caches * up to this point in an updated state. @@ -187,13 +187,13 @@ int memcg_update_all_caches(int num_memcgs) return ret; } #else -static inline int memcg_alloc_cache_params(struct mem_cgroup *memcg, - struct kmem_cache *s, struct kmem_cache *root_cache) +static inline int init_memcg_params(struct kmem_cache *s, + struct mem_cgroup *memcg, struct kmem_cache *root_cache) { return 0; } -static inline void memcg_free_cache_params(struct kmem_cache *s) +static inline void destroy_memcg_params(struct kmem_cache *s) { } #endif /* CONFIG_MEMCG_KMEM */ @@ -311,7 +311,7 @@ do_kmem_cache_create(char *name, size_t object_size, size_t size, size_t align, s->align = align; s->ctor = ctor; - err = memcg_alloc_cache_params(memcg, s, root_cache); + err = init_memcg_params(s, memcg, root_cache); if (err) goto out_free_cache; @@ -327,7 +327,7 @@ out: return s; out_free_cache: - memcg_free_cache_params(s); + destroy_memcg_params(s); kfree(s); goto out; } @@ -439,11 +439,15 @@ static int do_kmem_cache_shutdown(struct kmem_cache *s, #ifdef CONFIG_MEMCG_KMEM if (!is_root_cache(s)) { - struct kmem_cache *root_cache = s->memcg_params->root_cache; - int memcg_id = memcg_cache_id(s->memcg_params->memcg); - - BUG_ON(root_cache->memcg_params->memcg_caches[memcg_id] != s); - root_cache->memcg_params->memcg_caches[memcg_id] = NULL; + int idx; + struct memcg_cache_array *arr; + + idx = memcg_cache_id(s->memcg_params.memcg); + arr = rcu_dereference_protected(s->memcg_params.root_cache-> + memcg_params.memcg_caches, + lockdep_is_held(&slab_mutex)); + BUG_ON(arr->entries[idx] != s); + arr->entries[idx] = NULL; } #endif list_move(&s->list, release); @@ -481,27 +485,32 @@ void memcg_create_kmem_cache(struct mem_cgroup *memcg, struct kmem_cache *root_cache) { static char memcg_name_buf[NAME_MAX + 1]; /* protected by slab_mutex */ - int memcg_id = memcg_cache_id(memcg); + struct memcg_cache_array *arr; struct kmem_cache *s = NULL; char *cache_name; + int idx; get_online_cpus(); get_online_mems(); mutex_lock(&slab_mutex); + idx = memcg_cache_id(memcg); + arr = rcu_dereference_protected(root_cache->memcg_params.memcg_caches, + lockdep_is_held(&slab_mutex)); + /* * Since per-memcg caches are created asynchronously on first * allocation (see memcg_kmem_get_cache()), several threads can try to * create the 
same cache, but only one of them may succeed. */ - if (cache_from_memcg_idx(root_cache, memcg_id)) + if (arr->entries[idx]) goto out_unlock; cgroup_name(mem_cgroup_css(memcg)->cgroup, memcg_name_buf, sizeof(memcg_name_buf)); cache_name = kasprintf(GFP_KERNEL, "%s(%d:%s)", root_cache->name, - memcg_cache_id(memcg), memcg_name_buf); + idx, memcg_name_buf); if (!cache_name) goto out_unlock; @@ -525,7 +534,7 @@ void memcg_create_kmem_cache(struct mem_cgroup *memcg, * initialized. */ smp_wmb(); - root_cache->memcg_params->memcg_caches[memcg_id] = s; + arr->entries[idx] = s; out_unlock: mutex_unlock(&slab_mutex); @@ -545,7 +554,7 @@ void memcg_destroy_kmem_caches(struct mem_cgroup *memcg) mutex_lock(&slab_mutex); list_for_each_entry_safe(s, s2, &slab_caches, list) { - if (is_root_cache(s) || s->memcg_params->memcg != memcg) + if (is_root_cache(s) || s->memcg_params.memcg != memcg) continue; /* * The cgroup is about to be freed and therefore has no charges @@ -564,7 +573,7 @@ void memcg_destroy_kmem_caches(struct mem_cgroup *memcg) void slab_kmem_cache_release(struct kmem_cache *s) { - memcg_free_cache_params(s); + destroy_memcg_params(s); kfree(s->name); kmem_cache_free(kmem_cache, s); } @@ -640,6 +649,9 @@ void __init create_boot_cache(struct kmem_cache *s, const char *name, size_t siz s->name = name; s->size = s->object_size = size; s->align = calculate_alignment(flags, ARCH_KMALLOC_MINALIGN, size); + + slab_init_memcg_params(s); + err = __kmem_cache_create(s, flags); if (err) @@ -980,7 +992,7 @@ int memcg_slab_show(struct seq_file *m, void *p) if (p == slab_caches.next) print_slabinfo_header(m); - if (!is_root_cache(s) && s->memcg_params->memcg == memcg) + if (!is_root_cache(s) && s->memcg_params.memcg == memcg) cache_show(s, m); return 0; } diff --git a/mm/slub.c b/mm/slub.c index 8b8508adf9c2..75d55fdfe3a1 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3577,6 +3577,7 @@ static struct kmem_cache * __init bootstrap(struct kmem_cache *static_cache) p->slab_cache = s; #endif } + slab_init_memcg_params(s); list_add(&s->list, &slab_caches); return s; } @@ -4964,7 +4965,7 @@ static void memcg_propagate_slab_attrs(struct kmem_cache *s) if (is_root_cache(s)) return; - root_cache = s->memcg_params->root_cache; + root_cache = s->memcg_params.root_cache; /* * This mean this cache had no attribute written. Therefore, no point @@ -5044,7 +5045,7 @@ static inline struct kset *cache_kset(struct kmem_cache *s) { #ifdef CONFIG_MEMCG_KMEM if (!is_root_cache(s)) - return s->memcg_params->root_cache->memcg_kset; + return s->memcg_params.root_cache->memcg_kset; #endif return slab_kset; } -- cgit v1.2.3 From 068feec9a3f80b27022542e516e8f0a235c0d7e9 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Fri, 30 Jan 2015 13:11:59 +1100 Subject: slab: link memcg caches of the same kind into a list Sometimes, we need to iterate over all memcg copies of a particular root kmem cache. Currently, we use memcg_cache_params->memcg_caches array for that, because it contains all existing memcg caches. However, it's a bad practice to keep all caches, including those that belong to offline cgroups, in this array, because it will be growing beyond any bounds then. I'm going to wipe away dead caches from it to save space. To still be able to perform iterations over all memcg caches of the same kind, let us link them into a list. 
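As a rough userland illustration of the intrusive-list pattern this patch relies on (all names below are invented for the example; this is not the kernel implementation, which uses the doubly-linked list_head helpers), the root object can own the list head while each child embeds a node, and iteration recovers the child from its embedded node:

	#include <stddef.h>
	#include <stdio.h>

	struct node {
		struct node *next;
	};

	struct fake_cache {
		const char *name;
		struct node link;	/* plays the role of memcg_params.list */
		struct node children;	/* meaningful only for the "root" cache */
	};

	#define container_of(ptr, type, member) \
		((type *)((char *)(ptr) - offsetof(type, member)))

	static void add_child(struct fake_cache *root, struct fake_cache *child)
	{
		/* push the child's embedded node onto the root's list */
		child->link.next = root->children.next;
		root->children.next = &child->link;
	}

	int main(void)
	{
		struct fake_cache root = { .name = "root" };
		struct fake_cache c1 = { .name = "memcg:1" };
		struct fake_cache c2 = { .name = "memcg:2" };
		struct node *n;

		add_child(&root, &c1);
		add_child(&root, &c2);

		/* rough analogue of for_each_memcg_cache(iter, root) */
		for (n = root.children.next; n; n = n->next) {
			struct fake_cache *c =
				container_of(n, struct fake_cache, link);
			printf("child cache: %s\n", c->name);
		}
		return 0;
	}

The point of embedding the node in the per-cache params rather than keeping a side array is that membership follows the cache's own lifetime: dead caches can simply be unlinked.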
Signed-off-by: Vladimir Davydov Cc: Johannes Weiner Cc: Michal Hocko Cc: Tejun Heo Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Dave Chinner Signed-off-by: Andrew Morton --- include/linux/slab.h | 4 ++++ mm/slab.c | 13 +++++-------- mm/slab.h | 17 +++++++++++++++++ mm/slab_common.c | 21 ++++++++++----------- mm/slub.c | 19 +++++-------------- 5 files changed, 41 insertions(+), 33 deletions(-) (limited to 'include') diff --git a/include/linux/slab.h b/include/linux/slab.h index 1e03c11bbfbd..26d99f41b410 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -491,9 +491,13 @@ struct memcg_cache_array { * * @memcg: pointer to the memcg this cache belongs to * @root_cache: pointer to the global, root cache, this cache was derived from + * + * Both root and child caches of the same kind are linked into a list chained + * through @list. */ struct memcg_cache_params { bool is_root_cache; + struct list_head list; union { struct memcg_cache_array __rcu *memcg_caches; struct { diff --git a/mm/slab.c b/mm/slab.c index 65b5dcb6f671..7894017bc160 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3708,8 +3708,7 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit, int batchcount, int shared, gfp_t gfp) { int ret; - struct kmem_cache *c = NULL; - int i = 0; + struct kmem_cache *c; ret = __do_tune_cpucache(cachep, limit, batchcount, shared, gfp); @@ -3719,12 +3718,10 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit, if ((ret < 0) || !is_root_cache(cachep)) return ret; - VM_BUG_ON(!mutex_is_locked(&slab_mutex)); - for_each_memcg_cache_index(i) { - c = cache_from_memcg_idx(cachep, i); - if (c) - /* return value determined by the parent cache only */ - __do_tune_cpucache(c, limit, batchcount, shared, gfp); + lockdep_assert_held(&slab_mutex); + for_each_memcg_cache(c, cachep) { + /* return value determined by the root cache only */ + __do_tune_cpucache(c, limit, batchcount, shared, gfp); } return ret; diff --git a/mm/slab.h b/mm/slab.h index 53a623f85931..2fc16c2ed198 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -163,6 +163,18 @@ ssize_t slabinfo_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos); #ifdef CONFIG_MEMCG_KMEM +/* + * Iterate over all memcg caches of the given root cache. The caller must hold + * slab_mutex. 
+ */ +#define for_each_memcg_cache(iter, root) \ + list_for_each_entry(iter, &(root)->memcg_params.list, \ + memcg_params.list) + +#define for_each_memcg_cache_safe(iter, tmp, root) \ + list_for_each_entry_safe(iter, tmp, &(root)->memcg_params.list, \ + memcg_params.list) + static inline bool is_root_cache(struct kmem_cache *s) { return s->memcg_params.is_root_cache; @@ -241,6 +253,11 @@ extern void slab_init_memcg_params(struct kmem_cache *); #else /* !CONFIG_MEMCG_KMEM */ +#define for_each_memcg_cache(iter, root) \ + for (iter = NULL, (root); 0; ) +#define for_each_memcg_cache_safe(iter, tmp, root) \ + for (iter = NULL, tmp = NULL, (root); 0; ) + static inline bool is_root_cache(struct kmem_cache *s) { return true; diff --git a/mm/slab_common.c b/mm/slab_common.c index 02e5850a9c77..6836833ee253 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -109,6 +109,7 @@ static inline int kmem_cache_sanity_check(const char *name, size_t size) void slab_init_memcg_params(struct kmem_cache *s) { s->memcg_params.is_root_cache = true; + INIT_LIST_HEAD(&s->memcg_params.list); RCU_INIT_POINTER(s->memcg_params.memcg_caches, NULL); } @@ -449,6 +450,7 @@ static int do_kmem_cache_shutdown(struct kmem_cache *s, lockdep_is_held(&slab_mutex)); BUG_ON(arr->entries[idx] != s); arr->entries[idx] = NULL; + list_del(&s->memcg_params.list); } #endif list_move(&s->list, release); @@ -529,6 +531,8 @@ void memcg_create_kmem_cache(struct mem_cgroup *memcg, goto out_unlock; } + list_add(&s->memcg_params.list, &root_cache->memcg_params.list); + /* * Since readers won't lock (see cache_from_memcg_idx()), we need a * barrier here to ensure nobody will see the kmem_cache partially @@ -581,11 +585,13 @@ void slab_kmem_cache_release(struct kmem_cache *s) void kmem_cache_destroy(struct kmem_cache *s) { - int i; + struct kmem_cache *c, *c2; LIST_HEAD(release); bool need_rcu_barrier = false; bool busy = false; + BUG_ON(!is_root_cache(s)); + get_online_cpus(); get_online_mems(); @@ -595,10 +601,8 @@ void kmem_cache_destroy(struct kmem_cache *s) if (s->refcount) goto out_unlock; - for_each_memcg_cache_index(i) { - struct kmem_cache *c = cache_from_memcg_idx(s, i); - - if (c && do_kmem_cache_shutdown(c, &release, &need_rcu_barrier)) + for_each_memcg_cache_safe(c, c2, s) { + if (do_kmem_cache_shutdown(c, &release, &need_rcu_barrier)) busy = true; } @@ -932,16 +936,11 @@ memcg_accumulate_slabinfo(struct kmem_cache *s, struct slabinfo *info) { struct kmem_cache *c; struct slabinfo sinfo; - int i; if (!is_root_cache(s)) return; - for_each_memcg_cache_index(i) { - c = cache_from_memcg_idx(s, i); - if (!c) - continue; - + for_each_memcg_cache(c, s) { memset(&sinfo, 0, sizeof(sinfo)); get_slabinfo(c, &sinfo); diff --git a/mm/slub.c b/mm/slub.c index 75d55fdfe3a1..1e5a4636cb23 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3636,13 +3636,10 @@ struct kmem_cache * __kmem_cache_alias(const char *name, size_t size, size_t align, unsigned long flags, void (*ctor)(void *)) { - struct kmem_cache *s; + struct kmem_cache *s, *c; s = find_mergeable(size, align, flags, name, ctor); if (s) { - int i; - struct kmem_cache *c; - s->refcount++; /* @@ -3652,10 +3649,7 @@ __kmem_cache_alias(const char *name, size_t size, size_t align, s->object_size = max(s->object_size, (int)size); s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *))); - for_each_memcg_cache_index(i) { - c = cache_from_memcg_idx(s, i); - if (!c) - continue; + for_each_memcg_cache(c, s) { c->object_size = s->object_size; c->inuse = max_t(int, c->inuse, ALIGN(size, sizeof(void *))); 
@@ -4921,7 +4915,7 @@ static ssize_t slab_attr_store(struct kobject *kobj, err = attribute->store(s, buf, len); #ifdef CONFIG_MEMCG_KMEM if (slab_state >= FULL && err >= 0 && is_root_cache(s)) { - int i; + struct kmem_cache *c; mutex_lock(&slab_mutex); if (s->max_attr_size < len) @@ -4944,11 +4938,8 @@ static ssize_t slab_attr_store(struct kobject *kobj, * directly either failed or succeeded, in which case we loop * through the descendants with best-effort propagation. */ - for_each_memcg_cache_index(i) { - struct kmem_cache *c = cache_from_memcg_idx(s, i); - if (c) - attribute->store(c, buf, len); - } + for_each_memcg_cache(c, s) + attribute->store(c, buf, len); mutex_unlock(&slab_mutex); } #endif -- cgit v1.2.3 From 9b043b936d2745628d9942d4c06669689274cea5 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Fri, 30 Jan 2015 13:12:00 +1100 Subject: memcg: free memcg_caches slot on css offline We need to look up a kmem_cache in ->memcg_params.memcg_caches arrays only on allocations, so there is no need to have the array entries set until css free - we can clear them on css offline. This will allow us to reuse array entries more efficiently and avoid costly array relocations. Signed-off-by: Vladimir Davydov Cc: Johannes Weiner Cc: Michal Hocko Cc: Tejun Heo Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Dave Chinner Signed-off-by: Andrew Morton --- include/linux/slab.h | 10 +++++----- mm/memcontrol.c | 38 ++++++++++++++++++++++++++++++++------ mm/slab_common.c | 39 ++++++++++++++++++++++++++++----------- 3 files changed, 65 insertions(+), 22 deletions(-) (limited to 'include') diff --git a/include/linux/slab.h b/include/linux/slab.h index 26d99f41b410..ed2ffaab59ea 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -115,13 +115,12 @@ int slab_is_available(void); struct kmem_cache *kmem_cache_create(const char *, size_t, size_t, unsigned long, void (*)(void *)); -#ifdef CONFIG_MEMCG_KMEM -void memcg_create_kmem_cache(struct mem_cgroup *, struct kmem_cache *); -void memcg_destroy_kmem_caches(struct mem_cgroup *); -#endif void kmem_cache_destroy(struct kmem_cache *); int kmem_cache_shrink(struct kmem_cache *); -void kmem_cache_free(struct kmem_cache *, void *); + +void memcg_create_kmem_cache(struct mem_cgroup *, struct kmem_cache *); +void memcg_deactivate_kmem_caches(struct mem_cgroup *); +void memcg_destroy_kmem_caches(struct mem_cgroup *); /* * Please use this macro to create slab caches. 
Simply specify the @@ -288,6 +287,7 @@ static __always_inline int kmalloc_index(size_t size) void *__kmalloc(size_t size, gfp_t flags); void *kmem_cache_alloc(struct kmem_cache *, gfp_t flags); +void kmem_cache_free(struct kmem_cache *, void *); #ifdef CONFIG_NUMA void *__kmalloc_node(size_t size, gfp_t flags, int node); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 7ad37627dc6b..7423fe6c3a66 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -334,6 +334,7 @@ struct mem_cgroup { #if defined(CONFIG_MEMCG_KMEM) /* Index in the kmem_cache->memcg_params.memcg_caches array */ int kmemcg_id; + bool kmem_acct_active; #endif int last_scanned_node; @@ -354,7 +355,7 @@ struct mem_cgroup { #ifdef CONFIG_MEMCG_KMEM bool memcg_kmem_is_active(struct mem_cgroup *memcg) { - return memcg->kmemcg_id >= 0; + return memcg->kmem_acct_active; } #endif @@ -585,7 +586,7 @@ static void memcg_free_cache_id(int id); static void disarm_kmem_keys(struct mem_cgroup *memcg) { - if (memcg_kmem_is_active(memcg)) { + if (memcg->kmemcg_id >= 0) { static_key_slow_dec(&memcg_kmem_enabled_key); memcg_free_cache_id(memcg->kmemcg_id); } @@ -2666,6 +2667,7 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep) { struct mem_cgroup *memcg; struct kmem_cache *memcg_cachep; + int kmemcg_id; VM_BUG_ON(!is_root_cache(cachep)); @@ -2673,10 +2675,11 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep) return cachep; memcg = get_mem_cgroup_from_mm(current->mm); - if (!memcg_kmem_is_active(memcg)) + kmemcg_id = ACCESS_ONCE(memcg->kmemcg_id); + if (kmemcg_id < 0) goto out; - memcg_cachep = cache_from_memcg_idx(cachep, memcg_cache_id(memcg)); + memcg_cachep = cache_from_memcg_idx(cachep, kmemcg_id); if (likely(memcg_cachep)) return memcg_cachep; @@ -3318,8 +3321,8 @@ static int memcg_activate_kmem(struct mem_cgroup *memcg, int err = 0; int memcg_id; - if (memcg_kmem_is_active(memcg)) - return 0; + BUG_ON(memcg->kmemcg_id >= 0); + BUG_ON(memcg->kmem_acct_active); /* * For simplicity, we won't allow this to be disabled. It also can't @@ -3362,6 +3365,7 @@ static int memcg_activate_kmem(struct mem_cgroup *memcg, * patched. */ memcg->kmemcg_id = memcg_id; + memcg->kmem_acct_active = true; out: return err; } @@ -4041,6 +4045,22 @@ static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) return mem_cgroup_sockets_init(memcg, ss); } +static void memcg_deactivate_kmem(struct mem_cgroup *memcg) +{ + if (!memcg->kmem_acct_active) + return; + + /* + * Clear the 'active' flag before clearing memcg_caches arrays entries. + * Since we take the slab_mutex in memcg_deactivate_kmem_caches(), it + * guarantees no cache will be created for this cgroup after we are + * done (see memcg_create_kmem_cache()). 
+ */ + memcg->kmem_acct_active = false; + + memcg_deactivate_kmem_caches(memcg); +} + static void memcg_destroy_kmem(struct mem_cgroup *memcg) { memcg_destroy_kmem_caches(memcg); @@ -4052,6 +4072,10 @@ static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) return 0; } +static void memcg_deactivate_kmem(struct mem_cgroup *memcg) +{ +} + static void memcg_destroy_kmem(struct mem_cgroup *memcg) { } @@ -4608,6 +4632,8 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) spin_unlock(&memcg->event_list_lock); vmpressure_cleanup(&memcg->vmpressure); + + memcg_deactivate_kmem(memcg); } static void mem_cgroup_css_free(struct cgroup_subsys_state *css) diff --git a/mm/slab_common.c b/mm/slab_common.c index 87c8b124b6d8..4272a9057e65 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -440,18 +440,8 @@ static int do_kmem_cache_shutdown(struct kmem_cache *s, *need_rcu_barrier = true; #ifdef CONFIG_MEMCG_KMEM - if (!is_root_cache(s)) { - int idx; - struct memcg_cache_array *arr; - - idx = memcg_cache_id(s->memcg_params.memcg); - arr = rcu_dereference_protected(s->memcg_params.root_cache-> - memcg_params.memcg_caches, - lockdep_is_held(&slab_mutex)); - BUG_ON(arr->entries[idx] != s); - arr->entries[idx] = NULL; + if (!is_root_cache(s)) list_del(&s->memcg_params.list); - } #endif list_move(&s->list, release); return 0; @@ -499,6 +489,13 @@ void memcg_create_kmem_cache(struct mem_cgroup *memcg, mutex_lock(&slab_mutex); + /* + * The memory cgroup could have been deactivated while the cache + * creation work was pending. + */ + if (!memcg_kmem_is_active(memcg)) + goto out_unlock; + idx = memcg_cache_id(memcg); arr = rcu_dereference_protected(root_cache->memcg_params.memcg_caches, lockdep_is_held(&slab_mutex)); @@ -548,6 +545,26 @@ out_unlock: put_online_cpus(); } +void memcg_deactivate_kmem_caches(struct mem_cgroup *memcg) +{ + int idx; + struct memcg_cache_array *arr; + struct kmem_cache *s; + + idx = memcg_cache_id(memcg); + + mutex_lock(&slab_mutex); + list_for_each_entry(s, &slab_caches, list) { + if (!is_root_cache(s)) + continue; + + arr = rcu_dereference_protected(s->memcg_params.memcg_caches, + lockdep_is_held(&slab_mutex)); + arr->entries[idx] = NULL; + } + mutex_unlock(&slab_mutex); +} + void memcg_destroy_kmem_caches(struct mem_cgroup *memcg) { LIST_HEAD(release); -- cgit v1.2.3 From 9b6c263799c80ddb7bcd10363c26b7700516c373 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Fri, 30 Jan 2015 13:12:00 +1100 Subject: list_lru: add helpers to isolate items Currently, the isolate callback passed to the list_lru_walk family of functions is supposed to just delete an item from the list upon returning LRU_REMOVED or LRU_REMOVED_RETRY, while nr_items counter is fixed by __list_lru_walk_one after the callback returns. Since the callback is allowed to drop the lock after removing an item (it has to return LRU_REMOVED_RETRY then), the nr_items can be less than the actual number of elements on the list even if we check them under the lock. This makes it difficult to move items from one list_lru_one to another, which is required for per-memcg list_lru reparenting - we can't just splice the lists, we have to move entries one by one. This patch therefore introduces helpers that must be used by callback functions to isolate items instead of raw list_del/list_move. These are list_lru_isolate and list_lru_isolate_move. 
They not only remove the entry from the list, but also fix the nr_items counter, making sure nr_items always reflects the actual number of elements on the list if checked under the appropriate lock. Signed-off-by: Vladimir Davydov Cc: Johannes Weiner Cc: Michal Hocko Cc: Tejun Heo Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Dave Chinner Signed-off-by: Andrew Morton --- fs/dcache.c | 21 +++++++++++---------- fs/gfs2/quota.c | 5 +++-- fs/inode.c | 8 ++++---- fs/xfs/xfs_buf.c | 6 ++++-- fs/xfs/xfs_qm.c | 5 +++-- include/linux/list_lru.h | 9 +++++++-- mm/list_lru.c | 19 ++++++++++++++++--- mm/workingset.c | 3 ++- 8 files changed, 50 insertions(+), 26 deletions(-) (limited to 'include') diff --git a/fs/dcache.c b/fs/dcache.c index 56c5da89f58a..d04be762b216 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -400,19 +400,20 @@ static void d_shrink_add(struct dentry *dentry, struct list_head *list) * LRU lists entirely, while shrink_move moves it to the indicated * private list. */ -static void d_lru_isolate(struct dentry *dentry) +static void d_lru_isolate(struct list_lru_one *lru, struct dentry *dentry) { D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST); dentry->d_flags &= ~DCACHE_LRU_LIST; this_cpu_dec(nr_dentry_unused); - list_del_init(&dentry->d_lru); + list_lru_isolate(lru, &dentry->d_lru); } -static void d_lru_shrink_move(struct dentry *dentry, struct list_head *list) +static void d_lru_shrink_move(struct list_lru_one *lru, struct dentry *dentry, + struct list_head *list) { D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST); dentry->d_flags |= DCACHE_SHRINK_LIST; - list_move_tail(&dentry->d_lru, list); + list_lru_isolate_move(lru, &dentry->d_lru, list); } /* @@ -869,8 +870,8 @@ static void shrink_dentry_list(struct list_head *list) } } -static enum lru_status -dentry_lru_isolate(struct list_head *item, spinlock_t *lru_lock, void *arg) +static enum lru_status dentry_lru_isolate(struct list_head *item, + struct list_lru_one *lru, spinlock_t *lru_lock, void *arg) { struct list_head *freeable = arg; struct dentry *dentry = container_of(item, struct dentry, d_lru); @@ -890,7 +891,7 @@ dentry_lru_isolate(struct list_head *item, spinlock_t *lru_lock, void *arg) * another pass through the LRU. 
*/ if (dentry->d_lockref.count) { - d_lru_isolate(dentry); + d_lru_isolate(lru, dentry); spin_unlock(&dentry->d_lock); return LRU_REMOVED; } @@ -921,7 +922,7 @@ dentry_lru_isolate(struct list_head *item, spinlock_t *lru_lock, void *arg) return LRU_ROTATE; } - d_lru_shrink_move(dentry, freeable); + d_lru_shrink_move(lru, dentry, freeable); spin_unlock(&dentry->d_lock); return LRU_REMOVED; @@ -951,7 +952,7 @@ long prune_dcache_sb(struct super_block *sb, struct shrink_control *sc) } static enum lru_status dentry_lru_isolate_shrink(struct list_head *item, - spinlock_t *lru_lock, void *arg) + struct list_lru_one *lru, spinlock_t *lru_lock, void *arg) { struct list_head *freeable = arg; struct dentry *dentry = container_of(item, struct dentry, d_lru); @@ -964,7 +965,7 @@ static enum lru_status dentry_lru_isolate_shrink(struct list_head *item, if (!spin_trylock(&dentry->d_lock)) return LRU_SKIP; - d_lru_shrink_move(dentry, freeable); + d_lru_shrink_move(lru, dentry, freeable); spin_unlock(&dentry->d_lock); return LRU_REMOVED; diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index 56db71d5c95f..5073da38cf06 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -145,7 +145,8 @@ static void gfs2_qd_dispose(struct list_head *list) } -static enum lru_status gfs2_qd_isolate(struct list_head *item, spinlock_t *lock, void *arg) +static enum lru_status gfs2_qd_isolate(struct list_head *item, + struct list_lru_one *lru, spinlock_t *lru_lock, void *arg) { struct list_head *dispose = arg; struct gfs2_quota_data *qd = list_entry(item, struct gfs2_quota_data, qd_lru); @@ -155,7 +156,7 @@ static enum lru_status gfs2_qd_isolate(struct list_head *item, spinlock_t *lock, if (qd->qd_lockref.count == 0) { lockref_mark_dead(&qd->qd_lockref); - list_move(&qd->qd_lru, dispose); + list_lru_isolate_move(lru, &qd->qd_lru, dispose); } spin_unlock(&qd->qd_lockref.lock); diff --git a/fs/inode.c b/fs/inode.c index b80b17a09d36..198dbcd6554a 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -684,8 +684,8 @@ int invalidate_inodes(struct super_block *sb, bool kill_dirty) * LRU does not have strict ordering. Hence we don't want to reclaim inodes * with this flag set because they are the inodes that are out of order. 
*/ -static enum lru_status -inode_lru_isolate(struct list_head *item, spinlock_t *lru_lock, void *arg) +static enum lru_status inode_lru_isolate(struct list_head *item, + struct list_lru_one *lru, spinlock_t *lru_lock, void *arg) { struct list_head *freeable = arg; struct inode *inode = container_of(item, struct inode, i_lru); @@ -703,7 +703,7 @@ inode_lru_isolate(struct list_head *item, spinlock_t *lru_lock, void *arg) */ if (atomic_read(&inode->i_count) || (inode->i_state & ~I_REFERENCED)) { - list_del_init(&inode->i_lru); + list_lru_isolate(lru, &inode->i_lru); spin_unlock(&inode->i_lock); this_cpu_dec(nr_unused); return LRU_REMOVED; @@ -737,7 +737,7 @@ inode_lru_isolate(struct list_head *item, spinlock_t *lru_lock, void *arg) WARN_ON(inode->i_state & I_NEW); inode->i_state |= I_FREEING; - list_move(&inode->i_lru, freeable); + list_lru_isolate_move(lru, &inode->i_lru, freeable); spin_unlock(&inode->i_lock); this_cpu_dec(nr_unused); diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 15c9d224c721..1790b00bea7a 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1488,6 +1488,7 @@ xfs_buf_iomove( static enum lru_status xfs_buftarg_wait_rele( struct list_head *item, + struct list_lru_one *lru, spinlock_t *lru_lock, void *arg) @@ -1509,7 +1510,7 @@ xfs_buftarg_wait_rele( */ atomic_set(&bp->b_lru_ref, 0); bp->b_state |= XFS_BSTATE_DISPOSE; - list_move(item, dispose); + list_lru_isolate_move(lru, item, dispose); spin_unlock(&bp->b_lock); return LRU_REMOVED; } @@ -1546,6 +1547,7 @@ xfs_wait_buftarg( static enum lru_status xfs_buftarg_isolate( struct list_head *item, + struct list_lru_one *lru, spinlock_t *lru_lock, void *arg) { @@ -1569,7 +1571,7 @@ xfs_buftarg_isolate( } bp->b_state |= XFS_BSTATE_DISPOSE; - list_move(item, dispose); + list_lru_isolate_move(lru, item, dispose); spin_unlock(&bp->b_lock); return LRU_REMOVED; } diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index d77bf6d8312a..3bd04531a349 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -430,6 +430,7 @@ struct xfs_qm_isolate { static enum lru_status xfs_qm_dquot_isolate( struct list_head *item, + struct list_lru_one *lru, spinlock_t *lru_lock, void *arg) __releases(lru_lock) __acquires(lru_lock) @@ -450,7 +451,7 @@ xfs_qm_dquot_isolate( XFS_STATS_INC(xs_qm_dqwants); trace_xfs_dqreclaim_want(dqp); - list_del_init(&dqp->q_lru); + list_lru_isolate(lru, &dqp->q_lru); XFS_STATS_DEC(xs_qm_dquot_unused); return LRU_REMOVED; } @@ -494,7 +495,7 @@ xfs_qm_dquot_isolate( xfs_dqunlock(dqp); ASSERT(dqp->q_nrefs == 0); - list_move_tail(&dqp->q_lru, &isol->dispose); + list_lru_isolate_move(lru, &dqp->q_lru, &isol->dispose); XFS_STATS_DEC(xs_qm_dquot_unused); trace_xfs_dqreclaim_done(dqp); XFS_STATS_INC(xs_qm_dqreclaims); diff --git a/include/linux/list_lru.h b/include/linux/list_lru.h index 305b598abac2..7edf9c9ab9eb 100644 --- a/include/linux/list_lru.h +++ b/include/linux/list_lru.h @@ -125,8 +125,13 @@ static inline unsigned long list_lru_count(struct list_lru *lru) return count; } -typedef enum lru_status -(*list_lru_walk_cb)(struct list_head *item, spinlock_t *lock, void *cb_arg); +void list_lru_isolate(struct list_lru_one *list, struct list_head *item); +void list_lru_isolate_move(struct list_lru_one *list, struct list_head *item, + struct list_head *head); + +typedef enum lru_status (*list_lru_walk_cb)(struct list_head *item, + struct list_lru_one *list, spinlock_t *lock, void *cb_arg); + /** * list_lru_walk_one: walk a list_lru, isolating and disposing freeable items. * @lru: the lru pointer. 
diff --git a/mm/list_lru.c b/mm/list_lru.c index 79aee70c3b9d..8d9d168c6c38 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -132,6 +132,21 @@ bool list_lru_del(struct list_lru *lru, struct list_head *item) } EXPORT_SYMBOL_GPL(list_lru_del); +void list_lru_isolate(struct list_lru_one *list, struct list_head *item) +{ + list_del_init(item); + list->nr_items--; +} +EXPORT_SYMBOL_GPL(list_lru_isolate); + +void list_lru_isolate_move(struct list_lru_one *list, struct list_head *item, + struct list_head *head) +{ + list_move(item, head); + list->nr_items--; +} +EXPORT_SYMBOL_GPL(list_lru_isolate_move); + static unsigned long __list_lru_count_one(struct list_lru *lru, int nid, int memcg_idx) { @@ -194,13 +209,11 @@ restart: break; --*nr_to_walk; - ret = isolate(item, &nlru->lock, cb_arg); + ret = isolate(item, l, &nlru->lock, cb_arg); switch (ret) { case LRU_REMOVED_RETRY: assert_spin_locked(&nlru->lock); case LRU_REMOVED: - l->nr_items--; - WARN_ON_ONCE(l->nr_items < 0); isolated++; /* * If the lru lock has been dropped, our list diff --git a/mm/workingset.c b/mm/workingset.c index d4fa7fb10a52..aa017133744b 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -302,6 +302,7 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker, } static enum lru_status shadow_lru_isolate(struct list_head *item, + struct list_lru_one *lru, spinlock_t *lru_lock, void *arg) { @@ -332,7 +333,7 @@ static enum lru_status shadow_lru_isolate(struct list_head *item, goto out; } - list_del_init(item); + list_lru_isolate(lru, item); spin_unlock(lru_lock); /* -- cgit v1.2.3 From 38ad43532e115fba53443f91903608f5eae63339 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Fri, 30 Jan 2015 13:12:00 +1100 Subject: memcg: reparent list_lrus and free kmemcg_id on css offline Now, the only reason to keep kmemcg_id till css free is list_lru, which uses it to distribute elements between per-memcg lists. However, it can be easily sorted out - we only need to change kmemcg_id of an offline cgroup to its parent's id, making further list_lru_add()'s add elements to the parent's list, and then move all elements from the offline cgroup's list to the one of its parent. It will work, because a racing list_lru_del() does not need to know the list it is deleting the element from. It can decrement the wrong nr_items counter though, but the ongoing reparenting will fix it. After list_lru reparenting is done we are free to release kmemcg_id saving a valuable slot in a per-memcg array for new cgroups. 
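A rough userland sketch of the drain step described above (names are invented for illustration and locking is omitted; the kernel version splices doubly-linked per-node lists under an IRQ-safe lock):

	#include <stdio.h>

	struct item {
		struct item *next;
	};

	struct lru_one {
		struct item *head;
		long nr_items;	/* may go transiently negative, as the patch notes */
	};

	/* move every item from src to dst and transfer the count */
	static void drain(struct lru_one *src, struct lru_one *dst)
	{
		if (src->head) {
			struct item *tail = src->head;

			while (tail->next)	/* find src's last item ... */
				tail = tail->next;
			tail->next = dst->head;	/* ... and splice the chain onto dst */
			dst->head = src->head;
			src->head = NULL;
		}
		dst->nr_items += src->nr_items;
		src->nr_items = 0;
	}

	int main(void)
	{
		struct item a = { 0 }, b = { 0 };
		struct lru_one child = { &a, 2 }, parent = { NULL, 0 };

		a.next = &b;
		drain(&child, &parent);
		printf("parent now holds %ld item(s)\n", parent.nr_items);
		return 0;
	}

Transferring nr_items together with the chain is what repairs any counter skew caused by a concurrent delete that raced with the reparenting.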
Signed-off-by: Vladimir Davydov Cc: Johannes Weiner Cc: Michal Hocko Cc: Tejun Heo Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Dave Chinner Signed-off-by: Andrew Morton --- include/linux/list_lru.h | 3 ++- mm/list_lru.c | 46 +++++++++++++++++++++++++++++++++++++++++++--- mm/memcontrol.c | 39 ++++++++++++++++++++++++++++++++++----- 3 files changed, 79 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/list_lru.h b/include/linux/list_lru.h index 7edf9c9ab9eb..2a6b9947aaa3 100644 --- a/include/linux/list_lru.h +++ b/include/linux/list_lru.h @@ -26,7 +26,7 @@ enum lru_status { struct list_lru_one { struct list_head list; - /* kept as signed so we can catch imbalance bugs */ + /* may become negative during memcg reparenting */ long nr_items; }; @@ -62,6 +62,7 @@ int __list_lru_init(struct list_lru *lru, bool memcg_aware, #define list_lru_init_memcg(lru) __list_lru_init((lru), true, NULL) int memcg_update_all_list_lrus(int num_memcgs); +void memcg_drain_all_list_lrus(int src_idx, int dst_idx); /** * list_lru_add: add an element to the lru list's tail diff --git a/mm/list_lru.c b/mm/list_lru.c index 8d9d168c6c38..909eca2c820e 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -100,7 +100,6 @@ bool list_lru_add(struct list_lru *lru, struct list_head *item) spin_lock(&nlru->lock); l = list_lru_from_kmem(nlru, item); - WARN_ON_ONCE(l->nr_items < 0); if (list_empty(item)) { list_add_tail(item, &l->list); l->nr_items++; @@ -123,7 +122,6 @@ bool list_lru_del(struct list_lru *lru, struct list_head *item) if (!list_empty(item)) { list_del_init(item); l->nr_items--; - WARN_ON_ONCE(l->nr_items < 0); spin_unlock(&nlru->lock); return true; } @@ -156,7 +154,6 @@ static unsigned long __list_lru_count_one(struct list_lru *lru, spin_lock(&nlru->lock); l = list_lru_from_memcg_idx(nlru, memcg_idx); - WARN_ON_ONCE(l->nr_items < 0); count = l->nr_items; spin_unlock(&nlru->lock); @@ -458,6 +455,49 @@ fail: memcg_cancel_update_list_lru(lru, old_size, new_size); goto out; } + +static void memcg_drain_list_lru_node(struct list_lru_node *nlru, + int src_idx, int dst_idx) +{ + struct list_lru_one *src, *dst; + + /* + * Since list_lru_{add,del} may be called under an IRQ-safe lock, + * we have to use IRQ-safe primitives here to avoid deadlock. 
+ */ + spin_lock_irq(&nlru->lock); + + src = list_lru_from_memcg_idx(nlru, src_idx); + dst = list_lru_from_memcg_idx(nlru, dst_idx); + + list_splice_init(&src->list, &dst->list); + dst->nr_items += src->nr_items; + src->nr_items = 0; + + spin_unlock_irq(&nlru->lock); +} + +static void memcg_drain_list_lru(struct list_lru *lru, + int src_idx, int dst_idx) +{ + int i; + + if (!list_lru_memcg_aware(lru)) + return; + + for (i = 0; i < nr_node_ids; i++) + memcg_drain_list_lru_node(&lru->node[i], src_idx, dst_idx); +} + +void memcg_drain_all_list_lrus(int src_idx, int dst_idx) +{ + struct list_lru *lru; + + mutex_lock(&list_lrus_mutex); + list_for_each_entry(lru, &list_lrus, list) + memcg_drain_list_lru(lru, src_idx, dst_idx); + mutex_unlock(&list_lrus_mutex); +} #else static int memcg_init_list_lru(struct list_lru *lru, bool memcg_aware) { diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 7423fe6c3a66..3b2cc3a5413a 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -334,6 +334,7 @@ struct mem_cgroup { #if defined(CONFIG_MEMCG_KMEM) /* Index in the kmem_cache->memcg_params.memcg_caches array */ int kmemcg_id; + bool kmem_acct_activated; bool kmem_acct_active; #endif @@ -582,14 +583,10 @@ void memcg_put_cache_ids(void) struct static_key memcg_kmem_enabled_key; EXPORT_SYMBOL(memcg_kmem_enabled_key); -static void memcg_free_cache_id(int id); - static void disarm_kmem_keys(struct mem_cgroup *memcg) { - if (memcg->kmemcg_id >= 0) { + if (memcg->kmem_acct_activated) static_key_slow_dec(&memcg_kmem_enabled_key); - memcg_free_cache_id(memcg->kmemcg_id); - } /* * This check can't live in kmem destruction function, * since the charges will outlive the cgroup @@ -3322,6 +3319,7 @@ static int memcg_activate_kmem(struct mem_cgroup *memcg, int memcg_id; BUG_ON(memcg->kmemcg_id >= 0); + BUG_ON(memcg->kmem_acct_activated); BUG_ON(memcg->kmem_acct_active); /* @@ -3365,6 +3363,7 @@ static int memcg_activate_kmem(struct mem_cgroup *memcg, * patched. */ memcg->kmemcg_id = memcg_id; + memcg->kmem_acct_activated = true; memcg->kmem_acct_active = true; out: return err; @@ -4047,6 +4046,10 @@ static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) static void memcg_deactivate_kmem(struct mem_cgroup *memcg) { + struct cgroup_subsys_state *css; + struct mem_cgroup *parent, *child; + int kmemcg_id; + if (!memcg->kmem_acct_active) return; @@ -4059,6 +4062,32 @@ static void memcg_deactivate_kmem(struct mem_cgroup *memcg) memcg->kmem_acct_active = false; memcg_deactivate_kmem_caches(memcg); + + kmemcg_id = memcg->kmemcg_id; + BUG_ON(kmemcg_id < 0); + + parent = parent_mem_cgroup(memcg); + if (!parent) + parent = root_mem_cgroup; + + /* + * Change kmemcg_id of this cgroup and all its descendants to the + * parent's id, and then move all entries from this cgroup's list_lrus + * to ones of the parent. After we have finished, all list_lrus + * corresponding to this cgroup are guaranteed to remain empty. The + * ordering is imposed by list_lru_node->lock taken by + * memcg_drain_all_list_lrus(). 
+ */ + css_for_each_descendant_pre(css, &memcg->css) { + child = mem_cgroup_from_css(css); + BUG_ON(child->kmemcg_id != kmemcg_id); + child->kmemcg_id = parent->kmemcg_id; + if (!memcg->use_hierarchy) + break; + } + memcg_drain_all_list_lrus(kmemcg_id, parent->kmemcg_id); + + memcg_free_cache_id(kmemcg_id); } static void memcg_destroy_kmem(struct mem_cgroup *memcg) -- cgit v1.2.3 From d457b3c31b2a0af611db6845e3309786215a2909 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Fri, 30 Jan 2015 13:12:01 +1100 Subject: mm: when stealing freepages, also take pages created by splitting buddy page When studying page stealing, I noticed some weird looking decisions in try_to_steal_freepages(). The first I assume is a bug (Patch 1), the following two patches were driven by evaluation. Testing was done with stress-highalloc of mmtests, using the mm_page_alloc_extfrag tracepoint and postprocessing to get counts of how often page stealing occurs for individual migratetypes, and what migratetypes are used for fallbacks. Arguably, the worst case of page stealing is when UNMOVABLE allocation steals from MOVABLE pageblock. RECLAIMABLE allocation stealing from MOVABLE allocation is also not ideal, so the goal is to minimize these two cases. The evaluation of v2 wasn't always clear win and Joonsoo questioned the results. Here I used different baseline which includes RFC compaction improvements from [1]. I found that the compaction improvements reduce variability of stress-highalloc, so there's less noise in the data. First, let's look at stress-highalloc configured to do sync compaction, and how these patches reduce page stealing events during the test. First column is after fresh reboot, other two are reiterations of test without reboot. That was all accumulater over 5 re-iterations (so the benchmark was run 5x3 times with 5 fresh restarts). 
Baseline: 3.19-rc4 3.19-rc4 3.19-rc4 5-nothp-1 5-nothp-2 5-nothp-3 Page alloc extfrag event 10264225 8702233 10244125 Extfrag fragmenting 10263271 8701552 10243473 Extfrag fragmenting for unmovable 13595 17616 15960 Extfrag fragmenting unmovable placed with movable 7989 12193 8447 Extfrag fragmenting for reclaimable 658 1840 1817 Extfrag fragmenting reclaimable placed with movable 558 1677 1679 Extfrag fragmenting for movable 10249018 8682096 10225696 With Patch 1: 3.19-rc4 3.19-rc4 3.19-rc4 6-nothp-1 6-nothp-2 6-nothp-3 Page alloc extfrag event 11834954 9877523 9774860 Extfrag fragmenting 11833993 9876880 9774245 Extfrag fragmenting for unmovable 7342 16129 11712 Extfrag fragmenting unmovable placed with movable 4191 10547 6270 Extfrag fragmenting for reclaimable 373 1130 923 Extfrag fragmenting reclaimable placed with movable 302 906 738 Extfrag fragmenting for movable 11826278 9859621 9761610 With Patch 2: 3.19-rc4 3.19-rc4 3.19-rc4 7-nothp-1 7-nothp-2 7-nothp-3 Page alloc extfrag event 4725990 3668793 3807436 Extfrag fragmenting 4725104 3668252 3806898 Extfrag fragmenting for unmovable 6678 7974 7281 Extfrag fragmenting unmovable placed with movable 2051 3829 4017 Extfrag fragmenting for reclaimable 429 1208 1278 Extfrag fragmenting reclaimable placed with movable 369 976 1034 Extfrag fragmenting for movable 4717997 3659070 3798339 With Patch 3: 3.19-rc4 3.19-rc4 3.19-rc4 8-nothp-1 8-nothp-2 8-nothp-3 Page alloc extfrag event 5016183 4700142 3850633 Extfrag fragmenting 5015325 4699613 3850072 Extfrag fragmenting for unmovable 1312 3154 3088 Extfrag fragmenting unmovable placed with movable 1115 2777 2714 Extfrag fragmenting for reclaimable 437 1193 1097 Extfrag fragmenting reclaimable placed with movable 330 969 879 Extfrag fragmenting for movable 5013576 4695266 3845887 In v2 we've seen apparent regression with Patch 1 for unmovable events, this is now gone, suggesting it was indeed noise. Here, each patch improves the situation for unmovable events. Reclaimable is improved by patch 1 and then either the same modulo noise, or perhaps sligtly worse - a small price for unmovable improvements, IMHO. The number of movable allocations falling back to other migratetypes is most noisy, but it's reduced to half at Patch 2 nevertheless. These are least critical as compaction can move them around. If we look at success rates, the patches don't affect them, that didn't change. 
Baseline: 3.19-rc4 3.19-rc4 3.19-rc4 5-nothp-1 5-nothp-2 5-nothp-3 Success 1 Min 49.00 ( 0.00%) 42.00 ( 14.29%) 41.00 ( 16.33%) Success 1 Mean 51.00 ( 0.00%) 45.00 ( 11.76%) 42.60 ( 16.47%) Success 1 Max 55.00 ( 0.00%) 51.00 ( 7.27%) 46.00 ( 16.36%) Success 2 Min 53.00 ( 0.00%) 47.00 ( 11.32%) 44.00 ( 16.98%) Success 2 Mean 59.60 ( 0.00%) 50.80 ( 14.77%) 48.20 ( 19.13%) Success 2 Max 64.00 ( 0.00%) 56.00 ( 12.50%) 52.00 ( 18.75%) Success 3 Min 84.00 ( 0.00%) 82.00 ( 2.38%) 78.00 ( 7.14%) Success 3 Mean 85.60 ( 0.00%) 82.80 ( 3.27%) 79.40 ( 7.24%) Success 3 Max 86.00 ( 0.00%) 83.00 ( 3.49%) 80.00 ( 6.98%) Patch 1: 3.19-rc4 3.19-rc4 3.19-rc4 6-nothp-1 6-nothp-2 6-nothp-3 Success 1 Min 49.00 ( 0.00%) 44.00 ( 10.20%) 44.00 ( 10.20%) Success 1 Mean 51.80 ( 0.00%) 46.00 ( 11.20%) 45.80 ( 11.58%) Success 1 Max 54.00 ( 0.00%) 49.00 ( 9.26%) 49.00 ( 9.26%) Success 2 Min 58.00 ( 0.00%) 49.00 ( 15.52%) 48.00 ( 17.24%) Success 2 Mean 60.40 ( 0.00%) 51.80 ( 14.24%) 50.80 ( 15.89%) Success 2 Max 63.00 ( 0.00%) 54.00 ( 14.29%) 55.00 ( 12.70%) Success 3 Min 84.00 ( 0.00%) 81.00 ( 3.57%) 79.00 ( 5.95%) Success 3 Mean 85.00 ( 0.00%) 81.60 ( 4.00%) 79.80 ( 6.12%) Success 3 Max 86.00 ( 0.00%) 82.00 ( 4.65%) 82.00 ( 4.65%) Patch 2: 3.19-rc4 3.19-rc4 3.19-rc4 7-nothp-1 7-nothp-2 7-nothp-3 Success 1 Min 50.00 ( 0.00%) 44.00 ( 12.00%) 39.00 ( 22.00%) Success 1 Mean 52.80 ( 0.00%) 45.60 ( 13.64%) 42.40 ( 19.70%) Success 1 Max 55.00 ( 0.00%) 46.00 ( 16.36%) 47.00 ( 14.55%) Success 2 Min 52.00 ( 0.00%) 48.00 ( 7.69%) 45.00 ( 13.46%) Success 2 Mean 53.40 ( 0.00%) 49.80 ( 6.74%) 48.80 ( 8.61%) Success 2 Max 57.00 ( 0.00%) 52.00 ( 8.77%) 52.00 ( 8.77%) Success 3 Min 84.00 ( 0.00%) 81.00 ( 3.57%) 79.00 ( 5.95%) Success 3 Mean 85.00 ( 0.00%) 82.40 ( 3.06%) 79.60 ( 6.35%) Success 3 Max 86.00 ( 0.00%) 83.00 ( 3.49%) 80.00 ( 6.98%) Patch 3: 3.19-rc4 3.19-rc4 3.19-rc4 8-nothp-1 8-nothp-2 8-nothp-3 Success 1 Min 46.00 ( 0.00%) 44.00 ( 4.35%) 42.00 ( 8.70%) Success 1 Mean 50.20 ( 0.00%) 45.60 ( 9.16%) 44.00 ( 12.35%) Success 1 Max 52.00 ( 0.00%) 47.00 ( 9.62%) 47.00 ( 9.62%) Success 2 Min 53.00 ( 0.00%) 49.00 ( 7.55%) 48.00 ( 9.43%) Success 2 Mean 55.80 ( 0.00%) 50.60 ( 9.32%) 49.00 ( 12.19%) Success 2 Max 59.00 ( 0.00%) 52.00 ( 11.86%) 51.00 ( 13.56%) Success 3 Min 84.00 ( 0.00%) 80.00 ( 4.76%) 79.00 ( 5.95%) Success 3 Mean 85.40 ( 0.00%) 81.60 ( 4.45%) 80.40 ( 5.85%) Success 3 Max 87.00 ( 0.00%) 83.00 ( 4.60%) 82.00 ( 5.75%) While there's no improvement here, I consider reduced fragmentation events to be worth on its own. Patch 2 also seems to reduce scanning for free pages, and migrations in compaction, suggesting it has somewhat less work to do: Patch 1: Compaction stalls 4153 3959 3978 Compaction success 1523 1441 1446 Compaction failures 2630 2517 2531 Page migrate success 4600827 4943120 5104348 Page migrate failure 19763 16656 17806 Compaction pages isolated 9597640 10305617 10653541 Compaction migrate scanned 77828948 86533283 87137064 Compaction free scanned 517758295 521312840 521462251 Compaction cost 5503 5932 6110 Patch 2: Compaction stalls 3800 3450 3518 Compaction success 1421 1316 1317 Compaction failures 2379 2134 2201 Page migrate success 4160421 4502708 4752148 Page migrate failure 19705 14340 14911 Compaction pages isolated 8731983 9382374 9910043 Compaction migrate scanned 98362797 96349194 98609686 Compaction free scanned 496512560 469502017 480442545 Compaction cost 5173 5526 5811 As with v2, /proc/pagetypeinfo appears unaffected with respect to numbers of unmovable and reclaimable pageblocks. 
Configuring the benchmark to allocate like THP page fault (i.e. no sync compaction) gives much noisier results for iterations 2 and 3 after reboot. This is not so surprising given how [1] offers lower improvements in this scenario due to less restarts after deferred compaction which would change compaction pivot. Baseline: 3.19-rc4 3.19-rc4 3.19-rc4 5-thp-1 5-thp-2 5-thp-3 Page alloc extfrag event 8148965 6227815 6646741 Extfrag fragmenting 8147872 6227130 6646117 Extfrag fragmenting for unmovable 10324 12942 15975 Extfrag fragmenting unmovable placed with movable 5972 8495 10907 Extfrag fragmenting for reclaimable 601 1707 2210 Extfrag fragmenting reclaimable placed with movable 520 1570 2000 Extfrag fragmenting for movable 8136947 6212481 6627932 Patch 1: 3.19-rc4 3.19-rc4 3.19-rc4 6-thp-1 6-thp-2 6-thp-3 Page alloc extfrag event 8345457 7574471 7020419 Extfrag fragmenting 8343546 7573777 7019718 Extfrag fragmenting for unmovable 10256 18535 30716 Extfrag fragmenting unmovable placed with movable 6893 11726 22181 Extfrag fragmenting for reclaimable 465 1208 1023 Extfrag fragmenting reclaimable placed with movable 353 996 843 Extfrag fragmenting for movable 8332825 7554034 6987979 Patch 2: 3.19-rc4 3.19-rc4 3.19-rc4 7-thp-1 7-thp-2 7-thp-3 Page alloc extfrag event 3512847 3020756 2891625 Extfrag fragmenting 3511940 3020185 2891059 Extfrag fragmenting for unmovable 9017 6892 6191 Extfrag fragmenting unmovable placed with movable 1524 3053 2435 Extfrag fragmenting for reclaimable 445 1081 1160 Extfrag fragmenting reclaimable placed with movable 375 918 986 Extfrag fragmenting for movable 3502478 3012212 2883708 Patch 3: 3.19-rc4 3.19-rc4 3.19-rc4 8-thp-1 8-thp-2 8-thp-3 Page alloc extfrag event 3181699 3082881 2674164 Extfrag fragmenting 3180812 3082303 2673611 Extfrag fragmenting for unmovable 1201 4031 4040 Extfrag fragmenting unmovable placed with movable 974 3611 3645 Extfrag fragmenting for reclaimable 478 1165 1294 Extfrag fragmenting reclaimable placed with movable 387 985 1030 Extfrag fragmenting for movable 3179133 3077107 2668277 The improvements for first iteration are clear, the rest is much noisier and can appear like regression for Patch 1. Anyway, patch 2 rectifies it. Allocation success rates are again unaffected so there's no point in making this e-mail any longer. [1] http://marc.info/?l=linux-mm&m=142166196321125&w=2 This patch (of 3): When __rmqueue_fallback() is called to allocate a page of order X, it will find a page of order Y >= X of a fallback migratetype, which is different from the desired migratetype. With the help of try_to_steal_freepages(), it may change the migratetype (to the desired one) also of: 1) all currently free pages in the pageblock containing the fallback page 2) the fallback pageblock itself 3) buddy pages created by splitting the fallback page (when Y > X) These decisions take the order Y into account, as well as the desired migratetype, with the goal of preventing multiple fallback allocations that could e.g. distribute UNMOVABLE allocations among multiple pageblocks. Originally, decision for 1) has implied the decision for 3). Commit 47118af076f6 ("mm: mmzone: MIGRATE_CMA migration type added") changed that (probably unintentionally) so that the buddy pages in case 3) are always changed to the desired migratetype, except for CMA pageblocks. Commit fef903efcf0c ("mm/page_allo.c: restructure free-page stealing code and fix a bug") did some refactoring and added a comment that the case of 3) is intended. 
Commit 0cbef29a7821 ("mm: __rmqueue_fallback() should respect pageblock type") removed the comment and tried to restore the original behavior where 1) implies 3), but due to the previous refactoring, the result is instead that only 2) implies 3) - and the conditions for 2) are less frequently met than conditions for 1). This may increase fragmentation in situations where the code decides to steal all free pages from the pageblock (case 1)), but then gives back the buddy pages produced by splitting. This patch restores the original intended logic where 1) implies 3). During testing with stress-highalloc from mmtests, this has shown to decrease the number of events where UNMOVABLE and RECLAIMABLE allocations steal from MOVABLE pageblocks, which can lead to permanent fragmentation. In some cases it has increased the number of events when MOVABLE allocations steal from UNMOVABLE or RECLAIMABLE pageblocks, but these are fixable by sync compaction and thus less harmful. Note that evaluation has shown that the behavior introduced by 47118af076f6 for buddy pages in case 3) is actually even better than the original logic, so the following patch will introduce it properly once again. For stable backports of this patch it makes thus sense to only fix versions containing 0cbef29a7821. [iamjoonsoo.kim@lge.com: tracepoint fix] Signed-off-by: Vlastimil Babka Acked-by: Mel Gorman Cc: Zhang Yanfei Acked-by: Minchan Kim Cc: David Rientjes Cc: Rik van Riel Cc: "Aneesh Kumar K.V" Cc: "Kirill A. Shutemov" Cc: Johannes Weiner Cc: Joonsoo Kim Cc: Michal Hocko Cc: KOSAKI Motohiro Cc: [3.13+ containing 0cbef29a7821] Signed-off-by: Andrew Morton --- include/trace/events/kmem.h | 7 ++++--- mm/page_alloc.c | 12 +++++------- 2 files changed, 9 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/trace/events/kmem.h b/include/trace/events/kmem.h index aece1346ceb7..4ad10baecd4d 100644 --- a/include/trace/events/kmem.h +++ b/include/trace/events/kmem.h @@ -268,11 +268,11 @@ TRACE_EVENT(mm_page_alloc_extfrag, TP_PROTO(struct page *page, int alloc_order, int fallback_order, - int alloc_migratetype, int fallback_migratetype, int new_migratetype), + int alloc_migratetype, int fallback_migratetype), TP_ARGS(page, alloc_order, fallback_order, - alloc_migratetype, fallback_migratetype, new_migratetype), + alloc_migratetype, fallback_migratetype), TP_STRUCT__entry( __field( struct page *, page ) @@ -289,7 +289,8 @@ TRACE_EVENT(mm_page_alloc_extfrag, __entry->fallback_order = fallback_order; __entry->alloc_migratetype = alloc_migratetype; __entry->fallback_migratetype = fallback_migratetype; - __entry->change_ownership = (new_migratetype == alloc_migratetype); + __entry->change_ownership = (alloc_migratetype == + get_pageblock_migratetype(page)); ), TP_printk("page=%p pfn=%lu alloc_order=%d fallback_order=%d pageblock_order=%d alloc_migratetype=%d fallback_migratetype=%d fragmenting=%d change_ownership=%d", diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 134e25525044..b7a881019352 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1131,8 +1131,8 @@ static void change_pageblock_range(struct page *pageblock_page, * nor move CMA pages to different free lists. We don't want unmovable pages * to be allocated from MIGRATE_CMA areas. * - * Returns the new migratetype of the pageblock (or the same old migratetype - * if it was unchanged). + * Returns the allocation migratetype if free pages were stolen, or the + * fallback migratetype if it was decided not to steal. 
*/ static int try_to_steal_freepages(struct zone *zone, struct page *page, int start_type, int fallback_type) @@ -1163,12 +1163,10 @@ static int try_to_steal_freepages(struct zone *zone, struct page *page, /* Claim the whole block if over half of it is free */ if (pages >= (1 << (pageblock_order-1)) || - page_group_by_mobility_disabled) { - + page_group_by_mobility_disabled) set_pageblock_migratetype(page, start_type); - return start_type; - } + return start_type; } return fallback_type; @@ -1220,7 +1218,7 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype) set_freepage_migratetype(page, new_type); trace_mm_page_alloc_extfrag(page, order, current_order, - start_migratetype, migratetype, new_type); + start_migratetype, migratetype); return page; } -- cgit v1.2.3 From 5010f2b28f8f057de4a81cdd31f0bade3836a58d Mon Sep 17 00:00:00 2001 From: Sergei Rogachev Date: Fri, 30 Jan 2015 13:12:02 +1100 Subject: mm/page_owner.c: remove unnecessary stack_trace field Page owner uses the page_ext structure to keep meta-information for every page in the system. The structure also contains a field of type 'struct stack_trace', page owner uses this field during invocation of the function save_stack_trace. It is easy to notice that keeping a copy of this structure for every page in the system is very inefficiently in terms of memory. The patch removes this unnecessary field of page_ext and forces page owner to use a stack_trace structure allocated on the stack. Signed-off-by: Sergei Rogachev Acked-by: Joonsoo Kim Signed-off-by: Andrew Morton --- include/linux/page_ext.h | 2 +- mm/page_owner.c | 21 ++++++++++++--------- 2 files changed, 13 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/linux/page_ext.h b/include/linux/page_ext.h index d2a2c84c72d0..c42981cd99aa 100644 --- a/include/linux/page_ext.h +++ b/include/linux/page_ext.h @@ -40,7 +40,7 @@ struct page_ext { #ifdef CONFIG_PAGE_OWNER unsigned int order; gfp_t gfp_mask; - struct stack_trace trace; + unsigned int nr_entries; unsigned long trace_entries[8]; #endif }; diff --git a/mm/page_owner.c b/mm/page_owner.c index 9ab4a9b5bc09..f7f33184aa24 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -60,19 +60,19 @@ void __reset_page_owner(struct page *page, unsigned int order) void __set_page_owner(struct page *page, unsigned int order, gfp_t gfp_mask) { struct page_ext *page_ext; - struct stack_trace *trace; + struct stack_trace trace; page_ext = lookup_page_ext(page); - trace = &page_ext->trace; - trace->nr_entries = 0; - trace->max_entries = ARRAY_SIZE(page_ext->trace_entries); - trace->entries = &page_ext->trace_entries[0]; - trace->skip = 3; - save_stack_trace(&page_ext->trace); + trace.nr_entries = 0; + trace.max_entries = ARRAY_SIZE(page_ext->trace_entries); + trace.entries = &page_ext->trace_entries[0]; + trace.skip = 3; + save_stack_trace(&trace); page_ext->order = order; page_ext->gfp_mask = gfp_mask; + page_ext->nr_entries = trace.nr_entries; __set_bit(PAGE_EXT_OWNER, &page_ext->flags); } @@ -83,6 +83,7 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn, { int ret; int pageblock_mt, page_mt; + struct stack_trace trace; char *kbuf; kbuf = kmalloc(count, GFP_KERNEL); @@ -121,8 +122,10 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn, if (ret >= count) goto err; - ret += snprint_stack_trace(kbuf + ret, count - ret, - &page_ext->trace, 0); + trace.nr_entries = page_ext->nr_entries; + trace.entries = &page_ext->trace_entries[0]; + + ret += 
snprint_stack_trace(kbuf + ret, count - ret, &trace, 0); if (ret >= count) goto err; -- cgit v1.2.3 From 34af7be9b6759ef0eccf9e5a976b4e1eb5349d0f Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Fri, 30 Jan 2015 13:12:04 +1100 Subject: mm: support madvise(MADV_FREE) Linux doesn't have an ability to free pages lazy while other OS already have been supported that named by madvise(MADV_FREE). The gain is clear that kernel can discard freed pages rather than swapping out or OOM if memory pressure happens. Without memory pressure, freed pages would be reused by userspace without another additional overhead(ex, page fault + allocation + zeroing). How to work is following as. When madvise syscall is called, VM clears dirty bit of ptes of the range. If memory pressure happens, VM checks dirty bit of page table and if it found still "clean", it means it's a "lazyfree pages" so VM could discard the page instead of swapping out. Once there was store operation for the page before VM peek a page to reclaim, dirty bit is set so VM can swap out the page instead of discarding. Firstly, heavy users would be general allocators(ex, jemalloc, tcmalloc and hope glibc supports it) and jemalloc/tcmalloc already have supported the feature for other OS(ex, FreeBSD) barrios@blaptop:~/benchmark/ebizzy$ lscpu Architecture: x86_64 CPU op-mode(s): 32-bit, 64-bit Byte Order: Little Endian CPU(s): 12 On-line CPU(s) list: 0-11 Thread(s) per core: 1 Core(s) per socket: 1 Socket(s): 12 NUMA node(s): 1 Vendor ID: GenuineIntel CPU family: 6 Model: 2 Stepping: 3 CPU MHz: 3200.185 BogoMIPS: 6400.53 Virtualization: VT-x Hypervisor vendor: KVM Virtualization type: full L1d cache: 32K L1i cache: 32K L2 cache: 4096K NUMA node0 CPU(s): 0-11 ebizzy benchmark(./ebizzy -S 10 -n 512) Higher avg is better. vanilla-jemalloc MADV_free-jemalloc 1 thread records: 10 records: 10 avg: 2961.90 avg: 12069.70 std: 71.96(2.43%) std: 186.68(1.55%) max: 3070.00 max: 12385.00 min: 2796.00 min: 11746.00 2 thread records: 10 records: 10 avg: 5020.00 avg: 17827.00 std: 264.87(5.28%) std: 358.52(2.01%) max: 5244.00 max: 18760.00 min: 4251.00 min: 17382.00 4 thread records: 10 records: 10 avg: 8988.80 avg: 27930.80 std: 1175.33(13.08%) std: 3317.33(11.88%) max: 9508.00 max: 30879.00 min: 5477.00 min: 21024.00 8 thread records: 10 records: 10 avg: 13036.50 avg: 33739.40 std: 170.67(1.31%) std: 5146.22(15.25%) max: 13371.00 max: 40572.00 min: 12785.00 min: 24088.00 16 thread records: 10 records: 10 avg: 11092.40 avg: 31424.20 std: 710.60(6.41%) std: 3763.89(11.98%) max: 12446.00 max: 36635.00 min: 9949.00 min: 25669.00 32 thread records: 10 records: 10 avg: 11067.00 avg: 34495.80 std: 971.06(8.77%) std: 2721.36(7.89%) max: 12010.00 max: 38598.00 min: 9002.00 min: 30636.00 In summary, MADV_FREE is about much faster than MADV_DONTNEED. 
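A minimal userland sketch of how a caller might use the new advice; MADV_FREE is defined locally in case the installed headers predate this patch (5 is the value the patch adds to the asm-generic uapi header), and error handling is trimmed:

	#include <stdio.h>
	#include <string.h>
	#include <sys/mman.h>

	#ifndef MADV_FREE
	#define MADV_FREE 5	/* value from this patch's uapi header */
	#endif

	int main(void)
	{
		size_t len = 1 << 20;
		char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
			       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
		if (p == MAP_FAILED)
			return 1;

		memset(p, 0xaa, len);	/* pages are now dirty anonymous memory */

		/*
		 * Tell the kernel the contents are disposable.  Under memory
		 * pressure the clean pages can simply be discarded instead of
		 * swapped out; without pressure a later reuse avoids the
		 * fault + allocation + zeroing cost MADV_DONTNEED would incur.
		 */
		if (madvise(p, len, MADV_FREE) != 0)
			perror("madvise(MADV_FREE)");

		p[0] = 1;	/* a store makes the page dirty (live) again */

		munmap(p, len);
		return 0;
	}

On a kernel without this patch the call simply fails with EINVAL and the program keeps running, which is how an allocator would typically probe for support before preferring MADV_FREE over MADV_DONTNEED.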
Signed-off-by: Minchan Kim Reviewed-by: Michal Hocko Signed-off-by: Andrew Morton --- include/linux/rmap.h | 9 ++- include/linux/vm_event_item.h | 1 + include/uapi/asm-generic/mman-common.h | 1 + mm/madvise.c | 140 +++++++++++++++++++++++++++++++++ mm/rmap.c | 42 +++++++++- mm/vmscan.c | 40 ++++++++-- mm/vmstat.c | 1 + 7 files changed, 222 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index b38f559130d5..dbcd5ec3f291 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -85,6 +85,7 @@ enum ttu_flags { TTU_UNMAP = 1, /* unmap mode */ TTU_MIGRATION = 2, /* migration mode */ TTU_MUNLOCK = 4, /* munlock mode */ + TTU_FREE = 8, /* free mode */ TTU_IGNORE_MLOCK = (1 << 8), /* ignore mlock */ TTU_IGNORE_ACCESS = (1 << 9), /* don't age */ @@ -191,7 +192,8 @@ static inline void page_dup_rmap(struct page *page) * Called from mm/vmscan.c to handle paging out */ int page_referenced(struct page *, int is_locked, - struct mem_cgroup *memcg, unsigned long *vm_flags); + struct mem_cgroup *memcg, unsigned long *vm_flags, + int *is_pte_dirty); #define TTU_ACTION(x) ((x) & TTU_ACTION_MASK) @@ -268,9 +270,12 @@ int rmap_walk(struct page *page, struct rmap_walk_control *rwc); static inline int page_referenced(struct page *page, int is_locked, struct mem_cgroup *memcg, - unsigned long *vm_flags) + unsigned long *vm_flags, + int *is_pte_dirty) { *vm_flags = 0; + if (is_pte_dirty) + *is_pte_dirty = 0; return 0; } diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index 9246d32dc973..2b1cef88b827 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -25,6 +25,7 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, FOR_ALL_ZONES(PGALLOC), PGFREE, PGACTIVATE, PGDEACTIVATE, PGFAULT, PGMAJFAULT, + PGLAZYFREED, FOR_ALL_ZONES(PGREFILL), FOR_ALL_ZONES(PGSTEAL_KSWAPD), FOR_ALL_ZONES(PGSTEAL_DIRECT), diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h index ddc3b36f1046..7a94102b7a02 100644 --- a/include/uapi/asm-generic/mman-common.h +++ b/include/uapi/asm-generic/mman-common.h @@ -34,6 +34,7 @@ #define MADV_SEQUENTIAL 2 /* expect sequential page references */ #define MADV_WILLNEED 3 /* will need these pages */ #define MADV_DONTNEED 4 /* don't need these pages */ +#define MADV_FREE 5 /* free pages only if memory pressure */ /* common parameters: try to keep these consistent across architectures */ #define MADV_REMOVE 9 /* remove these pages & resources */ diff --git a/mm/madvise.c b/mm/madvise.c index d79fb5e8f80a..0f9e518ad8a5 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -19,6 +19,14 @@ #include #include #include +#include + +#include + +struct madvise_free_private { + struct vm_area_struct *vma; + struct mmu_gather *tlb; +}; /* * Any behaviour which results in changes to the vma->vm_flags needs to @@ -31,6 +39,7 @@ static int madvise_need_mmap_write(int behavior) case MADV_REMOVE: case MADV_WILLNEED: case MADV_DONTNEED: + case MADV_FREE: return 0; default: /* be safe, default to 1. 
list exceptions explicitly */ @@ -251,6 +260,128 @@ static long madvise_willneed(struct vm_area_struct *vma, return 0; } +static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, + unsigned long end, struct mm_walk *walk) + +{ + struct madvise_free_private *fp = walk->private; + struct mmu_gather *tlb = fp->tlb; + struct mm_struct *mm = tlb->mm; + struct vm_area_struct *vma = fp->vma; + spinlock_t *ptl; + pte_t *pte, ptent; + struct page *page; + + split_huge_page_pmd(vma, addr, pmd); + if (pmd_trans_unstable(pmd)) + return 0; + + pte = pte_offset_map_lock(mm, pmd, addr, &ptl); + arch_enter_lazy_mmu_mode(); + for (; addr != end; pte++, addr += PAGE_SIZE) { + ptent = *pte; + + if (!pte_present(ptent)) + continue; + + page = vm_normal_page(vma, addr, ptent); + if (!page) + continue; + + if (PageSwapCache(page)) { + if (!trylock_page(page)) + continue; + + if (!try_to_free_swap(page)) { + unlock_page(page); + continue; + } + + ClearPageDirty(page); + unlock_page(page); + } + + /* + * Some of architecture(ex, PPC) don't update TLB + * with set_pte_at and tlb_remove_tlb_entry so for + * the portability, remap the pte with old|clean + * after pte clearing. + */ + ptent = ptep_get_and_clear_full(mm, addr, pte, + tlb->fullmm); + ptent = pte_mkold(ptent); + ptent = pte_mkclean(ptent); + set_pte_at(mm, addr, pte, ptent); + tlb_remove_tlb_entry(tlb, pte, addr); + } + arch_leave_lazy_mmu_mode(); + pte_unmap_unlock(pte - 1, ptl); + cond_resched(); + return 0; +} + +static void madvise_free_page_range(struct mmu_gather *tlb, + struct vm_area_struct *vma, + unsigned long addr, unsigned long end) +{ + struct madvise_free_private fp = { + .vma = vma, + .tlb = tlb, + }; + + struct mm_walk free_walk = { + .pmd_entry = madvise_free_pte_range, + .mm = vma->vm_mm, + .private = &fp, + }; + + BUG_ON(addr >= end); + tlb_start_vma(tlb, vma); + walk_page_range(addr, end, &free_walk); + tlb_end_vma(tlb, vma); +} + +static int madvise_free_single_vma(struct vm_area_struct *vma, + unsigned long start_addr, unsigned long end_addr) +{ + unsigned long start, end; + struct mm_struct *mm = vma->vm_mm; + struct mmu_gather tlb; + + if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP)) + return -EINVAL; + + /* MADV_FREE works for only anon vma at the moment */ + if (vma->vm_file) + return -EINVAL; + + start = max(vma->vm_start, start_addr); + if (start >= vma->vm_end) + return -EINVAL; + end = min(vma->vm_end, end_addr); + if (end <= vma->vm_start) + return -EINVAL; + + lru_add_drain(); + tlb_gather_mmu(&tlb, mm, start, end); + update_hiwater_rss(mm); + + mmu_notifier_invalidate_range_start(mm, start, end); + madvise_free_page_range(&tlb, vma, start, end); + mmu_notifier_invalidate_range_end(mm, start, end); + tlb_finish_mmu(&tlb, start, end); + + return 0; +} + +static long madvise_free(struct vm_area_struct *vma, + struct vm_area_struct **prev, + unsigned long start, unsigned long end) +{ + *prev = vma; + return madvise_free_single_vma(vma, start, end); +} + /* * Application no longer needs these pages. If the pages are dirty, * it's OK to just throw them away. The app will be more careful about @@ -374,6 +505,14 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev, return madvise_remove(vma, prev, start, end); case MADV_WILLNEED: return madvise_willneed(vma, prev, start, end); + case MADV_FREE: + /* + * XXX: In this implementation, MADV_FREE works like + * MADV_DONTNEED on swapless system or full swap. 
+ */ + if (get_nr_swap_pages() > 0) + return madvise_free(vma, prev, start, end); + /* passthrough */ case MADV_DONTNEED: return madvise_dontneed(vma, prev, start, end); default: @@ -393,6 +532,7 @@ madvise_behavior_valid(int behavior) case MADV_REMOVE: case MADV_WILLNEED: case MADV_DONTNEED: + case MADV_FREE: #ifdef CONFIG_KSM case MADV_MERGEABLE: case MADV_UNMERGEABLE: diff --git a/mm/rmap.c b/mm/rmap.c index 5e3e09081164..47facbc6f99e 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -705,6 +705,7 @@ int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma) } struct page_referenced_arg { + int dirtied; int mapcount; int referenced; unsigned long vm_flags; @@ -719,6 +720,7 @@ static int page_referenced_one(struct page *page, struct vm_area_struct *vma, struct mm_struct *mm = vma->vm_mm; spinlock_t *ptl; int referenced = 0; + int dirty = 0; struct page_referenced_arg *pra = arg; if (unlikely(PageTransHuge(page))) { @@ -742,6 +744,11 @@ static int page_referenced_one(struct page *page, struct vm_area_struct *vma, /* go ahead even if the pmd is pmd_trans_splitting() */ if (pmdp_clear_flush_young_notify(vma, address, pmd)) referenced++; + + /* + * In this implmentation, MADV_FREE doesn't support THP free + */ + dirty++; spin_unlock(ptl); } else { pte_t *pte; @@ -771,6 +778,10 @@ static int page_referenced_one(struct page *page, struct vm_area_struct *vma, if (likely(!(vma->vm_flags & VM_SEQ_READ))) referenced++; } + + if (pte_dirty(*pte)) + dirty++; + pte_unmap_unlock(pte, ptl); } @@ -779,6 +790,9 @@ static int page_referenced_one(struct page *page, struct vm_area_struct *vma, pra->vm_flags |= vma->vm_flags; } + if (dirty) + pra->dirtied++; + pra->mapcount--; if (!pra->mapcount) return SWAP_SUCCESS; /* To break the loop */ @@ -803,6 +817,7 @@ static bool invalid_page_referenced_vma(struct vm_area_struct *vma, void *arg) * @is_locked: caller holds lock on the page * @memcg: target memory cgroup * @vm_flags: collect encountered vma->vm_flags who actually referenced the page + * @is_pte_dirty: ptes which have marked dirty bit - used for lazyfree page * * Quick test_and_clear_referenced for all mappings to a page, * returns the number of ptes which referenced the page. @@ -810,7 +825,8 @@ static bool invalid_page_referenced_vma(struct vm_area_struct *vma, void *arg) int page_referenced(struct page *page, int is_locked, struct mem_cgroup *memcg, - unsigned long *vm_flags) + unsigned long *vm_flags, + int *is_pte_dirty) { int ret; int we_locked = 0; @@ -825,6 +841,9 @@ int page_referenced(struct page *page, }; *vm_flags = 0; + if (is_pte_dirty) + *is_pte_dirty = 0; + if (!page_mapped(page)) return 0; @@ -852,6 +871,9 @@ int page_referenced(struct page *page, if (we_locked) unlock_page(page); + if (is_pte_dirty) + *is_pte_dirty = pra.dirtied; + return pra.referenced; } @@ -1180,6 +1202,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, spinlock_t *ptl; int ret = SWAP_AGAIN; enum ttu_flags flags = (enum ttu_flags)arg; + int dirty = 0; pte = page_check_address(page, mm, address, &ptl, 0); if (!pte) @@ -1209,7 +1232,8 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, pteval = ptep_clear_flush(vma, address, pte); /* Move the dirty bit to the physical page now the pte is gone. 
*/ - if (pte_dirty(pteval)) + dirty = pte_dirty(pteval); + if (dirty) set_page_dirty(page); /* Update high watermark before we lower rss */ @@ -1238,6 +1262,19 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, swp_entry_t entry = { .val = page_private(page) }; pte_t swp_pte; + if (flags & TTU_FREE) { + VM_BUG_ON_PAGE(PageSwapCache(page), page); + if (!dirty && !PageDirty(page)) { + /* It's a freeable page by MADV_FREE */ + dec_mm_counter(mm, MM_ANONPAGES); + goto discard; + } else { + set_pte_at(mm, address, pte, pteval); + ret = SWAP_FAIL; + goto out_unmap; + } + } + if (PageSwapCache(page)) { /* * Store the swap location in the pte. @@ -1278,6 +1315,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, } else dec_mm_counter(mm, MM_FILEPAGES); +discard: page_remove_rmap(page); page_cache_release(page); diff --git a/mm/vmscan.c b/mm/vmscan.c index 175a145bd73a..0c89c6ca7f76 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -754,13 +754,17 @@ enum page_references { }; static enum page_references page_check_references(struct page *page, - struct scan_control *sc) + struct scan_control *sc, + bool *freeable) { int referenced_ptes, referenced_page; unsigned long vm_flags; + int pte_dirty; + + VM_BUG_ON_PAGE(!PageLocked(page), page); referenced_ptes = page_referenced(page, 1, sc->target_mem_cgroup, - &vm_flags); + &vm_flags, &pte_dirty); referenced_page = TestClearPageReferenced(page); /* @@ -801,6 +805,10 @@ static enum page_references page_check_references(struct page *page, return PAGEREF_KEEP; } + if (PageAnon(page) && !pte_dirty && !PageSwapCache(page) && + !PageDirty(page)) + *freeable = true; + /* Reclaim if clean, defer dirty pages to writeback */ if (referenced_page && !PageSwapBacked(page)) return PAGEREF_RECLAIM_CLEAN; @@ -869,6 +877,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, int may_enter_fs; enum page_references references = PAGEREF_RECLAIM_CLEAN; bool dirty, writeback; + bool freeable = false; cond_resched(); @@ -992,7 +1001,8 @@ static unsigned long shrink_page_list(struct list_head *page_list, } if (!force_reclaim) - references = page_check_references(page, sc); + references = page_check_references(page, sc, + &freeable); switch (references) { case PAGEREF_ACTIVATE: @@ -1008,7 +1018,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, * Anonymous process memory has backing store? * Try to allocate it some swap space here. */ - if (PageAnon(page) && !PageSwapCache(page)) { + if (PageAnon(page) && !PageSwapCache(page) && !freeable) { if (!(sc->gfp_mask & __GFP_IO)) goto keep_locked; if (!add_to_swap(page, page_list)) @@ -1023,8 +1033,9 @@ static unsigned long shrink_page_list(struct list_head *page_list, * The page is mapped into the page tables of one or more * processes. Try to unmap it here. */ - if (page_mapped(page) && mapping) { - switch (try_to_unmap(page, ttu_flags)) { + if (page_mapped(page) && (mapping || freeable)) { + switch (try_to_unmap(page, + freeable ? TTU_FREE : ttu_flags)) { case SWAP_FAIL: goto activate_locked; case SWAP_AGAIN: @@ -1032,7 +1043,20 @@ static unsigned long shrink_page_list(struct list_head *page_list, case SWAP_MLOCK: goto cull_mlocked; case SWAP_SUCCESS: - ; /* try to free the page below */ + /* try to free the page below */ + if (!freeable) + break; + /* + * Freeable anon page doesn't have mapping + * due to skipping of swapcache so we free + * page in here rather than __remove_mapping. 
+ */ + VM_BUG_ON_PAGE(PageSwapCache(page), page); + if (!page_freeze_refs(page, 1)) + goto keep_locked; + __clear_page_locked(page); + count_vm_event(PGLAZYFREED); + goto free_it; } } @@ -1789,7 +1813,7 @@ static void shrink_active_list(unsigned long nr_to_scan, } if (page_referenced(page, 0, sc->target_mem_cgroup, - &vm_flags)) { + &vm_flags, NULL)) { nr_rotated += hpage_nr_pages(page); /* * Identify referenced, file-backed active pages and diff --git a/mm/vmstat.c b/mm/vmstat.c index 4f5cd974e11a..1fd0886a389f 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -759,6 +759,7 @@ const char * const vmstat_text[] = { "pgfault", "pgmajfault", + "pglazyfreed", TEXTS_FOR_ZONES("pgrefill") TEXTS_FOR_ZONES("pgsteal_kswapd") -- cgit v1.2.3 From 91e0ddadffd88c9a9f8ff3443fbb5f86bc2813f7 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Fri, 30 Jan 2015 13:12:06 +1100 Subject: mm: don't split THP page when syscall is called We don't need to split THP page when MADV_FREE syscall is called. It could be done when VM decide really frees it so we could avoid unnecessary THP split. Signed-off-by: Minchan Kim Reviewed-by: Michal Hocko Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 4 ++++ mm/huge_memory.c | 35 +++++++++++++++++++++++++++++++++++ mm/madvise.c | 21 ++++++++++++++++++++- mm/rmap.c | 8 ++++++-- mm/vmscan.c | 28 ++++++++++++++++++---------- 5 files changed, 83 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index f10b20f05159..44a840a53974 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -19,6 +19,9 @@ extern struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd, unsigned int flags); +extern int madvise_free_huge_pmd(struct mmu_gather *tlb, + struct vm_area_struct *vma, + pmd_t *pmd, unsigned long addr); extern int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr); @@ -56,6 +59,7 @@ extern pmd_t *page_check_address_pmd(struct page *page, unsigned long address, enum page_check_address_pmd_flag flag, spinlock_t **ptl); +extern int pmd_freeable(pmd_t pmd); #define HPAGE_PMD_ORDER (HPAGE_PMD_SHIFT-PAGE_SHIFT) #define HPAGE_PMD_NR (1<mm; + int ret = 1; + + if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { + struct page *page; + pmd_t orig_pmd; + + orig_pmd = pmdp_get_and_clear(mm, addr, pmd); + + /* No hugepage in swapcache */ + page = pmd_page(orig_pmd); + VM_BUG_ON_PAGE(PageSwapCache(page), page); + + orig_pmd = pmd_mkold(orig_pmd); + orig_pmd = pmd_mkclean(orig_pmd); + + set_pmd_at(mm, addr, pmd, orig_pmd); + tlb_remove_pmd_tlb_entry(tlb, pmd, addr); + spin_unlock(ptl); + ret = 0; + } + + return ret; +} + int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr) { @@ -1587,6 +1617,11 @@ unlock: return NULL; } +int pmd_freeable(pmd_t pmd) +{ + return !pmd_dirty(pmd); +} + static int __split_huge_page_splitting(struct page *page, struct vm_area_struct *vma, unsigned long address) diff --git a/mm/madvise.c b/mm/madvise.c index 0f9e518ad8a5..3288cdfdb79a 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -271,8 +271,26 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, spinlock_t *ptl; pte_t *pte, ptent; struct page *page; + unsigned long next; + + next = pmd_addr_end(addr, end); + if (pmd_trans_huge(*pmd)) { + if (next - addr != HPAGE_PMD_SIZE) { +#ifdef CONFIG_DEBUG_VM + if (!rwsem_is_locked(&mm->mmap_sem)) { + pr_err("%s: mmap_sem is unlocked! 
addr=0x%lx end=0x%lx vma->vm_start=0x%lx vma->vm_end=0x%lx\n", + __func__, addr, end, + vma->vm_start, + vma->vm_end); + BUG(); + } +#endif + split_huge_page_pmd(vma, addr, pmd); + } else if (!madvise_free_huge_pmd(tlb, vma, pmd, addr)) + goto next; + /* fall through */ + } - split_huge_page_pmd(vma, addr, pmd); if (pmd_trans_unstable(pmd)) return 0; @@ -316,6 +334,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, } arch_leave_lazy_mmu_mode(); pte_unmap_unlock(pte - 1, ptl); +next: cond_resched(); return 0; } diff --git a/mm/rmap.c b/mm/rmap.c index 47facbc6f99e..3ba130b1e469 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -746,9 +746,13 @@ static int page_referenced_one(struct page *page, struct vm_area_struct *vma, referenced++; /* - * In this implmentation, MADV_FREE doesn't support THP free + * Use pmd_freeable instead of raw pmd_dirty because in some + * of architecture, pmd_dirty is not defined unless + * CONFIG_TRANSPARNTE_HUGE is enabled */ - dirty++; + if (!pmd_freeable(*pmd)) + dirty++; + spin_unlock(ptl); } else { pte_t *pte; diff --git a/mm/vmscan.c b/mm/vmscan.c index 0c89c6ca7f76..83a0c6df48f2 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1018,17 +1018,25 @@ static unsigned long shrink_page_list(struct list_head *page_list, * Anonymous process memory has backing store? * Try to allocate it some swap space here. */ - if (PageAnon(page) && !PageSwapCache(page) && !freeable) { - if (!(sc->gfp_mask & __GFP_IO)) - goto keep_locked; - if (!add_to_swap(page, page_list)) - goto activate_locked; - may_enter_fs = 1; - - /* Adding to swap updated mapping */ - mapping = page_mapping(page); + if (PageAnon(page) && !PageSwapCache(page)) { + if (!freeable) { + if (!(sc->gfp_mask & __GFP_IO)) + goto keep_locked; + if (!add_to_swap(page, page_list)) + goto activate_locked; + may_enter_fs = 1; + /* Adding to swap updated mapping */ + mapping = page_mapping(page); + } else { + if (likely(!PageTransHuge(page))) + goto unmap; + /* try_to_unmap isn't aware of THP page */ + if (unlikely(split_huge_page_to_list(page, + page_list))) + goto keep_locked; + } } - +unmap: /* * The page is mapped into the page tables of one or more * processes. Try to unmap it here. -- cgit v1.2.3 From a92823be6ab9c7ed56ee9d8e944c2d414d9034dc Mon Sep 17 00:00:00 2001 From: Ganesh Mahendran Date: Fri, 30 Jan 2015 13:12:07 +1100 Subject: mm/zpool: add name argument to create zpool Currently the underlay of zpool: zsmalloc/zbud, do not know who creates them. There is not a method to let zsmalloc/zbud find which caller they belong to. Now we want to add statistics collection in zsmalloc. We need to name the debugfs dir for each pool created. The way suggested by Minchan Kim is to use a name passed by caller(such as zram) to create the zsmalloc pool. /sys/kernel/debug/zsmalloc/zram0 This patch adds an argument `name' to zs_create_pool() and other related functions. 
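For illustration, a sketch of how callers pass the new name argument, modelled on the zram and zswap hunks in the diff below. The example_* wrappers, the "zsmalloc" type string and the GFP_KERNEL flag are assumptions for the sake of the example, not taken from this patch.

/*
 * Illustrative only: trimmed-down callers of the updated APIs.  The
 * pool name is what ends up under /sys/kernel/debug/zsmalloc/.
 */
#include <linux/gfp.h>
#include <linux/kernel.h>
#include <linux/zpool.h>
#include <linux/zsmalloc.h>

static struct zs_pool *example_zs_pool(int device_id)
{
	char pool_name[8];

	/* name the pool after its owner, as zram_meta_alloc() does */
	snprintf(pool_name, sizeof(pool_name), "zram%d", device_id);
	return zs_create_pool(pool_name, GFP_NOIO | __GFP_HIGHMEM);
}

static struct zpool *example_zpool(void)
{
	/*
	 * The generic zpool layer forwards the name to the backend's
	 * create() hook; the type string and gfp flags here are
	 * assumptions, and a NULL ops pointer means no eviction callback.
	 */
	return zpool_create_pool("zsmalloc", "zswap", GFP_KERNEL, NULL);
}

As the zbud hunk below shows, zbud accepts the same name parameter but currently ignores it.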
Signed-off-by: Ganesh Mahendran Acked-by: Minchan Kim Cc: Seth Jennings Cc: Nitin Gupta Cc: Dan Streetman Signed-off-by: Andrew Morton --- drivers/block/zram/zram_drv.c | 8 +++++--- include/linux/zpool.h | 5 +++-- include/linux/zsmalloc.h | 2 +- mm/zbud.c | 3 ++- mm/zpool.c | 6 ++++-- mm/zsmalloc.c | 6 +++--- mm/zswap.c | 5 +++-- 7 files changed, 21 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 0e07652cf7c1..aa5a4c54f057 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -327,9 +327,10 @@ static void zram_meta_free(struct zram_meta *meta, u64 disksize) kfree(meta); } -static struct zram_meta *zram_meta_alloc(u64 disksize) +static struct zram_meta *zram_meta_alloc(int device_id, u64 disksize) { size_t num_pages; + char pool_name[8]; struct zram_meta *meta = kmalloc(sizeof(*meta), GFP_KERNEL); if (!meta) @@ -342,7 +343,8 @@ static struct zram_meta *zram_meta_alloc(u64 disksize) goto out_error; } - meta->mem_pool = zs_create_pool(GFP_NOIO | __GFP_HIGHMEM); + snprintf(pool_name, sizeof(pool_name), "zram%d", device_id); + meta->mem_pool = zs_create_pool(pool_name, GFP_NOIO | __GFP_HIGHMEM); if (!meta->mem_pool) { pr_err("Error creating memory pool\n"); goto out_error; @@ -762,7 +764,7 @@ static ssize_t disksize_store(struct device *dev, return -EINVAL; disksize = PAGE_ALIGN(disksize); - meta = zram_meta_alloc(disksize); + meta = zram_meta_alloc(zram->disk->first_minor, disksize); if (!meta) return -ENOMEM; diff --git a/include/linux/zpool.h b/include/linux/zpool.h index f14bd75f08b3..56529b34dc63 100644 --- a/include/linux/zpool.h +++ b/include/linux/zpool.h @@ -36,7 +36,8 @@ enum zpool_mapmode { ZPOOL_MM_DEFAULT = ZPOOL_MM_RW }; -struct zpool *zpool_create_pool(char *type, gfp_t gfp, struct zpool_ops *ops); +struct zpool *zpool_create_pool(char *type, char *name, + gfp_t gfp, struct zpool_ops *ops); char *zpool_get_type(struct zpool *pool); @@ -80,7 +81,7 @@ struct zpool_driver { atomic_t refcount; struct list_head list; - void *(*create)(gfp_t gfp, struct zpool_ops *ops); + void *(*create)(char *name, gfp_t gfp, struct zpool_ops *ops); void (*destroy)(void *pool); int (*malloc)(void *pool, size_t size, gfp_t gfp, diff --git a/include/linux/zsmalloc.h b/include/linux/zsmalloc.h index 05c214760977..3283c6a55425 100644 --- a/include/linux/zsmalloc.h +++ b/include/linux/zsmalloc.h @@ -36,7 +36,7 @@ enum zs_mapmode { struct zs_pool; -struct zs_pool *zs_create_pool(gfp_t flags); +struct zs_pool *zs_create_pool(char *name, gfp_t flags); void zs_destroy_pool(struct zs_pool *pool); unsigned long zs_malloc(struct zs_pool *pool, size_t size); diff --git a/mm/zbud.c b/mm/zbud.c index 4e387bea702e..2ee4e4520493 100644 --- a/mm/zbud.c +++ b/mm/zbud.c @@ -130,7 +130,8 @@ static struct zbud_ops zbud_zpool_ops = { .evict = zbud_zpool_evict }; -static void *zbud_zpool_create(gfp_t gfp, struct zpool_ops *zpool_ops) +static void *zbud_zpool_create(char *name, gfp_t gfp, + struct zpool_ops *zpool_ops) { return zbud_create_pool(gfp, zpool_ops ? &zbud_zpool_ops : NULL); } diff --git a/mm/zpool.c b/mm/zpool.c index 739cdf0d183a..bacdab6e47de 100644 --- a/mm/zpool.c +++ b/mm/zpool.c @@ -129,6 +129,7 @@ static void zpool_put_driver(struct zpool_driver *driver) /** * zpool_create_pool() - Create a new zpool * @type The type of the zpool to create (e.g. zbud, zsmalloc) + * @name The name of the zpool (e.g. zram0, zswap) * @gfp The GFP flags to use when allocating the pool. 
* @ops The optional ops callback. * @@ -140,7 +141,8 @@ static void zpool_put_driver(struct zpool_driver *driver) * * Returns: New zpool on success, NULL on failure. */ -struct zpool *zpool_create_pool(char *type, gfp_t gfp, struct zpool_ops *ops) +struct zpool *zpool_create_pool(char *type, char *name, gfp_t gfp, + struct zpool_ops *ops) { struct zpool_driver *driver; struct zpool *zpool; @@ -168,7 +170,7 @@ struct zpool *zpool_create_pool(char *type, gfp_t gfp, struct zpool_ops *ops) zpool->type = driver->type; zpool->driver = driver; - zpool->pool = driver->create(gfp, ops); + zpool->pool = driver->create(name, gfp, ops); zpool->ops = ops; if (!zpool->pool) { diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index b72403927aa4..2359e61b02bf 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -246,9 +246,9 @@ struct mapping_area { #ifdef CONFIG_ZPOOL -static void *zs_zpool_create(gfp_t gfp, struct zpool_ops *zpool_ops) +static void *zs_zpool_create(char *name, gfp_t gfp, struct zpool_ops *zpool_ops) { - return zs_create_pool(gfp); + return zs_create_pool(name, gfp); } static void zs_zpool_destroy(void *pool) @@ -1148,7 +1148,7 @@ EXPORT_SYMBOL_GPL(zs_free); * On success, a pointer to the newly created pool is returned, * otherwise NULL. */ -struct zs_pool *zs_create_pool(gfp_t flags) +struct zs_pool *zs_create_pool(char *name, gfp_t flags) { int i; struct zs_pool *pool; diff --git a/mm/zswap.c b/mm/zswap.c index 0cfce9bc51e4..4249e82ff934 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -906,11 +906,12 @@ static int __init init_zswap(void) pr_info("loading zswap\n"); - zswap_pool = zpool_create_pool(zswap_zpool_type, gfp, &zswap_zpool_ops); + zswap_pool = zpool_create_pool(zswap_zpool_type, "zswap", gfp, + &zswap_zpool_ops); if (!zswap_pool && strcmp(zswap_zpool_type, ZSWAP_ZPOOL_DEFAULT)) { pr_info("%s zpool not available\n", zswap_zpool_type); zswap_zpool_type = ZSWAP_ZPOOL_DEFAULT; - zswap_pool = zpool_create_pool(zswap_zpool_type, gfp, + zswap_pool = zpool_create_pool(zswap_zpool_type, "zswap", gfp, &zswap_zpool_ops); } if (!zswap_pool) { -- cgit v1.2.3 From f8731a4706c2e0a32ed6679d6795fe051744b13b Mon Sep 17 00:00:00 2001 From: Petr Cermak Date: Fri, 30 Jan 2015 13:12:07 +1100 Subject: fs/proc/task_mmu.c: add user-space support for resetting mm->hiwater_rss (peak RSS) Peak resident size of a process can be reset back to the process's current rss value by writing "5" to /proc/pid/clear_refs. The driving use-case for this would be getting the peak RSS value, which can be retrieved from the VmHWM field in /proc/pid/status, per benchmark iteration or test scenario. [akpm@linux-foundation.org: clarify behaviour in documentation] Signed-off-by: Petr Cermak Cc: Bjorn Helgaas Cc: Primiano Tucci Cc: Petr Cermak Signed-off-by: Andrew Morton --- Documentation/filesystems/proc.txt | 4 ++++ fs/proc/task_mmu.c | 14 ++++++++++++++ include/linux/mm.h | 5 +++++ 3 files changed, 23 insertions(+) (limited to 'include') diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index 6d59ffe791ad..7f79bd89d12e 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -489,6 +489,10 @@ To clear the bits for the file mapped pages associated with the process To clear the soft-dirty bit > echo 4 > /proc/PID/clear_refs +To reset the peak resident set size ("high water mark") to the process's +current value: + > echo 5 > /proc/PID/clear_refs + Any other value written to /proc/PID/clear_refs will have no effect. 
The /proc/pid/pagemap gives the PFN, which can be used to find the pageflags diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index cd1eb80d5e88..3f2eed488722 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -732,6 +732,7 @@ enum clear_refs_types { CLEAR_REFS_ANON, CLEAR_REFS_MAPPED, CLEAR_REFS_SOFT_DIRTY, + CLEAR_REFS_MM_HIWATER_RSS, CLEAR_REFS_LAST, }; @@ -907,6 +908,18 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, .mm = mm, .private = &cp, }; + + if (type == CLEAR_REFS_MM_HIWATER_RSS) { + /* + * Writing 5 to /proc/pid/clear_refs resets the peak + * resident set size to this mm's current rss value. + */ + down_write(&mm->mmap_sem); + reset_mm_hiwater_rss(mm); + up_write(&mm->mmap_sem); + goto out_mm; + } + down_read(&mm->mmap_sem); if (type == CLEAR_REFS_SOFT_DIRTY) { for (vma = mm->mmap; vma; vma = vma->vm_next) { @@ -928,6 +941,7 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, mmu_notifier_invalidate_range_end(mm, 0, -1); flush_tlb_mm(mm); up_read(&mm->mmap_sem); +out_mm: mmput(mm); } put_task_struct(task); diff --git a/include/linux/mm.h b/include/linux/mm.h index 39c7f4f09555..4661b3aa373b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1401,6 +1401,11 @@ static inline void update_hiwater_vm(struct mm_struct *mm) mm->hiwater_vm = mm->total_vm; } +static inline void reset_mm_hiwater_rss(struct mm_struct *mm) +{ + mm->hiwater_rss = get_mm_rss(mm); +} + static inline void setmax_mm_hiwater_rss(unsigned long *maxrss, struct mm_struct *mm) { -- cgit v1.2.3 From b64be3da196f7f31bdfb7ae627500daa721d4097 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Fri, 30 Jan 2015 13:12:09 +1100 Subject: all arches, signal: move restart_block to struct task_struct If an attacker can cause a controlled kernel stack overflow, overwriting the restart block is a very juicy exploit target. This is because the restart_block is held in the same memory allocation as the kernel stack. Moving the restart block to struct task_struct prevents this exploit by making the restart_block harder to locate. Note that there are other fields in thread_info that are also easy targets, at least on some architectures. It's also a decent simplification, since the restart code is more or less identical on all architectures. Signed-off-by: Andy Lutomirski Cc: Thomas Gleixner Cc: Al Viro Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Kees Cook Cc: David Miller Acked-by: Richard Weinberger Cc: Richard Henderson Cc: Ivan Kokshaysky Cc: Matt Turner Cc: Vineet Gupta Cc: Russell King Cc: Catalin Marinas Cc: Will Deacon Cc: Haavard Skinnemoen Cc: Hans-Christian Egtvedt Cc: Steven Miao Cc: Mark Salter Cc: Aurelien Jacquiot Cc: Mikael Starvik Cc: Jesper Nilsson Cc: David Howells Cc: Richard Kuo Cc: "Luck, Tony" Cc: Geert Uytterhoeven Cc: James Hogan Cc: Michal Simek Cc: Ralf Baechle Cc: Jonas Bonn Cc: "James E.J. 
Bottomley" Cc: Helge Deller Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Acked-by: Michael Ellerman (powerpc) Tested-by: Michael Ellerman (powerpc) Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: Chen Liqin Cc: Lennox Wu Cc: Chris Metcalf Cc: Guan Xuetao Cc: Chris Zankel Cc: Max Filippov Cc: Oleg Nesterov Signed-off-by: Andrew Morton --- arch/alpha/include/asm/thread_info.h | 5 ----- arch/alpha/kernel/signal.c | 2 +- arch/arc/include/asm/thread_info.h | 4 ---- arch/arc/kernel/signal.c | 2 +- arch/arm/include/asm/thread_info.h | 4 ---- arch/arm/kernel/signal.c | 4 ++-- arch/arm64/include/asm/thread_info.h | 4 ---- arch/arm64/kernel/signal.c | 2 +- arch/arm64/kernel/signal32.c | 4 ++-- arch/avr32/include/asm/thread_info.h | 4 ---- arch/avr32/kernel/asm-offsets.c | 1 - arch/avr32/kernel/signal.c | 2 +- arch/blackfin/include/asm/thread_info.h | 4 ---- arch/blackfin/kernel/signal.c | 2 +- arch/c6x/include/asm/thread_info.h | 4 ---- arch/c6x/kernel/signal.c | 2 +- arch/cris/arch-v10/kernel/signal.c | 2 +- arch/cris/arch-v32/kernel/signal.c | 2 +- arch/cris/include/asm/thread_info.h | 4 ---- arch/frv/include/asm/thread_info.h | 4 ---- arch/frv/kernel/asm-offsets.c | 1 - arch/frv/kernel/signal.c | 2 +- arch/hexagon/include/asm/thread_info.h | 4 ---- arch/hexagon/kernel/signal.c | 2 +- arch/ia64/include/asm/thread_info.h | 4 ---- arch/ia64/kernel/signal.c | 2 +- arch/m32r/include/asm/thread_info.h | 5 ----- arch/m32r/kernel/signal.c | 2 +- arch/m68k/include/asm/thread_info.h | 4 ---- arch/m68k/kernel/signal.c | 4 ++-- arch/metag/include/asm/thread_info.h | 4 ---- arch/metag/kernel/signal.c | 2 +- arch/microblaze/include/asm/thread_info.h | 4 ---- arch/microblaze/kernel/signal.c | 2 +- arch/mips/include/asm/thread_info.h | 4 ---- arch/mips/kernel/asm-offsets.c | 1 - arch/mips/kernel/signal.c | 2 +- arch/mips/kernel/signal32.c | 2 +- arch/mn10300/include/asm/thread_info.h | 4 ---- arch/mn10300/kernel/asm-offsets.c | 1 - arch/mn10300/kernel/signal.c | 2 +- arch/openrisc/include/asm/thread_info.h | 4 ---- arch/openrisc/kernel/signal.c | 2 +- arch/parisc/include/asm/thread_info.h | 4 ---- arch/parisc/kernel/signal.c | 2 +- arch/powerpc/include/asm/thread_info.h | 4 ---- arch/powerpc/kernel/signal_32.c | 4 ++-- arch/powerpc/kernel/signal_64.c | 2 +- arch/s390/include/asm/thread_info.h | 4 ---- arch/s390/kernel/compat_signal.c | 2 +- arch/s390/kernel/signal.c | 2 +- arch/score/include/asm/thread_info.h | 4 ---- arch/score/kernel/asm-offsets.c | 1 - arch/score/kernel/signal.c | 2 +- arch/sh/include/asm/thread_info.h | 4 ---- arch/sh/kernel/asm-offsets.c | 1 - arch/sh/kernel/signal_32.c | 4 ++-- arch/sh/kernel/signal_64.c | 4 ++-- arch/sparc/include/asm/thread_info_32.h | 6 ------ arch/sparc/include/asm/thread_info_64.h | 12 +++--------- arch/sparc/kernel/signal32.c | 4 ++-- arch/sparc/kernel/signal_32.c | 2 +- arch/sparc/kernel/signal_64.c | 2 +- arch/sparc/kernel/traps_64.c | 2 -- arch/tile/include/asm/thread_info.h | 4 ---- arch/tile/kernel/signal.c | 2 +- arch/um/include/asm/thread_info.h | 4 ---- arch/unicore32/include/asm/thread_info.h | 4 ---- arch/unicore32/kernel/signal.c | 2 +- arch/x86/ia32/ia32_signal.c | 2 +- arch/x86/include/asm/thread_info.h | 4 ---- arch/x86/kernel/signal.c | 2 +- arch/x86/um/signal.c | 2 +- arch/xtensa/include/asm/thread_info.h | 5 ----- arch/xtensa/kernel/signal.c | 2 +- fs/select.c | 2 +- include/linux/init_task.h | 3 +++ include/linux/sched.h | 2 ++ kernel/compat.c | 5 ++--- kernel/futex.c | 2 +- kernel/signal.c | 2 +- kernel/time/alarmtimer.c | 2 +- kernel/time/hrtimer.c 
| 2 +- kernel/time/posix-cpu-timers.c | 3 +-- 84 files changed, 61 insertions(+), 193 deletions(-) (limited to 'include') diff --git a/arch/alpha/include/asm/thread_info.h b/arch/alpha/include/asm/thread_info.h index 48bbea6898b3..d5b98ab514bb 100644 --- a/arch/alpha/include/asm/thread_info.h +++ b/arch/alpha/include/asm/thread_info.h @@ -27,8 +27,6 @@ struct thread_info { int bpt_nsaved; unsigned long bpt_addr[2]; /* breakpoint handling */ unsigned int bpt_insn[2]; - - struct restart_block restart_block; }; /* @@ -40,9 +38,6 @@ struct thread_info { .exec_domain = &default_exec_domain, \ .addr_limit = KERNEL_DS, \ .preempt_count = INIT_PREEMPT_COUNT, \ - .restart_block = { \ - .fn = do_no_restart_syscall, \ - }, \ } #define init_thread_info (init_thread_union.thread_info) diff --git a/arch/alpha/kernel/signal.c b/arch/alpha/kernel/signal.c index 6cec2881acbf..8dbfb15f1745 100644 --- a/arch/alpha/kernel/signal.c +++ b/arch/alpha/kernel/signal.c @@ -150,7 +150,7 @@ restore_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs) struct switch_stack *sw = (struct switch_stack *)regs - 1; long i, err = __get_user(regs->pc, &sc->sc_pc); - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; sw->r26 = (unsigned long) ret_from_sys_call; diff --git a/arch/arc/include/asm/thread_info.h b/arch/arc/include/asm/thread_info.h index 02bc5ec0fb2e..1163a1838ac1 100644 --- a/arch/arc/include/asm/thread_info.h +++ b/arch/arc/include/asm/thread_info.h @@ -46,7 +46,6 @@ struct thread_info { struct exec_domain *exec_domain;/* execution domain */ __u32 cpu; /* current CPU */ unsigned long thr_ptr; /* TLS ptr */ - struct restart_block restart_block; }; /* @@ -62,9 +61,6 @@ struct thread_info { .cpu = 0, \ .preempt_count = INIT_PREEMPT_COUNT, \ .addr_limit = KERNEL_DS, \ - .restart_block = { \ - .fn = do_no_restart_syscall, \ - }, \ } #define init_thread_info (init_thread_union.thread_info) diff --git a/arch/arc/kernel/signal.c b/arch/arc/kernel/signal.c index cb3142a2d40b..114234e83caa 100644 --- a/arch/arc/kernel/signal.c +++ b/arch/arc/kernel/signal.c @@ -104,7 +104,7 @@ SYSCALL_DEFINE0(rt_sigreturn) struct pt_regs *regs = current_pt_regs(); /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; /* Since we stacked the signal on a word boundary, * then 'sp' should be word aligned here. 
If it's diff --git a/arch/arm/include/asm/thread_info.h b/arch/arm/include/asm/thread_info.h index d890e41f5520..72812a1f3d1c 100644 --- a/arch/arm/include/asm/thread_info.h +++ b/arch/arm/include/asm/thread_info.h @@ -68,7 +68,6 @@ struct thread_info { #ifdef CONFIG_ARM_THUMBEE unsigned long thumbee_state; /* ThumbEE Handler Base register */ #endif - struct restart_block restart_block; }; #define INIT_THREAD_INFO(tsk) \ @@ -81,9 +80,6 @@ struct thread_info { .cpu_domain = domain_val(DOMAIN_USER, DOMAIN_MANAGER) | \ domain_val(DOMAIN_KERNEL, DOMAIN_MANAGER) | \ domain_val(DOMAIN_IO, DOMAIN_CLIENT), \ - .restart_block = { \ - .fn = do_no_restart_syscall, \ - }, \ } #define init_thread_info (init_thread_union.thread_info) diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c index 8aa6f1b87c9e..023ac905e4c3 100644 --- a/arch/arm/kernel/signal.c +++ b/arch/arm/kernel/signal.c @@ -191,7 +191,7 @@ asmlinkage int sys_sigreturn(struct pt_regs *regs) struct sigframe __user *frame; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; /* * Since we stacked the signal on a 64-bit boundary, @@ -221,7 +221,7 @@ asmlinkage int sys_rt_sigreturn(struct pt_regs *regs) struct rt_sigframe __user *frame; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; /* * Since we stacked the signal on a 64-bit boundary, diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h index 459bf8e53208..702e1e6a0d80 100644 --- a/arch/arm64/include/asm/thread_info.h +++ b/arch/arm64/include/asm/thread_info.h @@ -48,7 +48,6 @@ struct thread_info { mm_segment_t addr_limit; /* address limit */ struct task_struct *task; /* main task structure */ struct exec_domain *exec_domain; /* execution domain */ - struct restart_block restart_block; int preempt_count; /* 0 => preemptable, <0 => bug */ int cpu; /* cpu */ }; @@ -60,9 +59,6 @@ struct thread_info { .flags = 0, \ .preempt_count = INIT_PREEMPT_COUNT, \ .addr_limit = KERNEL_DS, \ - .restart_block = { \ - .fn = do_no_restart_syscall, \ - }, \ } #define init_thread_info (init_thread_union.thread_info) diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c index 6fa792137eda..660ccf9f7524 100644 --- a/arch/arm64/kernel/signal.c +++ b/arch/arm64/kernel/signal.c @@ -131,7 +131,7 @@ asmlinkage long sys_rt_sigreturn(struct pt_regs *regs) struct rt_sigframe __user *frame; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; /* * Since we stacked the signal on a 128-bit boundary, then 'sp' should diff --git a/arch/arm64/kernel/signal32.c b/arch/arm64/kernel/signal32.c index 5a1ba6e80d4e..64565c4ecbbc 100644 --- a/arch/arm64/kernel/signal32.c +++ b/arch/arm64/kernel/signal32.c @@ -347,7 +347,7 @@ asmlinkage int compat_sys_sigreturn(struct pt_regs *regs) struct compat_sigframe __user *frame; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; /* * Since we stacked the signal on a 64-bit boundary, @@ -381,7 +381,7 @@ asmlinkage int compat_sys_rt_sigreturn(struct pt_regs *regs) struct compat_rt_sigframe __user 
*frame; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; /* * Since we stacked the signal on a 64-bit boundary, diff --git a/arch/avr32/include/asm/thread_info.h b/arch/avr32/include/asm/thread_info.h index a978f3fe7c25..d56afa99a514 100644 --- a/arch/avr32/include/asm/thread_info.h +++ b/arch/avr32/include/asm/thread_info.h @@ -30,7 +30,6 @@ struct thread_info { saved by debug handler when setting up trampoline */ - struct restart_block restart_block; __u8 supervisor_stack[0]; }; @@ -41,9 +40,6 @@ struct thread_info { .flags = 0, \ .cpu = 0, \ .preempt_count = INIT_PREEMPT_COUNT, \ - .restart_block = { \ - .fn = do_no_restart_syscall \ - } \ } #define init_thread_info (init_thread_union.thread_info) diff --git a/arch/avr32/kernel/asm-offsets.c b/arch/avr32/kernel/asm-offsets.c index d6a8193a1d2f..e41c84516e5d 100644 --- a/arch/avr32/kernel/asm-offsets.c +++ b/arch/avr32/kernel/asm-offsets.c @@ -18,7 +18,6 @@ void foo(void) OFFSET(TI_preempt_count, thread_info, preempt_count); OFFSET(TI_rar_saved, thread_info, rar_saved); OFFSET(TI_rsr_saved, thread_info, rsr_saved); - OFFSET(TI_restart_block, thread_info, restart_block); BLANK(); OFFSET(TSK_active_mm, task_struct, active_mm); BLANK(); diff --git a/arch/avr32/kernel/signal.c b/arch/avr32/kernel/signal.c index d309fbcc3bd6..8f1c63b9b983 100644 --- a/arch/avr32/kernel/signal.c +++ b/arch/avr32/kernel/signal.c @@ -69,7 +69,7 @@ asmlinkage int sys_rt_sigreturn(struct pt_regs *regs) sigset_t set; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; frame = (struct rt_sigframe __user *)regs->sp; pr_debug("SIG return: frame = %p\n", frame); diff --git a/arch/blackfin/include/asm/thread_info.h b/arch/blackfin/include/asm/thread_info.h index 55f473bdad36..57c3a8bd583d 100644 --- a/arch/blackfin/include/asm/thread_info.h +++ b/arch/blackfin/include/asm/thread_info.h @@ -42,7 +42,6 @@ struct thread_info { int cpu; /* cpu we're on */ int preempt_count; /* 0 => preemptable, <0 => BUG */ mm_segment_t addr_limit; /* address limit */ - struct restart_block restart_block; #ifndef CONFIG_SMP struct l1_scratch_task_info l1_task_info; #endif @@ -58,9 +57,6 @@ struct thread_info { .flags = 0, \ .cpu = 0, \ .preempt_count = INIT_PREEMPT_COUNT, \ - .restart_block = { \ - .fn = do_no_restart_syscall, \ - }, \ } #define init_thread_info (init_thread_union.thread_info) #define init_stack (init_thread_union.stack) diff --git a/arch/blackfin/kernel/signal.c b/arch/blackfin/kernel/signal.c index ef275571d885..f2a8b5493bd3 100644 --- a/arch/blackfin/kernel/signal.c +++ b/arch/blackfin/kernel/signal.c @@ -44,7 +44,7 @@ rt_restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, int *p int err = 0; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; #define RESTORE(x) err |= __get_user(regs->x, &sc->sc_##x) diff --git a/arch/c6x/include/asm/thread_info.h b/arch/c6x/include/asm/thread_info.h index d4e9ef87076d..584e253f3217 100644 --- a/arch/c6x/include/asm/thread_info.h +++ b/arch/c6x/include/asm/thread_info.h @@ -45,7 +45,6 @@ struct thread_info { int cpu; /* cpu we're on */ int preempt_count; /* 0 = preemptable, <0 = BUG */ mm_segment_t addr_limit; /* 
thread address space */ - struct restart_block restart_block; }; /* @@ -61,9 +60,6 @@ struct thread_info { .cpu = 0, \ .preempt_count = INIT_PREEMPT_COUNT, \ .addr_limit = KERNEL_DS, \ - .restart_block = { \ - .fn = do_no_restart_syscall, \ - }, \ } #define init_thread_info (init_thread_union.thread_info) diff --git a/arch/c6x/kernel/signal.c b/arch/c6x/kernel/signal.c index fe68226f6c4d..3c4bb5a5c382 100644 --- a/arch/c6x/kernel/signal.c +++ b/arch/c6x/kernel/signal.c @@ -68,7 +68,7 @@ asmlinkage int do_rt_sigreturn(struct pt_regs *regs) sigset_t set; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; /* * Since we stacked the signal on a dword boundary, diff --git a/arch/cris/arch-v10/kernel/signal.c b/arch/cris/arch-v10/kernel/signal.c index 9b32d338838b..74d7ba35120d 100644 --- a/arch/cris/arch-v10/kernel/signal.c +++ b/arch/cris/arch-v10/kernel/signal.c @@ -67,7 +67,7 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc) unsigned long old_usp; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; /* restore the regs from &sc->regs (same as sc, since regs is first) * (sc is already checked for VERIFY_READ since the sigframe was diff --git a/arch/cris/arch-v32/kernel/signal.c b/arch/cris/arch-v32/kernel/signal.c index 78ce3b1c9bcb..870e3e069318 100644 --- a/arch/cris/arch-v32/kernel/signal.c +++ b/arch/cris/arch-v32/kernel/signal.c @@ -59,7 +59,7 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc) unsigned long old_usp; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; /* * Restore the registers from &sc->regs. 
sc is already checked diff --git a/arch/cris/include/asm/thread_info.h b/arch/cris/include/asm/thread_info.h index 55dede18c032..7286db5ed90e 100644 --- a/arch/cris/include/asm/thread_info.h +++ b/arch/cris/include/asm/thread_info.h @@ -38,7 +38,6 @@ struct thread_info { 0-0xBFFFFFFF for user-thead 0-0xFFFFFFFF for kernel-thread */ - struct restart_block restart_block; __u8 supervisor_stack[0]; }; @@ -56,9 +55,6 @@ struct thread_info { .cpu = 0, \ .preempt_count = INIT_PREEMPT_COUNT, \ .addr_limit = KERNEL_DS, \ - .restart_block = { \ - .fn = do_no_restart_syscall, \ - }, \ } #define init_thread_info (init_thread_union.thread_info) diff --git a/arch/frv/include/asm/thread_info.h b/arch/frv/include/asm/thread_info.h index af29e17c0181..6b917f1c2955 100644 --- a/arch/frv/include/asm/thread_info.h +++ b/arch/frv/include/asm/thread_info.h @@ -41,7 +41,6 @@ struct thread_info { * 0-0xBFFFFFFF for user-thead * 0-0xFFFFFFFF for kernel-thread */ - struct restart_block restart_block; __u8 supervisor_stack[0]; }; @@ -65,9 +64,6 @@ struct thread_info { .cpu = 0, \ .preempt_count = INIT_PREEMPT_COUNT, \ .addr_limit = KERNEL_DS, \ - .restart_block = { \ - .fn = do_no_restart_syscall, \ - }, \ } #define init_thread_info (init_thread_union.thread_info) diff --git a/arch/frv/kernel/asm-offsets.c b/arch/frv/kernel/asm-offsets.c index 9de96843a278..446e89d500cc 100644 --- a/arch/frv/kernel/asm-offsets.c +++ b/arch/frv/kernel/asm-offsets.c @@ -40,7 +40,6 @@ void foo(void) OFFSET(TI_CPU, thread_info, cpu); OFFSET(TI_PREEMPT_COUNT, thread_info, preempt_count); OFFSET(TI_ADDR_LIMIT, thread_info, addr_limit); - OFFSET(TI_RESTART_BLOCK, thread_info, restart_block); BLANK(); /* offsets into register file storage */ diff --git a/arch/frv/kernel/signal.c b/arch/frv/kernel/signal.c index dc3d59de0870..336713ab4745 100644 --- a/arch/frv/kernel/signal.c +++ b/arch/frv/kernel/signal.c @@ -62,7 +62,7 @@ static int restore_sigcontext(struct sigcontext __user *sc, int *_gr8) unsigned long tbr, psr; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; tbr = user->i.tbr; psr = user->i.psr; diff --git a/arch/hexagon/include/asm/thread_info.h b/arch/hexagon/include/asm/thread_info.h index a59dad3b3695..bacd3d6030c5 100644 --- a/arch/hexagon/include/asm/thread_info.h +++ b/arch/hexagon/include/asm/thread_info.h @@ -56,7 +56,6 @@ struct thread_info { * used for syscalls somehow; * seems to have a function pointer and four arguments */ - struct restart_block restart_block; /* Points to the current pt_regs frame */ struct pt_regs *regs; /* @@ -83,9 +82,6 @@ struct thread_info { .cpu = 0, \ .preempt_count = 1, \ .addr_limit = KERNEL_DS, \ - .restart_block = { \ - .fn = do_no_restart_syscall, \ - }, \ .sp = 0, \ .regs = NULL, \ } diff --git a/arch/hexagon/kernel/signal.c b/arch/hexagon/kernel/signal.c index eadd70e47e7e..b039a624c170 100644 --- a/arch/hexagon/kernel/signal.c +++ b/arch/hexagon/kernel/signal.c @@ -239,7 +239,7 @@ asmlinkage int sys_rt_sigreturn(void) sigset_t blocked; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; frame = (struct rt_sigframe __user *)pt_psp(regs); if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) diff --git a/arch/ia64/include/asm/thread_info.h b/arch/ia64/include/asm/thread_info.h index 5b17418b4223..c16f21a068ff 100644 --- 
a/arch/ia64/include/asm/thread_info.h +++ b/arch/ia64/include/asm/thread_info.h @@ -27,7 +27,6 @@ struct thread_info { __u32 status; /* Thread synchronous flags */ mm_segment_t addr_limit; /* user-level address space limit */ int preempt_count; /* 0=premptable, <0=BUG; will also serve as bh-counter */ - struct restart_block restart_block; #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE __u64 ac_stamp; __u64 ac_leave; @@ -46,9 +45,6 @@ struct thread_info { .cpu = 0, \ .addr_limit = KERNEL_DS, \ .preempt_count = INIT_PREEMPT_COUNT, \ - .restart_block = { \ - .fn = do_no_restart_syscall, \ - }, \ } #ifndef ASM_OFFSETS_C diff --git a/arch/ia64/kernel/signal.c b/arch/ia64/kernel/signal.c index 6d92170be457..b3a124da71e5 100644 --- a/arch/ia64/kernel/signal.c +++ b/arch/ia64/kernel/signal.c @@ -46,7 +46,7 @@ restore_sigcontext (struct sigcontext __user *sc, struct sigscratch *scr) long err; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; /* restore scratch that always needs gets updated during signal delivery: */ err = __get_user(flags, &sc->sc_flags); diff --git a/arch/m32r/include/asm/thread_info.h b/arch/m32r/include/asm/thread_info.h index 00171703402f..32422d0211c3 100644 --- a/arch/m32r/include/asm/thread_info.h +++ b/arch/m32r/include/asm/thread_info.h @@ -34,7 +34,6 @@ struct thread_info { 0-0xBFFFFFFF for user-thread 0-0xFFFFFFFF for kernel-thread */ - struct restart_block restart_block; __u8 supervisor_stack[0]; }; @@ -49,7 +48,6 @@ struct thread_info { #define TI_CPU 0x00000010 #define TI_PRE_COUNT 0x00000014 #define TI_ADDR_LIMIT 0x00000018 -#define TI_RESTART_BLOCK 0x000001C #endif @@ -68,9 +66,6 @@ struct thread_info { .cpu = 0, \ .preempt_count = INIT_PREEMPT_COUNT, \ .addr_limit = KERNEL_DS, \ - .restart_block = { \ - .fn = do_no_restart_syscall, \ - }, \ } #define init_thread_info (init_thread_union.thread_info) diff --git a/arch/m32r/kernel/signal.c b/arch/m32r/kernel/signal.c index 95408b8f130a..7736c6660a15 100644 --- a/arch/m32r/kernel/signal.c +++ b/arch/m32r/kernel/signal.c @@ -48,7 +48,7 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, unsigned int err = 0; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; #define COPY(x) err |= __get_user(regs->x, &sc->sc_##x) COPY(r4); diff --git a/arch/m68k/include/asm/thread_info.h b/arch/m68k/include/asm/thread_info.h index 21a4784ca5a1..c54256e69e64 100644 --- a/arch/m68k/include/asm/thread_info.h +++ b/arch/m68k/include/asm/thread_info.h @@ -31,7 +31,6 @@ struct thread_info { int preempt_count; /* 0 => preemptable, <0 => BUG */ __u32 cpu; /* should always be 0 on m68k */ unsigned long tp_value; /* thread pointer */ - struct restart_block restart_block; }; #endif /* __ASSEMBLY__ */ @@ -41,9 +40,6 @@ struct thread_info { .exec_domain = &default_exec_domain, \ .addr_limit = KERNEL_DS, \ .preempt_count = INIT_PREEMPT_COUNT, \ - .restart_block = { \ - .fn = do_no_restart_syscall, \ - }, \ } #define init_stack (init_thread_union.stack) diff --git a/arch/m68k/kernel/signal.c b/arch/m68k/kernel/signal.c index 967a8b7e1527..d7179281e74a 100644 --- a/arch/m68k/kernel/signal.c +++ b/arch/m68k/kernel/signal.c @@ -655,7 +655,7 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *usc, void __u int err = 0; /* Always make any pending 
restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; /* get previous context */ if (copy_from_user(&context, usc, sizeof(context))) @@ -693,7 +693,7 @@ rt_restore_ucontext(struct pt_regs *regs, struct switch_stack *sw, int err; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; err = __get_user(temp, &uc->uc_mcontext.version); if (temp != MCONTEXT_VERSION) diff --git a/arch/metag/include/asm/thread_info.h b/arch/metag/include/asm/thread_info.h index 47711336119e..ff4332435d15 100644 --- a/arch/metag/include/asm/thread_info.h +++ b/arch/metag/include/asm/thread_info.h @@ -35,7 +35,6 @@ struct thread_info { int preempt_count; /* 0 => preemptable, <0 => BUG */ mm_segment_t addr_limit; /* thread address space */ - struct restart_block restart_block; u8 supervisor_stack[0]; }; @@ -74,9 +73,6 @@ struct thread_info { .cpu = 0, \ .preempt_count = INIT_PREEMPT_COUNT, \ .addr_limit = KERNEL_DS, \ - .restart_block = { \ - .fn = do_no_restart_syscall, \ - }, \ } #define init_thread_info (init_thread_union.thread_info) diff --git a/arch/metag/kernel/signal.c b/arch/metag/kernel/signal.c index 0d100d5c1407..ce49d429c74a 100644 --- a/arch/metag/kernel/signal.c +++ b/arch/metag/kernel/signal.c @@ -48,7 +48,7 @@ static int restore_sigcontext(struct pt_regs *regs, int err; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; err = metag_gp_regs_copyin(regs, 0, sizeof(struct user_gp_regs), NULL, &sc->regs); diff --git a/arch/microblaze/include/asm/thread_info.h b/arch/microblaze/include/asm/thread_info.h index 8c9d36591a03..b699fbd7de4a 100644 --- a/arch/microblaze/include/asm/thread_info.h +++ b/arch/microblaze/include/asm/thread_info.h @@ -71,7 +71,6 @@ struct thread_info { __u32 cpu; /* current CPU */ __s32 preempt_count; /* 0 => preemptable,< 0 => BUG*/ mm_segment_t addr_limit; /* thread address space */ - struct restart_block restart_block; struct cpu_context cpu_context; }; @@ -87,9 +86,6 @@ struct thread_info { .cpu = 0, \ .preempt_count = INIT_PREEMPT_COUNT, \ .addr_limit = KERNEL_DS, \ - .restart_block = { \ - .fn = do_no_restart_syscall, \ - }, \ } #define init_thread_info (init_thread_union.thread_info) diff --git a/arch/microblaze/kernel/signal.c b/arch/microblaze/kernel/signal.c index 8955a3829cf0..0245c27fa720 100644 --- a/arch/microblaze/kernel/signal.c +++ b/arch/microblaze/kernel/signal.c @@ -89,7 +89,7 @@ asmlinkage long sys_rt_sigreturn(struct pt_regs *regs) int rval; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) goto badframe; diff --git a/arch/mips/include/asm/thread_info.h b/arch/mips/include/asm/thread_info.h index 99eea59604e9..75a8c55d3dc1 100644 --- a/arch/mips/include/asm/thread_info.h +++ b/arch/mips/include/asm/thread_info.h @@ -34,7 +34,6 @@ struct thread_info { * 0x7fffffff for user-thead * 0xffffffff for kernel-thread */ - struct restart_block restart_block; struct pt_regs *regs; }; @@ -49,9 +48,6 @@ struct thread_info { .cpu = 0, \ .preempt_count = INIT_PREEMPT_COUNT, \ .addr_limit = KERNEL_DS, \ - .restart_block = { 
\ - .fn = do_no_restart_syscall, \ - }, \ } #define init_thread_info (init_thread_union.thread_info) diff --git a/arch/mips/kernel/asm-offsets.c b/arch/mips/kernel/asm-offsets.c index b1d84bd4efb3..3b2dfdb4865f 100644 --- a/arch/mips/kernel/asm-offsets.c +++ b/arch/mips/kernel/asm-offsets.c @@ -98,7 +98,6 @@ void output_thread_info_defines(void) OFFSET(TI_CPU, thread_info, cpu); OFFSET(TI_PRE_COUNT, thread_info, preempt_count); OFFSET(TI_ADDR_LIMIT, thread_info, addr_limit); - OFFSET(TI_RESTART_BLOCK, thread_info, restart_block); OFFSET(TI_REGS, thread_info, regs); DEFINE(_THREAD_SIZE, THREAD_SIZE); DEFINE(_THREAD_MASK, THREAD_MASK); diff --git a/arch/mips/kernel/signal.c b/arch/mips/kernel/signal.c index 545bf11bd2ed..6a28c792d862 100644 --- a/arch/mips/kernel/signal.c +++ b/arch/mips/kernel/signal.c @@ -243,7 +243,7 @@ int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc) int i; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; err |= __get_user(regs->cp0_epc, &sc->sc_pc); diff --git a/arch/mips/kernel/signal32.c b/arch/mips/kernel/signal32.c index d69179c0d49d..19a7705f2a01 100644 --- a/arch/mips/kernel/signal32.c +++ b/arch/mips/kernel/signal32.c @@ -220,7 +220,7 @@ static int restore_sigcontext32(struct pt_regs *regs, int i; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; err |= __get_user(regs->cp0_epc, &sc->sc_pc); err |= __get_user(regs->hi, &sc->sc_mdhi); diff --git a/arch/mn10300/include/asm/thread_info.h b/arch/mn10300/include/asm/thread_info.h index bf280eaccd36..c1c374f0ec12 100644 --- a/arch/mn10300/include/asm/thread_info.h +++ b/arch/mn10300/include/asm/thread_info.h @@ -50,7 +50,6 @@ struct thread_info { 0-0xBFFFFFFF for user-thead 0-0xFFFFFFFF for kernel-thread */ - struct restart_block restart_block; __u8 supervisor_stack[0]; }; @@ -80,9 +79,6 @@ struct thread_info { .cpu = 0, \ .preempt_count = INIT_PREEMPT_COUNT, \ .addr_limit = KERNEL_DS, \ - .restart_block = { \ - .fn = do_no_restart_syscall, \ - }, \ } #define init_thread_info (init_thread_union.thread_info) diff --git a/arch/mn10300/kernel/asm-offsets.c b/arch/mn10300/kernel/asm-offsets.c index 47b3bb0c04ff..d780670cbaf3 100644 --- a/arch/mn10300/kernel/asm-offsets.c +++ b/arch/mn10300/kernel/asm-offsets.c @@ -28,7 +28,6 @@ void foo(void) OFFSET(TI_cpu, thread_info, cpu); OFFSET(TI_preempt_count, thread_info, preempt_count); OFFSET(TI_addr_limit, thread_info, addr_limit); - OFFSET(TI_restart_block, thread_info, restart_block); BLANK(); OFFSET(REG_D0, pt_regs, d0); diff --git a/arch/mn10300/kernel/signal.c b/arch/mn10300/kernel/signal.c index a6c0858592c3..8609845f12c5 100644 --- a/arch/mn10300/kernel/signal.c +++ b/arch/mn10300/kernel/signal.c @@ -40,7 +40,7 @@ static int restore_sigcontext(struct pt_regs *regs, unsigned int err = 0; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; if (is_using_fpu(current)) fpu_kill_state(current); diff --git a/arch/openrisc/include/asm/thread_info.h b/arch/openrisc/include/asm/thread_info.h index d797acc901e4..875f0845a707 100644 --- a/arch/openrisc/include/asm/thread_info.h +++ b/arch/openrisc/include/asm/thread_info.h @@ -57,7 +57,6 @@ struct 
thread_info { 0-0x7FFFFFFF for user-thead 0-0xFFFFFFFF for kernel-thread */ - struct restart_block restart_block; __u8 supervisor_stack[0]; /* saved context data */ @@ -79,9 +78,6 @@ struct thread_info { .cpu = 0, \ .preempt_count = 1, \ .addr_limit = KERNEL_DS, \ - .restart_block = { \ - .fn = do_no_restart_syscall, \ - }, \ .ksp = 0, \ } diff --git a/arch/openrisc/kernel/signal.c b/arch/openrisc/kernel/signal.c index 7d1b8235bf90..4112175bf803 100644 --- a/arch/openrisc/kernel/signal.c +++ b/arch/openrisc/kernel/signal.c @@ -46,7 +46,7 @@ static int restore_sigcontext(struct pt_regs *regs, int err = 0; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; /* * Restore the regs from &sc->regs. diff --git a/arch/parisc/include/asm/thread_info.h b/arch/parisc/include/asm/thread_info.h index a84611835549..fb13e3865563 100644 --- a/arch/parisc/include/asm/thread_info.h +++ b/arch/parisc/include/asm/thread_info.h @@ -14,7 +14,6 @@ struct thread_info { mm_segment_t addr_limit; /* user-level address space limit */ __u32 cpu; /* current CPU */ int preempt_count; /* 0=premptable, <0=BUG; will also serve as bh-counter */ - struct restart_block restart_block; }; #define INIT_THREAD_INFO(tsk) \ @@ -25,9 +24,6 @@ struct thread_info { .cpu = 0, \ .addr_limit = KERNEL_DS, \ .preempt_count = INIT_PREEMPT_COUNT, \ - .restart_block = { \ - .fn = do_no_restart_syscall \ - } \ } #define init_thread_info (init_thread_union.thread_info) diff --git a/arch/parisc/kernel/signal.c b/arch/parisc/kernel/signal.c index 012d4fa63d97..9b910a0251b8 100644 --- a/arch/parisc/kernel/signal.c +++ b/arch/parisc/kernel/signal.c @@ -99,7 +99,7 @@ sys_rt_sigreturn(struct pt_regs *regs, int in_syscall) sigframe_size = PARISC_RT_SIGFRAME_SIZE32; #endif - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; /* Unwind the user stack to get the rt_sigframe structure. 
*/ frame = (struct rt_sigframe __user *) diff --git a/arch/powerpc/include/asm/thread_info.h b/arch/powerpc/include/asm/thread_info.h index 0be6c681cab1..e74d9390bdc2 100644 --- a/arch/powerpc/include/asm/thread_info.h +++ b/arch/powerpc/include/asm/thread_info.h @@ -43,7 +43,6 @@ struct thread_info { int cpu; /* cpu we're on */ int preempt_count; /* 0 => preemptable, <0 => BUG */ - struct restart_block restart_block; unsigned long local_flags; /* private flags for thread */ /* low level flags - has atomic operations done on it */ @@ -59,9 +58,6 @@ struct thread_info { .exec_domain = &default_exec_domain, \ .cpu = 0, \ .preempt_count = INIT_PREEMPT_COUNT, \ - .restart_block = { \ - .fn = do_no_restart_syscall, \ - }, \ .flags = 0, \ } diff --git a/arch/powerpc/kernel/signal_32.c b/arch/powerpc/kernel/signal_32.c index b171001698ff..d3a831ac0f92 100644 --- a/arch/powerpc/kernel/signal_32.c +++ b/arch/powerpc/kernel/signal_32.c @@ -1231,7 +1231,7 @@ long sys_rt_sigreturn(int r3, int r4, int r5, int r6, int r7, int r8, int tm_restore = 0; #endif /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; rt_sf = (struct rt_sigframe __user *) (regs->gpr[1] + __SIGNAL_FRAMESIZE + 16); @@ -1504,7 +1504,7 @@ long sys_sigreturn(int r3, int r4, int r5, int r6, int r7, int r8, #endif /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; sf = (struct sigframe __user *)(regs->gpr[1] + __SIGNAL_FRAMESIZE); sc = &sf->sctx; diff --git a/arch/powerpc/kernel/signal_64.c b/arch/powerpc/kernel/signal_64.c index 2cb0c94cafa5..c7c24d2e2bdb 100644 --- a/arch/powerpc/kernel/signal_64.c +++ b/arch/powerpc/kernel/signal_64.c @@ -666,7 +666,7 @@ int sys_rt_sigreturn(unsigned long r3, unsigned long r4, unsigned long r5, #endif /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; if (!access_ok(VERIFY_READ, uc, sizeof(*uc))) goto badframe; diff --git a/arch/s390/include/asm/thread_info.h b/arch/s390/include/asm/thread_info.h index 4d62fd5b56e5..ef1df718642d 100644 --- a/arch/s390/include/asm/thread_info.h +++ b/arch/s390/include/asm/thread_info.h @@ -39,7 +39,6 @@ struct thread_info { unsigned long sys_call_table; /* System call table address */ unsigned int cpu; /* current CPU */ int preempt_count; /* 0 => preemptable, <0 => BUG */ - struct restart_block restart_block; unsigned int system_call; __u64 user_timer; __u64 system_timer; @@ -56,9 +55,6 @@ struct thread_info { .flags = 0, \ .cpu = 0, \ .preempt_count = INIT_PREEMPT_COUNT, \ - .restart_block = { \ - .fn = do_no_restart_syscall, \ - }, \ } #define init_thread_info (init_thread_union.thread_info) diff --git a/arch/s390/kernel/compat_signal.c b/arch/s390/kernel/compat_signal.c index 34d5fa7b01b5..bc1df12dd4f8 100644 --- a/arch/s390/kernel/compat_signal.c +++ b/arch/s390/kernel/compat_signal.c @@ -209,7 +209,7 @@ static int restore_sigregs32(struct pt_regs *regs,_sigregs32 __user *sregs) int i; /* Alwys make any pending restarted system call return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; if (__copy_from_user(&user_sregs, &sregs->regs, sizeof(user_sregs))) return -EFAULT; diff --git 
a/arch/s390/kernel/signal.c b/arch/s390/kernel/signal.c index 6a2ac257d98f..b3ae6f70c6d6 100644 --- a/arch/s390/kernel/signal.c +++ b/arch/s390/kernel/signal.c @@ -162,7 +162,7 @@ static int restore_sigregs(struct pt_regs *regs, _sigregs __user *sregs) _sigregs user_sregs; /* Alwys make any pending restarted system call return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; if (__copy_from_user(&user_sregs, sregs, sizeof(user_sregs))) return -EFAULT; diff --git a/arch/score/include/asm/thread_info.h b/arch/score/include/asm/thread_info.h index 656b7ada9326..33864fa2a8d4 100644 --- a/arch/score/include/asm/thread_info.h +++ b/arch/score/include/asm/thread_info.h @@ -42,7 +42,6 @@ struct thread_info { * 0-0xFFFFFFFF for kernel-thread */ mm_segment_t addr_limit; - struct restart_block restart_block; struct pt_regs *regs; }; @@ -58,9 +57,6 @@ struct thread_info { .cpu = 0, \ .preempt_count = 1, \ .addr_limit = KERNEL_DS, \ - .restart_block = { \ - .fn = do_no_restart_syscall, \ - }, \ } #define init_thread_info (init_thread_union.thread_info) diff --git a/arch/score/kernel/asm-offsets.c b/arch/score/kernel/asm-offsets.c index 57788f44c6fb..b4d5214a7a7e 100644 --- a/arch/score/kernel/asm-offsets.c +++ b/arch/score/kernel/asm-offsets.c @@ -106,7 +106,6 @@ void output_thread_info_defines(void) OFFSET(TI_CPU, thread_info, cpu); OFFSET(TI_PRE_COUNT, thread_info, preempt_count); OFFSET(TI_ADDR_LIMIT, thread_info, addr_limit); - OFFSET(TI_RESTART_BLOCK, thread_info, restart_block); OFFSET(TI_REGS, thread_info, regs); DEFINE(KERNEL_STACK_SIZE, THREAD_SIZE); DEFINE(KERNEL_STACK_MASK, THREAD_MASK); diff --git a/arch/score/kernel/signal.c b/arch/score/kernel/signal.c index 1651807774ad..e381c8c4ff65 100644 --- a/arch/score/kernel/signal.c +++ b/arch/score/kernel/signal.c @@ -141,7 +141,7 @@ score_rt_sigreturn(struct pt_regs *regs) int sig; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; frame = (struct rt_sigframe __user *) regs->regs[0]; if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) diff --git a/arch/sh/include/asm/thread_info.h b/arch/sh/include/asm/thread_info.h index ad27ffa65e2e..657c03919627 100644 --- a/arch/sh/include/asm/thread_info.h +++ b/arch/sh/include/asm/thread_info.h @@ -33,7 +33,6 @@ struct thread_info { __u32 cpu; int preempt_count; /* 0 => preemptable, <0 => BUG */ mm_segment_t addr_limit; /* thread address space */ - struct restart_block restart_block; unsigned long previous_sp; /* sp of previous stack in case of nested IRQ stacks */ __u8 supervisor_stack[0]; @@ -63,9 +62,6 @@ struct thread_info { .cpu = 0, \ .preempt_count = INIT_PREEMPT_COUNT, \ .addr_limit = KERNEL_DS, \ - .restart_block = { \ - .fn = do_no_restart_syscall, \ - }, \ } #define init_thread_info (init_thread_union.thread_info) diff --git a/arch/sh/kernel/asm-offsets.c b/arch/sh/kernel/asm-offsets.c index 08a2be775b6c..542225fedb11 100644 --- a/arch/sh/kernel/asm-offsets.c +++ b/arch/sh/kernel/asm-offsets.c @@ -25,7 +25,6 @@ int main(void) DEFINE(TI_FLAGS, offsetof(struct thread_info, flags)); DEFINE(TI_CPU, offsetof(struct thread_info, cpu)); DEFINE(TI_PRE_COUNT, offsetof(struct thread_info, preempt_count)); - DEFINE(TI_RESTART_BLOCK,offsetof(struct thread_info, restart_block)); DEFINE(TI_SIZE, sizeof(struct thread_info)); #ifdef CONFIG_HIBERNATION diff --git a/arch/sh/kernel/signal_32.c 
b/arch/sh/kernel/signal_32.c index 2f002b24fb92..0b34f2a704fe 100644 --- a/arch/sh/kernel/signal_32.c +++ b/arch/sh/kernel/signal_32.c @@ -156,7 +156,7 @@ asmlinkage int sys_sigreturn(void) int r0; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) goto badframe; @@ -186,7 +186,7 @@ asmlinkage int sys_rt_sigreturn(void) int r0; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) goto badframe; diff --git a/arch/sh/kernel/signal_64.c b/arch/sh/kernel/signal_64.c index 897abe7b871e..71993c6a7d94 100644 --- a/arch/sh/kernel/signal_64.c +++ b/arch/sh/kernel/signal_64.c @@ -260,7 +260,7 @@ asmlinkage int sys_sigreturn(unsigned long r2, unsigned long r3, long long ret; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) goto badframe; @@ -294,7 +294,7 @@ asmlinkage int sys_rt_sigreturn(unsigned long r2, unsigned long r3, long long ret; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) goto badframe; diff --git a/arch/sparc/include/asm/thread_info_32.h b/arch/sparc/include/asm/thread_info_32.h index 025c98446b1e..fd7bd0a440ca 100644 --- a/arch/sparc/include/asm/thread_info_32.h +++ b/arch/sparc/include/asm/thread_info_32.h @@ -47,8 +47,6 @@ struct thread_info { struct reg_window32 reg_window[NSWINS]; /* align for ldd! 
*/ unsigned long rwbuf_stkptrs[NSWINS]; unsigned long w_saved; - - struct restart_block restart_block; }; /* @@ -62,9 +60,6 @@ struct thread_info { .flags = 0, \ .cpu = 0, \ .preempt_count = INIT_PREEMPT_COUNT, \ - .restart_block = { \ - .fn = do_no_restart_syscall, \ - }, \ } #define init_thread_info (init_thread_union.thread_info) @@ -103,7 +98,6 @@ register struct thread_info *current_thread_info_reg asm("g6"); #define TI_REG_WINDOW 0x30 #define TI_RWIN_SPTRS 0x230 #define TI_W_SAVED 0x250 -/* #define TI_RESTART_BLOCK 0x25n */ /* Nobody cares */ /* * thread information flag bit numbers diff --git a/arch/sparc/include/asm/thread_info_64.h b/arch/sparc/include/asm/thread_info_64.h index 798f0279a4b5..ff455164732a 100644 --- a/arch/sparc/include/asm/thread_info_64.h +++ b/arch/sparc/include/asm/thread_info_64.h @@ -58,8 +58,6 @@ struct thread_info { unsigned long gsr[7]; unsigned long xfsr[7]; - struct restart_block restart_block; - struct pt_regs *kern_una_regs; unsigned int kern_una_insn; @@ -92,10 +90,9 @@ struct thread_info { #define TI_RWIN_SPTRS 0x000003c8 #define TI_GSR 0x00000400 #define TI_XFSR 0x00000438 -#define TI_RESTART_BLOCK 0x00000470 -#define TI_KUNA_REGS 0x000004a0 -#define TI_KUNA_INSN 0x000004a8 -#define TI_FPREGS 0x000004c0 +#define TI_KUNA_REGS 0x00000470 +#define TI_KUNA_INSN 0x00000478 +#define TI_FPREGS 0x00000480 /* We embed this in the uppermost byte of thread_info->flags */ #define FAULT_CODE_WRITE 0x01 /* Write access, implies D-TLB */ @@ -124,9 +121,6 @@ struct thread_info { .current_ds = ASI_P, \ .exec_domain = &default_exec_domain, \ .preempt_count = INIT_PREEMPT_COUNT, \ - .restart_block = { \ - .fn = do_no_restart_syscall, \ - }, \ } #define init_thread_info (init_thread_union.thread_info) diff --git a/arch/sparc/kernel/signal32.c b/arch/sparc/kernel/signal32.c index 62deba7be1a9..4eed773a7735 100644 --- a/arch/sparc/kernel/signal32.c +++ b/arch/sparc/kernel/signal32.c @@ -150,7 +150,7 @@ void do_sigreturn32(struct pt_regs *regs) int err, i; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; synchronize_user_stack(); @@ -235,7 +235,7 @@ asmlinkage void do_rt_sigreturn32(struct pt_regs *regs) int err, i; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; synchronize_user_stack(); regs->u_regs[UREG_FP] &= 0x00000000ffffffffUL; diff --git a/arch/sparc/kernel/signal_32.c b/arch/sparc/kernel/signal_32.c index 9ee72fc8e0e4..52aa5e4ce5e7 100644 --- a/arch/sparc/kernel/signal_32.c +++ b/arch/sparc/kernel/signal_32.c @@ -70,7 +70,7 @@ asmlinkage void do_sigreturn(struct pt_regs *regs) int err; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; synchronize_user_stack(); diff --git a/arch/sparc/kernel/signal_64.c b/arch/sparc/kernel/signal_64.c index 1a6999868031..d88beff47bab 100644 --- a/arch/sparc/kernel/signal_64.c +++ b/arch/sparc/kernel/signal_64.c @@ -254,7 +254,7 @@ void do_rt_sigreturn(struct pt_regs *regs) int err; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; synchronize_user_stack (); sf = (struct rt_signal_frame 
__user *) diff --git a/arch/sparc/kernel/traps_64.c b/arch/sparc/kernel/traps_64.c index 981a769b9558..a27651e866e7 100644 --- a/arch/sparc/kernel/traps_64.c +++ b/arch/sparc/kernel/traps_64.c @@ -2730,8 +2730,6 @@ void __init trap_init(void) TI_NEW_CHILD != offsetof(struct thread_info, new_child) || TI_CURRENT_DS != offsetof(struct thread_info, current_ds) || - TI_RESTART_BLOCK != offsetof(struct thread_info, - restart_block) || TI_KUNA_REGS != offsetof(struct thread_info, kern_una_regs) || TI_KUNA_INSN != offsetof(struct thread_info, diff --git a/arch/tile/include/asm/thread_info.h b/arch/tile/include/asm/thread_info.h index 48e4fd0f38e4..96c14c1430d8 100644 --- a/arch/tile/include/asm/thread_info.h +++ b/arch/tile/include/asm/thread_info.h @@ -36,7 +36,6 @@ struct thread_info { mm_segment_t addr_limit; /* thread address space (KERNEL_DS or USER_DS) */ - struct restart_block restart_block; struct single_step_state *step_state; /* single step state (if non-zero) */ int align_ctl; /* controls unaligned access */ @@ -57,9 +56,6 @@ struct thread_info { .cpu = 0, \ .preempt_count = INIT_PREEMPT_COUNT, \ .addr_limit = KERNEL_DS, \ - .restart_block = { \ - .fn = do_no_restart_syscall, \ - }, \ .step_state = NULL, \ .align_ctl = 0, \ } diff --git a/arch/tile/kernel/signal.c b/arch/tile/kernel/signal.c index bb0a9ce7ae23..8a524e332c1a 100644 --- a/arch/tile/kernel/signal.c +++ b/arch/tile/kernel/signal.c @@ -48,7 +48,7 @@ int restore_sigcontext(struct pt_regs *regs, int err; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; /* * Enforce that sigcontext is like pt_regs, and doesn't mess diff --git a/arch/um/include/asm/thread_info.h b/arch/um/include/asm/thread_info.h index 1c5b2a83046a..e04114c4fcd9 100644 --- a/arch/um/include/asm/thread_info.h +++ b/arch/um/include/asm/thread_info.h @@ -22,7 +22,6 @@ struct thread_info { mm_segment_t addr_limit; /* thread address space: 0-0xBFFFFFFF for user 0-0xFFFFFFFF for kernel */ - struct restart_block restart_block; struct thread_info *real_thread; /* Points to non-IRQ stack */ }; @@ -34,9 +33,6 @@ struct thread_info { .cpu = 0, \ .preempt_count = INIT_PREEMPT_COUNT, \ .addr_limit = KERNEL_DS, \ - .restart_block = { \ - .fn = do_no_restart_syscall, \ - }, \ .real_thread = NULL, \ } diff --git a/arch/unicore32/include/asm/thread_info.h b/arch/unicore32/include/asm/thread_info.h index af36d8eabdf1..63e2839dfeb8 100644 --- a/arch/unicore32/include/asm/thread_info.h +++ b/arch/unicore32/include/asm/thread_info.h @@ -79,7 +79,6 @@ struct thread_info { #ifdef CONFIG_UNICORE_FPU_F64 struct fp_state fpstate __attribute__((aligned(8))); #endif - struct restart_block restart_block; }; #define INIT_THREAD_INFO(tsk) \ @@ -89,9 +88,6 @@ struct thread_info { .flags = 0, \ .preempt_count = INIT_PREEMPT_COUNT, \ .addr_limit = KERNEL_DS, \ - .restart_block = { \ - .fn = do_no_restart_syscall, \ - }, \ } #define init_thread_info (init_thread_union.thread_info) diff --git a/arch/unicore32/kernel/signal.c b/arch/unicore32/kernel/signal.c index 7c8fb7018dc6..d329f85766cc 100644 --- a/arch/unicore32/kernel/signal.c +++ b/arch/unicore32/kernel/signal.c @@ -105,7 +105,7 @@ asmlinkage int __sys_rt_sigreturn(struct pt_regs *regs) struct rt_sigframe __user *frame; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; 
/* * Since we stacked the signal on a 64-bit boundary, diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index f9e181aaba97..d0165c9a2932 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -169,7 +169,7 @@ static int ia32_restore_sigcontext(struct pt_regs *regs, u32 tmp; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; get_user_try { /* diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 547e344a6dc6..8550f2427d58 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -31,7 +31,6 @@ struct thread_info { __u32 cpu; /* current CPU */ int saved_preempt_count; mm_segment_t addr_limit; - struct restart_block restart_block; void __user *sysenter_return; unsigned int sig_on_uaccess_error:1; unsigned int uaccess_err:1; /* uaccess failed */ @@ -45,9 +44,6 @@ struct thread_info { .cpu = 0, \ .saved_preempt_count = INIT_PREEMPT_COUNT, \ .addr_limit = KERNEL_DS, \ - .restart_block = { \ - .fn = do_no_restart_syscall, \ - }, \ } #define init_thread_info (init_thread_union.thread_info) diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index ed37a768d0fc..0a62df4abcf7 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -69,7 +69,7 @@ int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, unsigned int err = 0; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; get_user_try { diff --git a/arch/x86/um/signal.c b/arch/x86/um/signal.c index 79d824551c1a..0c8c32bfd792 100644 --- a/arch/x86/um/signal.c +++ b/arch/x86/um/signal.c @@ -157,7 +157,7 @@ static int copy_sc_from_user(struct pt_regs *regs, int err, pid; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; err = copy_from_user(&sc, from, sizeof(sc)); if (err) diff --git a/arch/xtensa/include/asm/thread_info.h b/arch/xtensa/include/asm/thread_info.h index 470153e8547c..a9b5d3ba196c 100644 --- a/arch/xtensa/include/asm/thread_info.h +++ b/arch/xtensa/include/asm/thread_info.h @@ -51,7 +51,6 @@ struct thread_info { __s32 preempt_count; /* 0 => preemptable,< 0 => BUG*/ mm_segment_t addr_limit; /* thread address space */ - struct restart_block restart_block; unsigned long cpenable; @@ -72,7 +71,6 @@ struct thread_info { #define TI_CPU 0x00000010 #define TI_PRE_COUNT 0x00000014 #define TI_ADDR_LIMIT 0x00000018 -#define TI_RESTART_BLOCK 0x000001C #endif @@ -90,9 +88,6 @@ struct thread_info { .cpu = 0, \ .preempt_count = INIT_PREEMPT_COUNT, \ .addr_limit = KERNEL_DS, \ - .restart_block = { \ - .fn = do_no_restart_syscall, \ - }, \ } #define init_thread_info (init_thread_union.thread_info) diff --git a/arch/xtensa/kernel/signal.c b/arch/xtensa/kernel/signal.c index 4612321c73cc..3d733ba16f28 100644 --- a/arch/xtensa/kernel/signal.c +++ b/arch/xtensa/kernel/signal.c @@ -245,7 +245,7 @@ asmlinkage long xtensa_rt_sigreturn(long a0, long a1, long a2, long a3, int ret; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; if (regs->depc > 64) 
panic("rt_sigreturn in double exception!\n"); diff --git a/fs/select.c b/fs/select.c index 467bb1cb3ea5..f684c750e08a 100644 --- a/fs/select.c +++ b/fs/select.c @@ -971,7 +971,7 @@ SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds, if (ret == -EINTR) { struct restart_block *restart_block; - restart_block = ¤t_thread_info()->restart_block; + restart_block = ¤t->restart_block; restart_block->fn = do_restart_poll; restart_block->poll.ufds = ufds; restart_block->poll.nfds = nfds; diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 3037fc085e8e..d3d43ecf148c 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -193,6 +193,9 @@ extern struct task_group root_task_group; .nr_cpus_allowed= NR_CPUS, \ .mm = NULL, \ .active_mm = &init_mm, \ + .restart_block = { \ + .fn = do_no_restart_syscall, \ + }, \ .se = { \ .group_node = LIST_HEAD_INIT(tsk.se.group_node), \ }, \ diff --git a/include/linux/sched.h b/include/linux/sched.h index 8db31ef98d2f..22ee0d5d7f8c 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1370,6 +1370,8 @@ struct task_struct { unsigned long atomic_flags; /* Flags needing atomic access. */ + struct restart_block restart_block; + pid_t pid; pid_t tgid; diff --git a/kernel/compat.c b/kernel/compat.c index ebb3c369d03d..24f00610c575 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -276,8 +276,7 @@ COMPAT_SYSCALL_DEFINE2(nanosleep, struct compat_timespec __user *, rqtp, * core implementation decides to return random nonsense. */ if (ret == -ERESTART_RESTARTBLOCK) { - struct restart_block *restart - = ¤t_thread_info()->restart_block; + struct restart_block *restart = ¤t->restart_block; restart->fn = compat_nanosleep_restart; restart->nanosleep.compat_rmtp = rmtp; @@ -860,7 +859,7 @@ COMPAT_SYSCALL_DEFINE4(clock_nanosleep, clockid_t, which_clock, int, flags, return -EFAULT; if (err == -ERESTART_RESTARTBLOCK) { - restart = ¤t_thread_info()->restart_block; + restart = ¤t->restart_block; restart->fn = compat_clock_nanosleep_restart; restart->nanosleep.compat_rmtp = rmtp; } diff --git a/kernel/futex.c b/kernel/futex.c index 63678b573d61..f4d8a85641ed 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -2217,7 +2217,7 @@ retry: if (!abs_time) goto out; - restart = ¤t_thread_info()->restart_block; + restart = ¤t->restart_block; restart->fn = futex_wait_restart; restart->futex.uaddr = uaddr; restart->futex.val = val; diff --git a/kernel/signal.c b/kernel/signal.c index 16a305295256..33a52759cc0e 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2501,7 +2501,7 @@ EXPORT_SYMBOL(unblock_all_signals); */ SYSCALL_DEFINE0(restart_syscall) { - struct restart_block *restart = ¤t_thread_info()->restart_block; + struct restart_block *restart = ¤t->restart_block; return restart->fn(restart); } diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index a7077d3ae52f..1b001ed1edb9 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -788,7 +788,7 @@ static int alarm_timer_nsleep(const clockid_t which_clock, int flags, goto out; } - restart = ¤t_thread_info()->restart_block; + restart = ¤t->restart_block; restart->fn = alarm_timer_nsleep_restart; restart->nanosleep.clockid = type; restart->nanosleep.expires = exp.tv64; diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 37e50aadd471..dbcec65d08c4 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1591,7 +1591,7 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp, goto out; } - restart = 
¤t_thread_info()->restart_block; + restart = ¤t->restart_block; restart->fn = hrtimer_nanosleep_restart; restart->nanosleep.clockid = t.timer.base->clockid; restart->nanosleep.rmtp = rmtp; diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index a16b67859e2a..0075da74abf0 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -1334,8 +1334,7 @@ static long posix_cpu_nsleep_restart(struct restart_block *restart_block); static int posix_cpu_nsleep(const clockid_t which_clock, int flags, struct timespec *rqtp, struct timespec __user *rmtp) { - struct restart_block *restart_block = - ¤t_thread_info()->restart_block; + struct restart_block *restart_block = ¤t->restart_block; struct itimerspec it; int error; -- cgit v1.2.3 From 1c644aeeeca58ceb281404e9af6bb6f8b15a3dcb Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Fri, 30 Jan 2015 13:12:09 +1100 Subject: linux/types.h: Always use unsigned long for pgoff_t Everybody uses unsigned long for pgoff_t, and no one ever overrode the definition of pgoff_t. Keep it that way, and remove the option of overriding it. Signed-off-by: Geert Uytterhoeven Cc: Randy Dunlap Cc: Arnd Bergmann Signed-off-by: Andrew Morton --- include/linux/types.h | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/types.h b/include/linux/types.h index a0bb7048687f..c26aa84be9d3 100644 --- a/include/linux/types.h +++ b/include/linux/types.h @@ -135,12 +135,9 @@ typedef unsigned long blkcnt_t; #endif /* - * The type of an index into the pagecache. Use a #define so asm/types.h - * can override it. + * The type of an index into the pagecache. */ -#ifndef pgoff_t #define pgoff_t unsigned long -#endif /* A dma_addr_t can hold any valid DMA or bus address for the platform */ #ifdef CONFIG_ARCH_DMA_ADDR_T_64BIT -- cgit v1.2.3 From 5bf1f21b777eaf27b072a335e13e560bceb1d187 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Fri, 30 Jan 2015 13:12:11 +1100 Subject: libstring_helpers.c:string_get_size(): return void string_get_size() was documented to return an error, but in fact always returned 0. Since the output always fits in 9 bytes, just document that and let callers do what they do now: pass a small stack buffer and ignore the return value. Signed-off-by: Rasmus Villemoes Signed-off-by: Andrew Morton --- include/linux/string_helpers.h | 4 ++-- lib/string_helpers.c | 10 ++++------ 2 files changed, 6 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/string_helpers.h b/include/linux/string_helpers.h index 6eb567ac56bc..657571817260 100644 --- a/include/linux/string_helpers.h +++ b/include/linux/string_helpers.h @@ -10,8 +10,8 @@ enum string_size_units { STRING_UNITS_2, /* use binary powers of 2^10 */ }; -int string_get_size(u64 size, enum string_size_units units, - char *buf, int len); +void string_get_size(u64 size, enum string_size_units units, + char *buf, int len); #define UNESCAPE_SPACE 0x01 #define UNESCAPE_OCTAL 0x02 diff --git a/lib/string_helpers.c b/lib/string_helpers.c index 2b3757f84b3b..8f8c4417f228 100644 --- a/lib/string_helpers.c +++ b/lib/string_helpers.c @@ -20,12 +20,12 @@ * @len: length of buffer * * This function returns a string formatted to 3 significant figures - * giving the size in the required units. Returns 0 on success or - * error on failure. @buf is always zero terminated. + * giving the size in the required units. @buf should have room for + * at least 9 bytes and will always be zero terminated. 
* */ -int string_get_size(u64 size, const enum string_size_units units, - char *buf, int len) +void string_get_size(u64 size, const enum string_size_units units, + char *buf, int len) { static const char *const units_10[] = { "B", "kB", "MB", "GB", "TB", "PB", "EB" @@ -67,8 +67,6 @@ int string_get_size(u64 size, const enum string_size_units units, snprintf(buf, len, "%u%s %s", (u32)size, tmp, units_str[units][i]); - - return 0; } EXPORT_SYMBOL(string_get_size); -- cgit v1.2.3 From 214c5b87d29738e15b17b4dd6f35106028a9ae59 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Fri, 30 Jan 2015 13:12:11 +1100 Subject: lib/bitmap.c: more signed->unsigned conversions For consistency with the other bitmap_* functions, also make the nbits parameter of bitmap_zero, bitmap_fill and bitmap_copy unsigned. Signed-off-by: Rasmus Villemoes Signed-off-by: Andrew Morton --- include/linux/bitmap.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index 202e4034fe26..1406d5453781 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -185,33 +185,33 @@ extern int bitmap_print_to_pagebuf(bool list, char *buf, #define small_const_nbits(nbits) \ (__builtin_constant_p(nbits) && (nbits) <= BITS_PER_LONG) -static inline void bitmap_zero(unsigned long *dst, int nbits) +static inline void bitmap_zero(unsigned long *dst, unsigned int nbits) { if (small_const_nbits(nbits)) *dst = 0UL; else { - int len = BITS_TO_LONGS(nbits) * sizeof(unsigned long); + unsigned int len = BITS_TO_LONGS(nbits) * sizeof(unsigned long); memset(dst, 0, len); } } -static inline void bitmap_fill(unsigned long *dst, int nbits) +static inline void bitmap_fill(unsigned long *dst, unsigned int nbits) { - size_t nlongs = BITS_TO_LONGS(nbits); + unsigned int nlongs = BITS_TO_LONGS(nbits); if (!small_const_nbits(nbits)) { - int len = (nlongs - 1) * sizeof(unsigned long); + unsigned int len = (nlongs - 1) * sizeof(unsigned long); memset(dst, 0xff, len); } dst[nlongs - 1] = BITMAP_LAST_WORD_MASK(nbits); } static inline void bitmap_copy(unsigned long *dst, const unsigned long *src, - int nbits) + unsigned int nbits) { if (small_const_nbits(nbits)) *dst = *src; else { - int len = BITS_TO_LONGS(nbits) * sizeof(unsigned long); + unsigned int len = BITS_TO_LONGS(nbits) * sizeof(unsigned long); memcpy(dst, src, len); } } -- cgit v1.2.3 From c2496b147e0952e69b641f149b032fb422a30d21 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Fri, 30 Jan 2015 13:12:11 +1100 Subject: linux/nodemask.h: update bitmap wrappers to take unsigned int Since the various bitmap_* functions now take an unsigned int as nbits parameter, it makes sense to also update the various wrappers. 
Signed-off-by: Rasmus Villemoes Signed-off-by: Andrew Morton --- include/linux/nodemask.h | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h index 83a6aeda899d..21cef483dc1b 100644 --- a/include/linux/nodemask.h +++ b/include/linux/nodemask.h @@ -120,13 +120,13 @@ static inline void __node_clear(int node, volatile nodemask_t *dstp) } #define nodes_setall(dst) __nodes_setall(&(dst), MAX_NUMNODES) -static inline void __nodes_setall(nodemask_t *dstp, int nbits) +static inline void __nodes_setall(nodemask_t *dstp, unsigned int nbits) { bitmap_fill(dstp->bits, nbits); } #define nodes_clear(dst) __nodes_clear(&(dst), MAX_NUMNODES) -static inline void __nodes_clear(nodemask_t *dstp, int nbits) +static inline void __nodes_clear(nodemask_t *dstp, unsigned int nbits) { bitmap_zero(dstp->bits, nbits); } @@ -144,7 +144,7 @@ static inline int __node_test_and_set(int node, nodemask_t *addr) #define nodes_and(dst, src1, src2) \ __nodes_and(&(dst), &(src1), &(src2), MAX_NUMNODES) static inline void __nodes_and(nodemask_t *dstp, const nodemask_t *src1p, - const nodemask_t *src2p, int nbits) + const nodemask_t *src2p, unsigned int nbits) { bitmap_and(dstp->bits, src1p->bits, src2p->bits, nbits); } @@ -152,7 +152,7 @@ static inline void __nodes_and(nodemask_t *dstp, const nodemask_t *src1p, #define nodes_or(dst, src1, src2) \ __nodes_or(&(dst), &(src1), &(src2), MAX_NUMNODES) static inline void __nodes_or(nodemask_t *dstp, const nodemask_t *src1p, - const nodemask_t *src2p, int nbits) + const nodemask_t *src2p, unsigned int nbits) { bitmap_or(dstp->bits, src1p->bits, src2p->bits, nbits); } @@ -160,7 +160,7 @@ static inline void __nodes_or(nodemask_t *dstp, const nodemask_t *src1p, #define nodes_xor(dst, src1, src2) \ __nodes_xor(&(dst), &(src1), &(src2), MAX_NUMNODES) static inline void __nodes_xor(nodemask_t *dstp, const nodemask_t *src1p, - const nodemask_t *src2p, int nbits) + const nodemask_t *src2p, unsigned int nbits) { bitmap_xor(dstp->bits, src1p->bits, src2p->bits, nbits); } @@ -168,7 +168,7 @@ static inline void __nodes_xor(nodemask_t *dstp, const nodemask_t *src1p, #define nodes_andnot(dst, src1, src2) \ __nodes_andnot(&(dst), &(src1), &(src2), MAX_NUMNODES) static inline void __nodes_andnot(nodemask_t *dstp, const nodemask_t *src1p, - const nodemask_t *src2p, int nbits) + const nodemask_t *src2p, unsigned int nbits) { bitmap_andnot(dstp->bits, src1p->bits, src2p->bits, nbits); } @@ -176,7 +176,7 @@ static inline void __nodes_andnot(nodemask_t *dstp, const nodemask_t *src1p, #define nodes_complement(dst, src) \ __nodes_complement(&(dst), &(src), MAX_NUMNODES) static inline void __nodes_complement(nodemask_t *dstp, - const nodemask_t *srcp, int nbits) + const nodemask_t *srcp, unsigned int nbits) { bitmap_complement(dstp->bits, srcp->bits, nbits); } @@ -184,7 +184,7 @@ static inline void __nodes_complement(nodemask_t *dstp, #define nodes_equal(src1, src2) \ __nodes_equal(&(src1), &(src2), MAX_NUMNODES) static inline int __nodes_equal(const nodemask_t *src1p, - const nodemask_t *src2p, int nbits) + const nodemask_t *src2p, unsigned int nbits) { return bitmap_equal(src1p->bits, src2p->bits, nbits); } @@ -192,7 +192,7 @@ static inline int __nodes_equal(const nodemask_t *src1p, #define nodes_intersects(src1, src2) \ __nodes_intersects(&(src1), &(src2), MAX_NUMNODES) static inline int __nodes_intersects(const nodemask_t *src1p, - const nodemask_t *src2p, int nbits) + const nodemask_t 
*src2p, unsigned int nbits) { return bitmap_intersects(src1p->bits, src2p->bits, nbits); } @@ -200,25 +200,25 @@ static inline int __nodes_intersects(const nodemask_t *src1p, #define nodes_subset(src1, src2) \ __nodes_subset(&(src1), &(src2), MAX_NUMNODES) static inline int __nodes_subset(const nodemask_t *src1p, - const nodemask_t *src2p, int nbits) + const nodemask_t *src2p, unsigned int nbits) { return bitmap_subset(src1p->bits, src2p->bits, nbits); } #define nodes_empty(src) __nodes_empty(&(src), MAX_NUMNODES) -static inline int __nodes_empty(const nodemask_t *srcp, int nbits) +static inline int __nodes_empty(const nodemask_t *srcp, unsigned int nbits) { return bitmap_empty(srcp->bits, nbits); } #define nodes_full(nodemask) __nodes_full(&(nodemask), MAX_NUMNODES) -static inline int __nodes_full(const nodemask_t *srcp, int nbits) +static inline int __nodes_full(const nodemask_t *srcp, unsigned int nbits) { return bitmap_full(srcp->bits, nbits); } #define nodes_weight(nodemask) __nodes_weight(&(nodemask), MAX_NUMNODES) -static inline int __nodes_weight(const nodemask_t *srcp, int nbits) +static inline int __nodes_weight(const nodemask_t *srcp, unsigned int nbits) { return bitmap_weight(srcp->bits, nbits); } -- cgit v1.2.3 From 8d033d18209862c1aec7ea259d39d58d3a0cf15c Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Fri, 30 Jan 2015 13:12:12 +1100 Subject: linux/cpumask.h: update bitmap wrappers to take unsigned int Since the various bitmap_* functions now take an unsigned int as nbits parameter, it makes sense to also update the various wrappers, even though they're marked as obsolete. Signed-off-by: Rasmus Villemoes Signed-off-by: Andrew Morton --- include/linux/cpumask.h | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index b950e9d6008b..ff9044286d88 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -905,13 +905,13 @@ static inline void __cpu_clear(int cpu, volatile cpumask_t *dstp) } #define cpus_setall(dst) __cpus_setall(&(dst), NR_CPUS) -static inline void __cpus_setall(cpumask_t *dstp, int nbits) +static inline void __cpus_setall(cpumask_t *dstp, unsigned int nbits) { bitmap_fill(dstp->bits, nbits); } #define cpus_clear(dst) __cpus_clear(&(dst), NR_CPUS) -static inline void __cpus_clear(cpumask_t *dstp, int nbits) +static inline void __cpus_clear(cpumask_t *dstp, unsigned int nbits) { bitmap_zero(dstp->bits, nbits); } @@ -927,21 +927,21 @@ static inline int __cpu_test_and_set(int cpu, cpumask_t *addr) #define cpus_and(dst, src1, src2) __cpus_and(&(dst), &(src1), &(src2), NR_CPUS) static inline int __cpus_and(cpumask_t *dstp, const cpumask_t *src1p, - const cpumask_t *src2p, int nbits) + const cpumask_t *src2p, unsigned int nbits) { return bitmap_and(dstp->bits, src1p->bits, src2p->bits, nbits); } #define cpus_or(dst, src1, src2) __cpus_or(&(dst), &(src1), &(src2), NR_CPUS) static inline void __cpus_or(cpumask_t *dstp, const cpumask_t *src1p, - const cpumask_t *src2p, int nbits) + const cpumask_t *src2p, unsigned int nbits) { bitmap_or(dstp->bits, src1p->bits, src2p->bits, nbits); } #define cpus_xor(dst, src1, src2) __cpus_xor(&(dst), &(src1), &(src2), NR_CPUS) static inline void __cpus_xor(cpumask_t *dstp, const cpumask_t *src1p, - const cpumask_t *src2p, int nbits) + const cpumask_t *src2p, unsigned int nbits) { bitmap_xor(dstp->bits, src1p->bits, src2p->bits, nbits); } @@ -949,40 +949,40 @@ static inline void __cpus_xor(cpumask_t *dstp, 
const cpumask_t *src1p, #define cpus_andnot(dst, src1, src2) \ __cpus_andnot(&(dst), &(src1), &(src2), NR_CPUS) static inline int __cpus_andnot(cpumask_t *dstp, const cpumask_t *src1p, - const cpumask_t *src2p, int nbits) + const cpumask_t *src2p, unsigned int nbits) { return bitmap_andnot(dstp->bits, src1p->bits, src2p->bits, nbits); } #define cpus_equal(src1, src2) __cpus_equal(&(src1), &(src2), NR_CPUS) static inline int __cpus_equal(const cpumask_t *src1p, - const cpumask_t *src2p, int nbits) + const cpumask_t *src2p, unsigned int nbits) { return bitmap_equal(src1p->bits, src2p->bits, nbits); } #define cpus_intersects(src1, src2) __cpus_intersects(&(src1), &(src2), NR_CPUS) static inline int __cpus_intersects(const cpumask_t *src1p, - const cpumask_t *src2p, int nbits) + const cpumask_t *src2p, unsigned int nbits) { return bitmap_intersects(src1p->bits, src2p->bits, nbits); } #define cpus_subset(src1, src2) __cpus_subset(&(src1), &(src2), NR_CPUS) static inline int __cpus_subset(const cpumask_t *src1p, - const cpumask_t *src2p, int nbits) + const cpumask_t *src2p, unsigned int nbits) { return bitmap_subset(src1p->bits, src2p->bits, nbits); } #define cpus_empty(src) __cpus_empty(&(src), NR_CPUS) -static inline int __cpus_empty(const cpumask_t *srcp, int nbits) +static inline int __cpus_empty(const cpumask_t *srcp, unsigned int nbits) { return bitmap_empty(srcp->bits, nbits); } #define cpus_weight(cpumask) __cpus_weight(&(cpumask), NR_CPUS) -static inline int __cpus_weight(const cpumask_t *srcp, int nbits) +static inline int __cpus_weight(const cpumask_t *srcp, unsigned int nbits) { return bitmap_weight(srcp->bits, nbits); } -- cgit v1.2.3 From 0dbd610ed26bb44b1ce81b1d42265699171bba2e Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Fri, 30 Jan 2015 13:12:12 +1100 Subject: lib/bitmap.c: update bitmap_onto to unsigned Change the nbits parameter of bitmap_onto to unsigned int for consistency with other bitmap_* functions. Signed-off-by: Rasmus Villemoes Signed-off-by: Andrew Morton --- include/linux/bitmap.h | 2 +- lib/bitmap.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index 1406d5453781..d0c6214eb190 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -164,7 +164,7 @@ extern void bitmap_remap(unsigned long *dst, const unsigned long *src, extern int bitmap_bitremap(int oldbit, const unsigned long *old, const unsigned long *new, int bits); extern void bitmap_onto(unsigned long *dst, const unsigned long *orig, - const unsigned long *relmap, int bits); + const unsigned long *relmap, unsigned int bits); extern void bitmap_fold(unsigned long *dst, const unsigned long *orig, int sz, int bits); extern int bitmap_find_free_region(unsigned long *bitmap, unsigned int bits, int order); diff --git a/lib/bitmap.c b/lib/bitmap.c index 324ea9eab8c1..d8467980875d 100644 --- a/lib/bitmap.c +++ b/lib/bitmap.c @@ -1006,9 +1006,9 @@ EXPORT_SYMBOL(bitmap_bitremap); * All bits in @dst not set by the above rule are cleared. 
*/ void bitmap_onto(unsigned long *dst, const unsigned long *orig, - const unsigned long *relmap, int bits) + const unsigned long *relmap, unsigned int bits) { - int n, m; /* same meaning as in above comment */ + unsigned int n, m; /* same meaning as in above comment */ if (dst == orig) /* following doesn't handle inplace mappings */ return; -- cgit v1.2.3 From e235af7324eab19de885a5c4b8efa6c00a2d6487 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Fri, 30 Jan 2015 13:12:12 +1100 Subject: lib/bitmap.c: change parameters of bitmap_fold to unsigned Change the sz and nbits parameters of bitmap_fold to unsigned int for consistency with other bitmap_* functions, and to save another few bytes in the generated code. Signed-off-by: Rasmus Villemoes Signed-off-by: Andrew Morton --- include/linux/bitmap.h | 2 +- lib/bitmap.c | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index d0c6214eb190..95dcd2f76e1a 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -166,7 +166,7 @@ extern int bitmap_bitremap(int oldbit, extern void bitmap_onto(unsigned long *dst, const unsigned long *orig, const unsigned long *relmap, unsigned int bits); extern void bitmap_fold(unsigned long *dst, const unsigned long *orig, - int sz, int bits); + unsigned int sz, unsigned int nbits); extern int bitmap_find_free_region(unsigned long *bitmap, unsigned int bits, int order); extern void bitmap_release_region(unsigned long *bitmap, unsigned int pos, int order); extern int bitmap_allocate_region(unsigned long *bitmap, unsigned int pos, int order); diff --git a/lib/bitmap.c b/lib/bitmap.c index ee598a496895..5ffb8db78ae1 100644 --- a/lib/bitmap.c +++ b/lib/bitmap.c @@ -1046,15 +1046,15 @@ EXPORT_SYMBOL(bitmap_onto); * Example [2] for bitmap_onto() for why and how to use this. */ void bitmap_fold(unsigned long *dst, const unsigned long *orig, - int sz, int bits) + unsigned int sz, unsigned int nbits) { - int oldbit; + unsigned int oldbit; if (dst == orig) /* following doesn't handle inplace mappings */ return; - bitmap_zero(dst, bits); + bitmap_zero(dst, nbits); - for_each_set_bit(oldbit, orig, bits) + for_each_set_bit(oldbit, orig, nbits) set_bit(oldbit % sz, dst); } EXPORT_SYMBOL(bitmap_fold); -- cgit v1.2.3 From f1e664abf4c6368c35ba92af53f6db28b3a25a52 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Fri, 30 Jan 2015 13:12:13 +1100 Subject: lib/bitmap.c: simplify bitmap_ord_to_pos Make the return value and the ord and nbits parameters of bitmap_ord_to_pos unsigned. Also, simplify the implementation and as a side effect make the result fully defined, returning nbits for ord >= weight, in analogy with what find_{first,next}_bit does. This is a better sentinel than the former ("unofficial") 0. No current users are affected by this change. 
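For illustration only (not part of the patch), the new sentinel behaviour with the bits-4-through-7 example from the kerneldoc looks like this; the 16-bit bitmap and the variable names are made up:

    #include <linux/bitmap.h>

    static void ord_to_pos_example(void)
    {
            DECLARE_BITMAP(map, 16);
            unsigned int pos;

            bitmap_zero(map, 16);
            bitmap_set(map, 4, 4);                  /* bits 4,5,6,7 set */

            pos = bitmap_ord_to_pos(map, 0, 16);    /* pos == 4 */
            pos = bitmap_ord_to_pos(map, 3, 16);    /* pos == 7 */
            pos = bitmap_ord_to_pos(map, 4, 16);    /* ord >= weight: pos == 16 == nbits */
    }
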
Signed-off-by: Rasmus Villemoes Signed-off-by: Andrew Morton --- include/linux/bitmap.h | 2 +- lib/bitmap.c | 28 +++++++++++----------------- 2 files changed, 12 insertions(+), 18 deletions(-) (limited to 'include') diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index 95dcd2f76e1a..1e74fe7aa167 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -171,7 +171,7 @@ extern int bitmap_find_free_region(unsigned long *bitmap, unsigned int bits, int extern void bitmap_release_region(unsigned long *bitmap, unsigned int pos, int order); extern int bitmap_allocate_region(unsigned long *bitmap, unsigned int pos, int order); extern void bitmap_copy_le(void *dst, const unsigned long *src, int nbits); -extern int bitmap_ord_to_pos(const unsigned long *bitmap, int n, int bits); +extern unsigned int bitmap_ord_to_pos(const unsigned long *bitmap, unsigned int ord, unsigned int nbits); extern int bitmap_print_to_pagebuf(bool list, char *buf, const unsigned long *maskp, int nmaskbits); diff --git a/lib/bitmap.c b/lib/bitmap.c index 84d20b5c6bf1..e8a38bde7af9 100644 --- a/lib/bitmap.c +++ b/lib/bitmap.c @@ -771,34 +771,28 @@ static int bitmap_pos_to_ord(const unsigned long *buf, unsigned int pos, unsigne * bitmap_ord_to_pos - find position of n-th set bit in bitmap * @buf: pointer to bitmap * @ord: ordinal bit position (n-th set bit, n >= 0) - * @bits: number of valid bit positions in @buf + * @nbits: number of valid bit positions in @buf * * Map the ordinal offset of bit @ord in @buf to its position in @buf. - * Value of @ord should be in range 0 <= @ord < weight(buf), else - * results are undefined. + * Value of @ord should be in range 0 <= @ord < weight(buf). If @ord + * >= weight(buf), returns @nbits. * * If for example, just bits 4 through 7 are set in @buf, then @ord * values 0 through 3 will get mapped to 4 through 7, respectively, - * and all other @ord values return undefined values. When @ord value 3 + * and all other @ord values returns @nbits. When @ord value 3 * gets mapped to (returns) @pos value 7 in this example, that means * that the 3rd set bit (starting with 0th) is at position 7 in @buf. * - * The bit positions 0 through @bits are valid positions in @buf. + * The bit positions 0 through @nbits-1 are valid positions in @buf. */ -int bitmap_ord_to_pos(const unsigned long *buf, int ord, int bits) +unsigned int bitmap_ord_to_pos(const unsigned long *buf, unsigned int ord, unsigned int nbits) { - int pos = 0; + unsigned int pos; - if (ord >= 0 && ord < bits) { - int i; - - for (i = find_first_bit(buf, bits); - i < bits && ord > 0; - i = find_next_bit(buf, bits, i + 1)) - ord--; - if (i < bits && ord == 0) - pos = i; - } + for (pos = find_first_bit(buf, nbits); + pos < nbits && ord; + pos = find_next_bit(buf, nbits, pos + 1)) + ord--; return pos; } -- cgit v1.2.3 From 94d348e0d36901627c15fff62cba22a6160ea1b1 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Fri, 30 Jan 2015 13:12:13 +1100 Subject: lib/bitmap.c: make the bits parameter of bitmap_remap unsigned Also, rename bits to nbits. Both changes for consistency with other bitmap_* functions. 
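For illustration only (not part of the patch), the mapping semantics of bitmap_remap() with the new unsigned nbits, using made-up 8-bit maps:

    #include <linux/bitmap.h>

    static void remap_example(void)
    {
            DECLARE_BITMAP(src, 8);
            DECLARE_BITMAP(dst, 8);
            DECLARE_BITMAP(old_map, 8);
            DECLARE_BITMAP(new_map, 8);

            bitmap_zero(src, 8);
            bitmap_zero(old_map, 8);
            bitmap_zero(new_map, 8);

            bitmap_set(old_map, 4, 4);      /* old = {4,5,6,7} */
            bitmap_set(new_map, 0, 4);      /* new = {0,1,2,3} */
            __set_bit(5, src);              /* src = {5} */

            /* bit 5 is ordinal 1 in old, so it maps to ordinal 1 in new */
            bitmap_remap(dst, src, old_map, new_map, 8);    /* dst = {1} */
    }
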
Signed-off-by: Rasmus Villemoes Signed-off-by: Andrew Morton --- include/linux/bitmap.h | 2 +- lib/bitmap.c | 16 ++++++++-------- 2 files changed, 9 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index 1e74fe7aa167..5f5c00de39f0 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -160,7 +160,7 @@ extern int bitmap_parselist(const char *buf, unsigned long *maskp, extern int bitmap_parselist_user(const char __user *ubuf, unsigned int ulen, unsigned long *dst, int nbits); extern void bitmap_remap(unsigned long *dst, const unsigned long *src, - const unsigned long *old, const unsigned long *new, int bits); + const unsigned long *old, const unsigned long *new, unsigned int nbits); extern int bitmap_bitremap(int oldbit, const unsigned long *old, const unsigned long *new, int bits); extern void bitmap_onto(unsigned long *dst, const unsigned long *orig, diff --git a/lib/bitmap.c b/lib/bitmap.c index e8a38bde7af9..ad161a6c82db 100644 --- a/lib/bitmap.c +++ b/lib/bitmap.c @@ -803,7 +803,7 @@ unsigned int bitmap_ord_to_pos(const unsigned long *buf, unsigned int ord, unsig * @src: subset to be remapped * @old: defines domain of map * @new: defines range of map - * @bits: number of bits in each of these bitmaps + * @nbits: number of bits in each of these bitmaps * * Let @old and @new define a mapping of bit positions, such that * whatever position is held by the n-th set bit in @old is mapped @@ -831,22 +831,22 @@ unsigned int bitmap_ord_to_pos(const unsigned long *buf, unsigned int ord, unsig */ void bitmap_remap(unsigned long *dst, const unsigned long *src, const unsigned long *old, const unsigned long *new, - int bits) + unsigned int nbits) { - int oldbit, w; + unsigned int oldbit, w; if (dst == src) /* following doesn't handle inplace remaps */ return; - bitmap_zero(dst, bits); + bitmap_zero(dst, nbits); - w = bitmap_weight(new, bits); - for_each_set_bit(oldbit, src, bits) { - int n = bitmap_pos_to_ord(old, oldbit, bits); + w = bitmap_weight(new, nbits); + for_each_set_bit(oldbit, src, nbits) { + int n = bitmap_pos_to_ord(old, oldbit, nbits); if (n < 0 || w == 0) set_bit(oldbit, dst); /* identity map */ else - set_bit(bitmap_ord_to_pos(new, n % w, bits), dst); + set_bit(bitmap_ord_to_pos(new, n % w, nbits), dst); } } EXPORT_SYMBOL(bitmap_remap); -- cgit v1.2.3 From 55ad36254cb7b68b526b482b2d13fb007f62a64b Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Fri, 30 Jan 2015 13:12:13 +1100 Subject: lib/string.c: remove strnicmp() Now that all in-tree users of strnicmp have been converted to strncasecmp, the wrapper can be removed. 
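Purely as an illustration (the caller below is hypothetical, not from the tree), the tree-wide conversion that made this removal possible was a one-for-one rename:

    #include <linux/string.h>
    #include <linux/types.h>

    /* "opt" is a made-up option string for the example */
    static bool opt_is_verbose(const char *opt)
    {
            /* was: return strnicmp(opt, "verbose", 7) == 0; */
            return strncasecmp(opt, "verbose", 7) == 0;
    }
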
Signed-off-by: Rasmus Villemoes Cc: David Howells Cc: Heiko Carstens Signed-off-by: Andrew Morton --- arch/frv/include/asm/string.h | 1 - arch/s390/include/asm/string.h | 1 - include/linux/string.h | 3 --- lib/string.c | 8 -------- 4 files changed, 13 deletions(-) (limited to 'include') diff --git a/arch/frv/include/asm/string.h b/arch/frv/include/asm/string.h index 5ed310f64b7e..1f6c35990439 100644 --- a/arch/frv/include/asm/string.h +++ b/arch/frv/include/asm/string.h @@ -33,7 +33,6 @@ extern void *memcpy(void *, const void *, __kernel_size_t); #define __HAVE_ARCH_STRNCAT 1 #define __HAVE_ARCH_STRCMP 1 #define __HAVE_ARCH_STRNCMP 1 -#define __HAVE_ARCH_STRNICMP 1 #define __HAVE_ARCH_STRCHR 1 #define __HAVE_ARCH_STRRCHR 1 #define __HAVE_ARCH_STRSTR 1 diff --git a/arch/s390/include/asm/string.h b/arch/s390/include/asm/string.h index 7e2dcd7c57ef..8662f5c8e17f 100644 --- a/arch/s390/include/asm/string.h +++ b/arch/s390/include/asm/string.h @@ -44,7 +44,6 @@ extern char *strstr(const char *, const char *); #undef __HAVE_ARCH_STRCHR #undef __HAVE_ARCH_STRNCHR #undef __HAVE_ARCH_STRNCMP -#undef __HAVE_ARCH_STRNICMP #undef __HAVE_ARCH_STRPBRK #undef __HAVE_ARCH_STRSEP #undef __HAVE_ARCH_STRSPN diff --git a/include/linux/string.h b/include/linux/string.h index 2e22a2e58f3a..b9bc9a5d9e21 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -40,9 +40,6 @@ extern int strcmp(const char *,const char *); #ifndef __HAVE_ARCH_STRNCMP extern int strncmp(const char *,const char *,__kernel_size_t); #endif -#ifndef __HAVE_ARCH_STRNICMP -#define strnicmp strncasecmp -#endif #ifndef __HAVE_ARCH_STRCASECMP extern int strcasecmp(const char *s1, const char *s2); #endif diff --git a/lib/string.c b/lib/string.c index 10063300b830..3206d0178296 100644 --- a/lib/string.c +++ b/lib/string.c @@ -58,14 +58,6 @@ int strncasecmp(const char *s1, const char *s2, size_t len) } EXPORT_SYMBOL(strncasecmp); #endif -#ifndef __HAVE_ARCH_STRNICMP -#undef strnicmp -int strnicmp(const char *s1, const char *s2, size_t len) -{ - return strncasecmp(s1, s2, len); -} -EXPORT_SYMBOL(strnicmp); -#endif #ifndef __HAVE_ARCH_STRCASECMP int strcasecmp(const char *s1, const char *s2) -- cgit v1.2.3 From 16e8b721bc5afc293b641732b94b2aa5c8a6a9b5 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 30 Jan 2015 13:12:14 +1100 Subject: hexdump: makes it return number of bytes placed in buffer This patch makes hexdump return the number of bytes placed in the buffer excluding trailing NUL. In the case of overflow it returns the desired amount of bytes to produce the entire dump. Thus, it mimics snprintf(). This will be useful for users that would like to repeat with a bigger buffer. 
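A sketch (not part of the patch) of the snprintf-style retry pattern this return value enables; the helper name and buffer sizes are made up:

    #include <linux/printk.h>
    #include <linux/slab.h>
    #include <linux/string.h>

    static char *dump_line(const void *data, size_t len)
    {
            char small[32];
            char *big;
            int needed;

            /* first try a small on-stack buffer */
            needed = hex_dump_to_buffer(data, len, 16, 1,
                                        small, sizeof(small), true);
            if (needed < (int)sizeof(small))
                    return kstrdup(small, GFP_KERNEL);

            /* truncated: allocate exactly what is needed and retry */
            big = kmalloc(needed + 1, GFP_KERNEL);
            if (big)
                    hex_dump_to_buffer(data, len, 16, 1, big, needed + 1, true);
            return big;
    }
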
Signed-off-by: Andy Shevchenko Signed-off-by: Andrew Morton --- include/linux/printk.h | 6 ++--- lib/hexdump.c | 73 +++++++++++++++++++++++++++++++++++++------------- lib/test-hexdump.c | 45 +++++++++++++++++++++++++++++++ 3 files changed, 103 insertions(+), 21 deletions(-) (limited to 'include') diff --git a/include/linux/printk.h b/include/linux/printk.h index 4d5bf5726578..baa3f97d8ce8 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -417,9 +417,9 @@ enum { DUMP_PREFIX_ADDRESS, DUMP_PREFIX_OFFSET }; -extern void hex_dump_to_buffer(const void *buf, size_t len, - int rowsize, int groupsize, - char *linebuf, size_t linebuflen, bool ascii); +extern int hex_dump_to_buffer(const void *buf, size_t len, int rowsize, + int groupsize, char *linebuf, size_t linebuflen, + bool ascii); #ifdef CONFIG_PRINTK extern void print_hex_dump(const char *level, const char *prefix_str, int prefix_type, int rowsize, int groupsize, diff --git a/lib/hexdump.c b/lib/hexdump.c index 4af53f73c7cc..7ea09699855d 100644 --- a/lib/hexdump.c +++ b/lib/hexdump.c @@ -97,22 +97,26 @@ EXPORT_SYMBOL(bin2hex); * * example output buffer: * 40 41 42 43 44 45 46 47 48 49 4a 4b 4c 4d 4e 4f @ABCDEFGHIJKLMNO + * + * Return: + * The amount of bytes placed in the buffer without terminating NUL. If the + * output was truncated, then the return value is the number of bytes + * (excluding the terminating NUL) which would have been written to the final + * string if enough space had been available. */ -void hex_dump_to_buffer(const void *buf, size_t len, int rowsize, - int groupsize, char *linebuf, size_t linebuflen, - bool ascii) +int hex_dump_to_buffer(const void *buf, size_t len, int rowsize, int groupsize, + char *linebuf, size_t linebuflen, bool ascii) { const u8 *ptr = buf; int ngroups; u8 ch; int j, lx = 0; int ascii_column; + int ret; if (rowsize != 16 && rowsize != 32) rowsize = 16; - if (!len) - goto nil; if (len > rowsize) /* limit to one line at a time */ len = rowsize; if (!is_power_of_2(groupsize) || groupsize > 8) @@ -122,27 +126,50 @@ void hex_dump_to_buffer(const void *buf, size_t len, int rowsize, ngroups = len / groupsize; ascii_column = rowsize * 2 + rowsize / groupsize + 1; + + if (!linebuflen) + goto overflow1; + + if (!len) + goto nil; + if (groupsize == 8) { const u64 *ptr8 = buf; - for (j = 0; j < ngroups; j++) - lx += scnprintf(linebuf + lx, linebuflen - lx, - "%s%16.16llx", j ? " " : "", - (unsigned long long)*(ptr8 + j)); + for (j = 0; j < ngroups; j++) { + ret = snprintf(linebuf + lx, linebuflen - lx, + "%s%16.16llx", j ? " " : "", + (unsigned long long)*(ptr8 + j)); + if (ret >= linebuflen - lx) + goto overflow1; + lx += ret; + } } else if (groupsize == 4) { const u32 *ptr4 = buf; - for (j = 0; j < ngroups; j++) - lx += scnprintf(linebuf + lx, linebuflen - lx, - "%s%8.8x", j ? " " : "", *(ptr4 + j)); + for (j = 0; j < ngroups; j++) { + ret = snprintf(linebuf + lx, linebuflen - lx, + "%s%8.8x", j ? " " : "", + *(ptr4 + j)); + if (ret >= linebuflen - lx) + goto overflow1; + lx += ret; + } } else if (groupsize == 2) { const u16 *ptr2 = buf; - for (j = 0; j < ngroups; j++) - lx += scnprintf(linebuf + lx, linebuflen - lx, - "%s%4.4x", j ? " " : "", *(ptr2 + j)); + for (j = 0; j < ngroups; j++) { + ret = snprintf(linebuf + lx, linebuflen - lx, + "%s%4.4x", j ? 
" " : "", + *(ptr2 + j)); + if (ret >= linebuflen - lx) + goto overflow1; + lx += ret; + } } else { - for (j = 0; (j < len) && (lx + 3) <= linebuflen; j++) { + for (j = 0; j < len; j++) { + if (linebuflen < lx + 3) + goto overflow2; ch = ptr[j]; linebuf[lx++] = hex_asc_hi(ch); linebuf[lx++] = hex_asc_lo(ch); @@ -154,14 +181,24 @@ void hex_dump_to_buffer(const void *buf, size_t len, int rowsize, if (!ascii) goto nil; - while (lx < (linebuflen - 1) && lx < ascii_column) + while (lx < ascii_column) { + if (linebuflen < lx + 2) + goto overflow2; linebuf[lx++] = ' '; - for (j = 0; (j < len) && (lx + 2) < linebuflen; j++) { + } + for (j = 0; j < len; j++) { + if (linebuflen < lx + 2) + goto overflow2; ch = ptr[j]; linebuf[lx++] = (isascii(ch) && isprint(ch)) ? ch : '.'; } nil: + linebuf[lx] = '\0'; + return lx; +overflow2: linebuf[lx++] = '\0'; +overflow1: + return ascii ? ascii_column + len : (groupsize * 2 + 1) * ngroups - 1; } EXPORT_SYMBOL(hex_dump_to_buffer); diff --git a/lib/test-hexdump.c b/lib/test-hexdump.c index 9d3bd1e9ae48..d10f90b5193f 100644 --- a/lib/test-hexdump.c +++ b/lib/test-hexdump.c @@ -114,6 +114,45 @@ static void __init test_hexdump_set(int rowsize, bool ascii) test_hexdump(len, rowsize, 1, ascii); } +static void __init test_hexdump_overflow(bool ascii) +{ + char buf[56]; + const char *t = test_data_1_le[0]; + size_t l = get_random_int() % sizeof(buf); + bool a; + int e, r; + + memset(buf, ' ', sizeof(buf)); + + r = hex_dump_to_buffer(data_b, 1, 16, 1, buf, l, ascii); + + if (ascii) + e = 50; + else + e = 2; + buf[e + 2] = '\0'; + + if (!l) { + a = r == e && buf[0] == ' '; + } else if (l < 3) { + a = r == e && buf[0] == '\0'; + } else if (l < 4) { + a = r == e && !strcmp(buf, t); + } else if (ascii) { + if (l < 51) + a = r == e && buf[l - 1] == '\0' && buf[l - 2] == ' '; + else + a = r == e && buf[50] == '\0' && buf[49] == '.'; + } else { + a = r == e && buf[e] == '\0'; + } + + if (!a) { + pr_err("Len: %zu rc: %zu strlen: %zu\n", l, r, strlen(buf)); + pr_err("Result: '%s'\n", buf); + } +} + static int __init test_hexdump_init(void) { unsigned int i; @@ -129,6 +168,12 @@ static int __init test_hexdump_init(void) for (i = 0; i < 16; i++) test_hexdump_set(rowsize, true); + for (i = 0; i < 16; i++) + test_hexdump_overflow(false); + + for (i = 0; i < 16; i++) + test_hexdump_overflow(true); + return -EINVAL; } module_init(test_hexdump_init); -- cgit v1.2.3 From 7ad16cb5e47d9a2d0e4ea4693262b1b5e299876d Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Fri, 30 Jan 2015 13:12:15 +1100 Subject: lib/halfmd4.c: simplify includes We only need EXPORT_SYMBOL, so compiler.h and export.h suffice. This means linux/types.h is no longer implicitly included, so add an include of uapi/linux/types.h to linux/cryptohash.h for __u32. Other users of cryptohash.h cannot be affected, since they must already have been including uapi/linux/types.h in order for gcc not to complain about unknown types. 
Signed-off-by: Rasmus Villemoes Signed-off-by: Andrew Morton --- include/linux/cryptohash.h | 2 ++ lib/halfmd4.c | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/cryptohash.h b/include/linux/cryptohash.h index 2cd9f1cf9fa3..f4754282c9c2 100644 --- a/include/linux/cryptohash.h +++ b/include/linux/cryptohash.h @@ -1,6 +1,8 @@ #ifndef __CRYPTOHASH_H #define __CRYPTOHASH_H +#include + #define SHA_DIGEST_WORDS 5 #define SHA_MESSAGE_BYTES (512 /*bits*/ / 8) #define SHA_WORKSPACE_WORDS 16 diff --git a/lib/halfmd4.c b/lib/halfmd4.c index 66d0ee8b7776..a8fe6274a13c 100644 --- a/lib/halfmd4.c +++ b/lib/halfmd4.c @@ -1,4 +1,4 @@ -#include +#include #include #include -- cgit v1.2.3 From 443ab752f1f4fd41a8e05e387c3915670e74b969 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Fri, 30 Jan 2015 13:12:18 +1100 Subject: lib/bitmap.c: change prototype of bitmap_copy_le Make the prototype of bitmap_copy_le the same as bitmap_copy's. All other bitmap_* functions take unsigned long* parameters; there's no reason this should be special. The only current user is the static inline uwb_mas_bm_copy_le, which already does the void* laundering, so the end users can pass their u8 or __le32 buffers without a cast. Furthermore, this allows us to simply let bitmap_copy_le be an alias for bitmap_copy on little-endian; see next patch. Signed-off-by: Rasmus Villemoes Signed-off-by: Andrew Morton --- include/linux/bitmap.h | 2 +- lib/bitmap.c | 9 ++++----- 2 files changed, 5 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index 5f5c00de39f0..334fe32d8f0e 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -170,7 +170,7 @@ extern void bitmap_fold(unsigned long *dst, const unsigned long *orig, extern int bitmap_find_free_region(unsigned long *bitmap, unsigned int bits, int order); extern void bitmap_release_region(unsigned long *bitmap, unsigned int pos, int order); extern int bitmap_allocate_region(unsigned long *bitmap, unsigned int pos, int order); -extern void bitmap_copy_le(void *dst, const unsigned long *src, int nbits); +extern void bitmap_copy_le(unsigned long *dst, const unsigned long *src, unsigned int nbits); extern unsigned int bitmap_ord_to_pos(const unsigned long *bitmap, unsigned int ord, unsigned int nbits); extern int bitmap_print_to_pagebuf(bool list, char *buf, const unsigned long *maskp, int nmaskbits); diff --git a/lib/bitmap.c b/lib/bitmap.c index ad161a6c82db..e4ac20bec76c 100644 --- a/lib/bitmap.c +++ b/lib/bitmap.c @@ -1191,16 +1191,15 @@ EXPORT_SYMBOL(bitmap_allocate_region); * * Require nbits % BITS_PER_LONG == 0. */ -void bitmap_copy_le(void *dst, const unsigned long *src, int nbits) +void bitmap_copy_le(unsigned long *dst, const unsigned long *src, unsigned int nbits) { - unsigned long *d = dst; - int i; + unsigned int i; for (i = 0; i < nbits/BITS_PER_LONG; i++) { if (BITS_PER_LONG == 64) - d[i] = cpu_to_le64(src[i]); + dst[i] = cpu_to_le64(src[i]); else - d[i] = cpu_to_le32(src[i]); + dst[i] = cpu_to_le32(src[i]); } } EXPORT_SYMBOL(bitmap_copy_le); -- cgit v1.2.3 From 0b5b42687a375ccb45c2d733c648c7765203c295 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Fri, 30 Jan 2015 13:12:18 +1100 Subject: lib/bitmap.c: elide bitmap_copy_le on little-endian On little-endian, there's no reason to have an extra, presumably less efficient, way of copying a bitmap. 
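[Illustrative usage sketch, not part of the patches: the 64-bit mask size and the bit values below are invented. With the new prototype the caller passes ordinary unsigned long bitmaps on both sides, and after the little-endian patch that follows, the call degenerates to a plain bitmap_copy() on LE builds.]

    #include <linux/bitmap.h>

    #define EXAMPLE_BITS 64         /* must be a multiple of BITS_PER_LONG */

    static void example_copy_le(void)
    {
            DECLARE_BITMAP(src, EXAMPLE_BITS);
            DECLARE_BITMAP(dst, EXAMPLE_BITS);

            bitmap_zero(src, EXAMPLE_BITS);
            bitmap_set(src, 0, 4);          /* set bits 0..3 */

            /* dst ends up holding the little-endian representation of src */
            bitmap_copy_le(dst, src, EXAMPLE_BITS);
    }

No cast is needed on dst any more; the void * laundering now happens only inside the static inline uwb_mas_bm_copy_le() mentioned above.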
Signed-off-by: Rasmus Villemoes Signed-off-by: Andrew Morton --- include/linux/bitmap.h | 4 ++++ lib/bitmap.c | 2 ++ 2 files changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index 334fe32d8f0e..cffc89c23c02 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -170,7 +170,11 @@ extern void bitmap_fold(unsigned long *dst, const unsigned long *orig, extern int bitmap_find_free_region(unsigned long *bitmap, unsigned int bits, int order); extern void bitmap_release_region(unsigned long *bitmap, unsigned int pos, int order); extern int bitmap_allocate_region(unsigned long *bitmap, unsigned int pos, int order); +#ifdef __BIG_ENDIAN extern void bitmap_copy_le(unsigned long *dst, const unsigned long *src, unsigned int nbits); +#else +#define bitmap_copy_le bitmap_copy +#endif extern unsigned int bitmap_ord_to_pos(const unsigned long *bitmap, unsigned int ord, unsigned int nbits); extern int bitmap_print_to_pagebuf(bool list, char *buf, const unsigned long *maskp, int nmaskbits); diff --git a/lib/bitmap.c b/lib/bitmap.c index e4ac20bec76c..d2cd50cd4f5d 100644 --- a/lib/bitmap.c +++ b/lib/bitmap.c @@ -1191,6 +1191,7 @@ EXPORT_SYMBOL(bitmap_allocate_region); * * Require nbits % BITS_PER_LONG == 0. */ +#ifdef __BIG_ENDIAN void bitmap_copy_le(unsigned long *dst, const unsigned long *src, unsigned int nbits) { unsigned int i; @@ -1203,3 +1204,4 @@ void bitmap_copy_le(unsigned long *dst, const unsigned long *src, unsigned int n } } EXPORT_SYMBOL(bitmap_copy_le); +#endif -- cgit v1.2.3 From 1ad48ccd3f6c754e47a499ffeea6a13c9fe48954 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Fri, 30 Jan 2015 13:12:19 +1100 Subject: lib: bitmap: change bitmap_shift_right to take unsigned parameters I've previously changed the nbits parameter of most bitmap_* functions to unsigned; now it is bitmap_shift_{left,right}'s turn. This alone saves some .text, but while at it I found that there were a few other things one could do. The end result of these seven patches is $ scripts/bloat-o-meter /tmp/bitmap.o.{old,new} add/remove: 0/0 grow/shrink: 0/2 up/down: 0/-328 (-328) function old new delta __bitmap_shift_right 384 226 -158 __bitmap_shift_left 306 136 -170 and less importantly also a smaller stack footprint $ stack-o-meter.pl master bitmap file function old new delta lib/bitmap.o __bitmap_shift_right 24 8 -16 lib/bitmap.o __bitmap_shift_left 24 0 -24 For each pair of 0 <= shift <= nbits <= 256 I've tested the end result with a few randomly filled src buffers (including garbage beyond nbits), in each case verifying that the shift {left,right}-most bits of dst are zero and the remaining nbits-shift bits correspond to src, so I'm fairly confident I didn't screw up. That hasn't stopped me from being wrong before, though. This patch (of 7): gcc can generate slightly better code for stuff like "nbits % BITS_PER_LONG" when it knows nbits is not negative. Since negative size bitmaps or shift amounts don't make sense, change these parameters of bitmap_shift_right to unsigned. The expressions involving "lim - 1" are still ok, since if lim is 0 the loop is never executed. Also use "shift" and "nbits" consistently for the parameter names. 
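[The property described in the changelog can be sketched as a self-check; this is purely illustrative and not part of the series. The 256-bit cap mirrors the range the author says was tested, and src is assumed to hold at least nbits valid bits.]

    #include <linux/bitmap.h>
    #include <linux/bitops.h>
    #include <linux/bug.h>

    /*
     * After the shift, bit i of dst must equal bit (i + shift) of src,
     * and the top `shift` bits inside nbits must be clear.
     */
    static void check_shift_right(const unsigned long *src,
                                  unsigned int shift, unsigned int nbits)
    {
            DECLARE_BITMAP(dst, 256);
            unsigned int i;

            if (WARN_ON(nbits > 256 || shift > nbits))
                    return;

            bitmap_shift_right(dst, src, shift, nbits);

            for (i = 0; i + shift < nbits; i++)
                    WARN_ON(!!test_bit(i, dst) != !!test_bit(i + shift, src));
            for (; i < nbits; i++)
                    WARN_ON(test_bit(i, dst));
    }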
Signed-off-by: Rasmus Villemoes Signed-off-by: Andrew Morton --- include/linux/bitmap.h | 12 ++++++------ lib/bitmap.c | 10 +++++----- 2 files changed, 11 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index cffc89c23c02..c168a807ab9a 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -96,8 +96,8 @@ extern int __bitmap_equal(const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int nbits); extern void __bitmap_complement(unsigned long *dst, const unsigned long *src, unsigned int nbits); -extern void __bitmap_shift_right(unsigned long *dst, - const unsigned long *src, int shift, int bits); +extern void __bitmap_shift_right(unsigned long *dst, const unsigned long *src, + unsigned int shift, unsigned int nbits); extern void __bitmap_shift_left(unsigned long *dst, const unsigned long *src, int shift, int bits); extern int __bitmap_and(unsigned long *dst, const unsigned long *bitmap1, @@ -313,13 +313,13 @@ static inline int bitmap_weight(const unsigned long *src, unsigned int nbits) return __bitmap_weight(src, nbits); } -static inline void bitmap_shift_right(unsigned long *dst, - const unsigned long *src, int n, int nbits) +static inline void bitmap_shift_right(unsigned long *dst, const unsigned long *src, + unsigned int shift, int nbits) { if (small_const_nbits(nbits)) - *dst = (*src & BITMAP_LAST_WORD_MASK(nbits)) >> n; + *dst = (*src & BITMAP_LAST_WORD_MASK(nbits)) >> shift; else - __bitmap_shift_right(dst, src, n, nbits); + __bitmap_shift_right(dst, src, shift, nbits); } static inline void bitmap_shift_left(unsigned long *dst, diff --git a/lib/bitmap.c b/lib/bitmap.c index d2cd50cd4f5d..45e7d14ebdfd 100644 --- a/lib/bitmap.c +++ b/lib/bitmap.c @@ -104,17 +104,17 @@ EXPORT_SYMBOL(__bitmap_complement); * @dst : destination bitmap * @src : source bitmap * @shift : shift by this many bits - * @bits : bitmap size, in bits + * @nbits : bitmap size, in bits * * Shifting right (dividing) means moving bits in the MS -> LS bit * direction. Zeros are fed into the vacated MS positions and the * LS bits shifted off the bottom are lost. */ -void __bitmap_shift_right(unsigned long *dst, - const unsigned long *src, int shift, int bits) +void __bitmap_shift_right(unsigned long *dst, const unsigned long *src, + unsigned shift, unsigned nbits) { - int k, lim = BITS_TO_LONGS(bits), left = bits % BITS_PER_LONG; - int off = shift/BITS_PER_LONG, rem = shift % BITS_PER_LONG; + unsigned k, lim = BITS_TO_LONGS(nbits), left = nbits % BITS_PER_LONG; + unsigned off = shift/BITS_PER_LONG, rem = shift % BITS_PER_LONG; unsigned long mask = (1UL << left) - 1; for (k = 0; off + k < lim; ++k) { unsigned long upper, lower; -- cgit v1.2.3 From 0b4f5af90597ad7cc5b96dfe797968dc94fca925 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Fri, 30 Jan 2015 13:12:19 +1100 Subject: lib: bitmap: change bitmap_shift_left to take unsigned parameters gcc can generate slightly better code for stuff like "nbits % BITS_PER_LONG" when it knows nbits is not negative. Since negative size bitmaps or shift amounts don't make sense, change these parameters of bitmap_shift_right to unsigned. If off >= lim (which requires shift >= nbits), k is initialized with a large positive value, but since I've let k continue to be signed, the loop will never run and dst will be zeroed as expected. Inside the loop, k is guaranteed to be non-negative, so the fact that it is promoted to unsigned in the various expressions it appears in is harmless. 
Also use "shift" and "nbits" consistently for the parameter names. Signed-off-by: Rasmus Villemoes Signed-off-by: Andrew Morton --- include/linux/bitmap.h | 12 ++++++------ lib/bitmap.c | 11 ++++++----- 2 files changed, 12 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index c168a807ab9a..5e7f75a6d7d0 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -98,8 +98,8 @@ extern void __bitmap_complement(unsigned long *dst, const unsigned long *src, unsigned int nbits); extern void __bitmap_shift_right(unsigned long *dst, const unsigned long *src, unsigned int shift, unsigned int nbits); -extern void __bitmap_shift_left(unsigned long *dst, - const unsigned long *src, int shift, int bits); +extern void __bitmap_shift_left(unsigned long *dst, const unsigned long *src, + unsigned int shift, unsigned int nbits); extern int __bitmap_and(unsigned long *dst, const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int nbits); extern void __bitmap_or(unsigned long *dst, const unsigned long *bitmap1, @@ -322,13 +322,13 @@ static inline void bitmap_shift_right(unsigned long *dst, const unsigned long *s __bitmap_shift_right(dst, src, shift, nbits); } -static inline void bitmap_shift_left(unsigned long *dst, - const unsigned long *src, int n, int nbits) +static inline void bitmap_shift_left(unsigned long *dst, const unsigned long *src, + unsigned int shift, unsigned int nbits) { if (small_const_nbits(nbits)) - *dst = (*src << n) & BITMAP_LAST_WORD_MASK(nbits); + *dst = (*src << shift) & BITMAP_LAST_WORD_MASK(nbits); else - __bitmap_shift_left(dst, src, n, nbits); + __bitmap_shift_left(dst, src, shift, nbits); } static inline int bitmap_parse(const char *buf, unsigned int buflen, diff --git a/lib/bitmap.c b/lib/bitmap.c index db88512c3451..74bdf3601245 100644 --- a/lib/bitmap.c +++ b/lib/bitmap.c @@ -148,18 +148,19 @@ EXPORT_SYMBOL(__bitmap_shift_right); * @dst : destination bitmap * @src : source bitmap * @shift : shift by this many bits - * @bits : bitmap size, in bits + * @nbits : bitmap size, in bits * * Shifting left (multiplying) means moving bits in the LS -> MS * direction. Zeros are fed into the vacated LS bit positions * and those MS bits shifted off the top are lost. */ -void __bitmap_shift_left(unsigned long *dst, - const unsigned long *src, int shift, int bits) +void __bitmap_shift_left(unsigned long *dst, const unsigned long *src, + unsigned int shift, unsigned int nbits) { - int k, lim = BITS_TO_LONGS(bits), left = bits % BITS_PER_LONG; - int off = shift/BITS_PER_LONG, rem = shift % BITS_PER_LONG; + int k; + unsigned int lim = BITS_TO_LONGS(nbits), left = nbits % BITS_PER_LONG; + unsigned int off = shift/BITS_PER_LONG, rem = shift % BITS_PER_LONG; for (k = lim - off - 1; k >= 0; --k) { unsigned long upper, lower; -- cgit v1.2.3 From 9f5b2beccb0ad1133b301b9bfec7c0facd18dbef Mon Sep 17 00:00:00 2001 From: Andrzej Hajda Date: Fri, 30 Jan 2015 13:12:20 +1100 Subject: mm/util: add kstrdup_const kstrdup() is often used to duplicate strings where neither source neither destination will be ever modified. In such case we can just reuse the source instead of duplicating it. The problem is that we must be sure that the source is non-modifiable and its life-time is long enough. I suspect the good candidates for such strings are strings located in kernel .rodata section, they cannot be modifed because the section is read-only and their life-time is equal to kernel life-time. 
This small patchset proposes alternative version of kstrdup - kstrdup_const, which returns source string if it is located in .rodata otherwise it fallbacks to kstrdup. To verify if the source is in .rodata function checks if the address is between sentinels __start_rodata, __end_rodata. I guess it should work with all architectures. The main patch is accompanied by four patches constifying kstrdup for cases where situtation described above happens frequently. I have tested the patchset on mobile platform (exynos4210-trats) and it saves 3272 string allocations. Since minimal allocation is 32 or 64 bytes depending on Kconfig options the patchset saves respectively about 100KB or 200KB of memory. Stats from tested platform show that the main offender is sysfs: By caller: 2260 __kernfs_new_node 631 clk_register+0xc8/0x1b8 318 clk_register+0x34/0x1b8 51 kmem_cache_create 12 alloc_vfsmnt By string (with count >= 5): 883 power 876 subsystem 135 parameters 132 device 61 iommu_group ... This patch (of 5): Add an alternative version of kstrdup which returns pointer to constant char array. The function checks if input string is in persistent and read-only memory section, if yes it returns the input string, otherwise it fallbacks to kstrdup. kstrdup_const is accompanied by kfree_const performing conditional memory deallocation of the string. Signed-off-by: Andrzej Hajda Cc: Marek Szyprowski Cc: Kyungmin Park Cc: Mike Turquette Cc: Alexander Viro Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Tejun Heo Cc: Greg KH Cc: Geert Uytterhoeven Signed-off-by: Andrew Morton --- include/linux/string.h | 3 +++ mm/util.c | 38 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 41 insertions(+) (limited to 'include') diff --git a/include/linux/string.h b/include/linux/string.h index b9bc9a5d9e21..e40099e585c9 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -112,7 +112,10 @@ extern void * memchr(const void *,int,__kernel_size_t); #endif void *memchr_inv(const void *s, int c, size_t n); +extern void kfree_const(const void *x); + extern char *kstrdup(const char *s, gfp_t gfp); +extern const char *kstrdup_const(const char *s, gfp_t gfp); extern char *kstrndup(const char *s, size_t len, gfp_t gfp); extern void *kmemdup(const void *src, size_t len, gfp_t gfp); diff --git a/mm/util.c b/mm/util.c index f3ef639c4857..3981ae9d1b15 100644 --- a/mm/util.c +++ b/mm/util.c @@ -12,10 +12,30 @@ #include #include +#include #include #include "internal.h" +static inline int is_kernel_rodata(unsigned long addr) +{ + return addr >= (unsigned long)__start_rodata && + addr < (unsigned long)__end_rodata; +} + +/** + * kfree_const - conditionally free memory + * @x: pointer to the memory + * + * Function calls kfree only if @x is not in .rodata section. + */ +void kfree_const(const void *x) +{ + if (!is_kernel_rodata((unsigned long)x)) + kfree(x); +} +EXPORT_SYMBOL(kfree_const); + /** * kstrdup - allocate space for and copy an existing string * @s: the string to duplicate @@ -37,6 +57,24 @@ char *kstrdup(const char *s, gfp_t gfp) } EXPORT_SYMBOL(kstrdup); +/** + * kstrdup_const - conditionally duplicate an existing const string + * @s: the string to duplicate + * @gfp: the GFP mask used in the kmalloc() call when allocating memory + * + * Function returns source string if it is in .rodata section otherwise it + * fallbacks to kstrdup. + * Strings allocated by kstrdup_const should be freed by kfree_const. 
+ */ +const char *kstrdup_const(const char *s, gfp_t gfp) +{ + if (is_kernel_rodata((unsigned long)s)) + return s; + + return kstrdup(s, gfp); +} +EXPORT_SYMBOL(kstrdup_const); + /** * kstrndup - allocate space for and copy an existing string * @s: the string to duplicate -- cgit v1.2.3 From 376b6b20e59ff4c539bb74fd39a8581e872fe8ea Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 30 Jan 2015 13:12:21 +1100 Subject: kernfs: remove KERNFS_STATIC_NAME When a new kernfs node is created, KERNFS_STATIC_NAME is used to avoid making a separate copy of its name. It's currently only used for sysfs attributes whose filenames are required to stay accessible and unchanged. There are rare exceptions where these names are allocated and formatted dynamically but for the vast majority of cases they're consts in the rodata section. Now that kernfs is converted to use kstrdup_const() and kfree_const(), there's little point in keeping KERNFS_STATIC_NAME around. Remove it. Signed-off-by: Tejun Heo Cc: Andrzej Hajda Signed-off-by: Andrew Morton --- fs/kernfs/dir.c | 20 ++++++++------------ fs/kernfs/file.c | 4 ---- fs/sysfs/file.c | 2 +- include/linux/kernfs.h | 7 ++----- kernel/cgroup.c | 2 +- 5 files changed, 12 insertions(+), 23 deletions(-) (limited to 'include') diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index 35e40879860a..6acc9648f986 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -411,8 +411,9 @@ void kernfs_put(struct kernfs_node *kn) if (kernfs_type(kn) == KERNFS_LINK) kernfs_put(kn->symlink.target_kn); - if (!(kn->flags & KERNFS_STATIC_NAME)) - kfree_const(kn->name); + + kfree_const(kn->name); + if (kn->iattr) { if (kn->iattr->ia_secdata) security_release_secctx(kn->iattr->ia_secdata, @@ -506,15 +507,12 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root, const char *name, umode_t mode, unsigned flags) { - const char *dup_name = NULL; struct kernfs_node *kn; int ret; - if (!(flags & KERNFS_STATIC_NAME)) { - name = dup_name = kstrdup_const(name, GFP_KERNEL); - if (!name) - return NULL; - } + name = kstrdup_const(name, GFP_KERNEL); + if (!name) + return NULL; kn = kmem_cache_zalloc(kernfs_node_cache, GFP_KERNEL); if (!kn) @@ -538,7 +536,7 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root, err_out2: kmem_cache_free(kernfs_node_cache, kn); err_out1: - kfree_const(dup_name); + kfree_const(name); return NULL; } @@ -1285,9 +1283,7 @@ int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent, kn->ns = new_ns; if (new_name) { - if (!(kn->flags & KERNFS_STATIC_NAME)) - old_name = kn->name; - kn->flags &= ~KERNFS_STATIC_NAME; + old_name = kn->name; kn->name = new_name; } diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c index ddc9f9612f16..b684e8a132e6 100644 --- a/fs/kernfs/file.c +++ b/fs/kernfs/file.c @@ -901,7 +901,6 @@ const struct file_operations kernfs_file_fops = { * @ops: kernfs operations for the file * @priv: private data for the file * @ns: optional namespace tag of the file - * @name_is_static: don't copy file name * @key: lockdep key for the file's active_ref, %NULL to disable lockdep * * Returns the created node on success, ERR_PTR() value on error. 
@@ -911,7 +910,6 @@ struct kernfs_node *__kernfs_create_file(struct kernfs_node *parent, umode_t mode, loff_t size, const struct kernfs_ops *ops, void *priv, const void *ns, - bool name_is_static, struct lock_class_key *key) { struct kernfs_node *kn; @@ -919,8 +917,6 @@ struct kernfs_node *__kernfs_create_file(struct kernfs_node *parent, int rc; flags = KERNFS_FILE; - if (name_is_static) - flags |= KERNFS_STATIC_NAME; kn = kernfs_new_node(parent, name, (mode & S_IALLUGO) | S_IFREG, flags); if (!kn) diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index dfe928a9540f..7c2867b44141 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -295,7 +295,7 @@ int sysfs_add_file_mode_ns(struct kernfs_node *parent, key = attr->key ?: (struct lock_class_key *)&attr->skey; #endif kn = __kernfs_create_file(parent, attr->name, mode & 0777, size, ops, - (void *)attr, ns, true, key); + (void *)attr, ns, key); if (IS_ERR(kn)) { if (PTR_ERR(kn) == -EEXIST) sysfs_warn_dup(parent, attr->name); diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index d4e01b358341..71ecdab1671b 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -43,7 +43,6 @@ enum kernfs_node_flag { KERNFS_HAS_SEQ_SHOW = 0x0040, KERNFS_HAS_MMAP = 0x0080, KERNFS_LOCKDEP = 0x0100, - KERNFS_STATIC_NAME = 0x0200, KERNFS_SUICIDAL = 0x0400, KERNFS_SUICIDED = 0x0800, }; @@ -291,7 +290,6 @@ struct kernfs_node *__kernfs_create_file(struct kernfs_node *parent, umode_t mode, loff_t size, const struct kernfs_ops *ops, void *priv, const void *ns, - bool name_is_static, struct lock_class_key *key); struct kernfs_node *kernfs_create_link(struct kernfs_node *parent, const char *name, @@ -369,8 +367,7 @@ kernfs_create_dir_ns(struct kernfs_node *parent, const char *name, static inline struct kernfs_node * __kernfs_create_file(struct kernfs_node *parent, const char *name, umode_t mode, loff_t size, const struct kernfs_ops *ops, - void *priv, const void *ns, bool name_is_static, - struct lock_class_key *key) + void *priv, const void *ns, struct lock_class_key *key) { return ERR_PTR(-ENOSYS); } static inline struct kernfs_node * @@ -439,7 +436,7 @@ kernfs_create_file_ns(struct kernfs_node *parent, const char *name, key = (struct lock_class_key *)&ops->lockdep_key; #endif return __kernfs_create_file(parent, name, mode, size, ops, priv, ns, - false, key); + key); } static inline struct kernfs_node * diff --git a/kernel/cgroup.c b/kernel/cgroup.c index d5f6ec251fb2..29a7b2cc593e 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3077,7 +3077,7 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft) #endif kn = __kernfs_create_file(cgrp->kn, cgroup_file_name(cgrp, cft, name), cgroup_file_mode(cft), 0, cft->kf_ops, cft, - NULL, false, key); + NULL, key); if (IS_ERR(kn)) return PTR_ERR(kn); -- cgit v1.2.3 From 7fd7a380ece43bb98ee2404d6befabe8553c8aae Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 30 Jan 2015 13:12:22 +1100 Subject: cpumask: always use nr_cpu_ids in formatting and parsing functions bitmap implements two variants of scnprintf functions to format a bitmap into a string and cpumask and nodemask wrap them to provide equivalent interfaces. The scnprintf family of functions require a string buffer as an output target which complicates code paths which just want to print out the mask through printk for informational or debug purposes as they have to worry about how large the buffer should be and whether it's too large to allocate on stack. 
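[To make the pain point concrete, a hedged before/after sketch: the buffer-sizing heuristic and the message text are invented, and '%*pb[l]' plus cpumask_pr_args() are what the rest of this series introduces, not something available at this patch.]

    #include <linux/cpumask.h>
    #include <linux/printk.h>

    static void report_mask(const struct cpumask *mask)
    {
            /* Today: guess an upper bound and format into a stack buffer. */
            char buf[NR_CPUS / 4 + 32]; /* ad-hoc guess; may be too small for sparse list output */

            cpulist_scnprintf(buf, sizeof(buf), mask);
            pr_info("cpus: %s\n", buf);

            /* With this series: let vsnprintf format the bitmap directly. */
            pr_info("cpus: %*pbl\n", cpumask_pr_args(mask));
    }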
Neither cpumask nor nodemask provides a guideline on how large the target buffer should be, forcing users to come up with their own solutions - some allocate an arbitrarily sized buffer which is small enough to allocate on stack but may be too short in corner cases, others come up with a custom upper-limit calculation considering the output format, some allocate the buffer dynamically, while one resorted to using a lock to synchronize access to a static buffer. This is an artificial problem which is being solved repeatedly for no benefit. In a lot of cases, the output area already exists and can be targeted directly, making the intermediate buffer unnecessary. This patchset teaches the printf family of functions how to format bitmaps and replaces the dedicated formatting functions with it. Pointer formatting is extended to cover bitmap formatting. It uses the field width for the number of bits instead of precision. The format used is '%*pb[l]', with the optional trailing 'l' specifying list format instead of hex masks. For more details, please see 0002. This patch (of 31): Currently, the formatting and parsing functions in cpumask.h use nr_cpumask_bits like other cpumask functions; however, nr_cpumask_bits is either NR_CPUS or nr_cpu_ids depending on CONFIG_CPUMASK_OFFSTACK. This leads to inconsistent behaviors. With CONFIG_NR_CPUS=512 and !CONFIG_CPUMASK_OFFSTACK # cat /sys/devices/virtual/net/lo/queues/rx-0/rps_cpus 00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000 # cat /proc/self/status | grep Cpus_allowed: Cpus_allowed: f With CONFIG_NR_CPUS=1024 and CONFIG_CPUMASK_OFFSTACK (fedora default) # cat /sys/devices/virtual/net/lo/queues/rx-0/rps_cpus 0 # cat /proc/self/status | grep Cpus_allowed: Cpus_allowed: f Note that /proc/self/status always uses nr_cpu_ids regardless of config. This is because the seq cpumask formatting functions always use nr_cpu_ids. Given that the same output fields may switch between the two forms, converging on nr_cpu_ids always isn't too likely to surprise userland. This patch updates the formatting and parsing functions in cpumask.h to always use nr_cpu_ids. There's no point in dealing with CPUs which aren't even possible on the machine. Signed-off-by: Tejun Heo Cc: "David S. Miller" Cc: "James E.J. Bottomley" Cc: "John W. Linville" Cc: "Paul E. 
McKenney" Cc: Benjamin Herrenschmidt Cc: Chris Metcalf Cc: Chris Zankel Cc: Christoph Lameter Cc: Dmitry Torokhov Cc: Fenghua Yu Cc: Greg Kroah-Hartman Cc: Ingo Molnar Cc: Li Zefan Cc: Max Filippov Cc: Mike Travis Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Russell King Acked-by: Rusty Russell Cc: Steffen Klassert Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Tony Luck Signed-off-by: Andrew Morton --- include/linux/cpumask.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index ff9044286d88..ee9acb0ce542 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -550,7 +550,7 @@ static inline void cpumask_copy(struct cpumask *dstp, static inline int cpumask_scnprintf(char *buf, int len, const struct cpumask *srcp) { - return bitmap_scnprintf(buf, len, cpumask_bits(srcp), nr_cpumask_bits); + return bitmap_scnprintf(buf, len, cpumask_bits(srcp), nr_cpu_ids); } /** @@ -564,7 +564,7 @@ static inline int cpumask_scnprintf(char *buf, int len, static inline int cpumask_parse_user(const char __user *buf, int len, struct cpumask *dstp) { - return bitmap_parse_user(buf, len, cpumask_bits(dstp), nr_cpumask_bits); + return bitmap_parse_user(buf, len, cpumask_bits(dstp), nr_cpu_ids); } /** @@ -579,7 +579,7 @@ static inline int cpumask_parselist_user(const char __user *buf, int len, struct cpumask *dstp) { return bitmap_parselist_user(buf, len, cpumask_bits(dstp), - nr_cpumask_bits); + nr_cpu_ids); } /** @@ -595,7 +595,7 @@ static inline int cpulist_scnprintf(char *buf, int len, const struct cpumask *srcp) { return bitmap_scnlistprintf(buf, len, cpumask_bits(srcp), - nr_cpumask_bits); + nr_cpu_ids); } /** @@ -610,7 +610,7 @@ static inline int cpumask_parse(const char *buf, struct cpumask *dstp) char *nl = strchr(buf, '\n'); unsigned int len = nl ? (unsigned int)(nl - buf) : strlen(buf); - return bitmap_parse(buf, len, cpumask_bits(dstp), nr_cpumask_bits); + return bitmap_parse(buf, len, cpumask_bits(dstp), nr_cpu_ids); } /** @@ -622,7 +622,7 @@ static inline int cpumask_parse(const char *buf, struct cpumask *dstp) */ static inline int cpulist_parse(const char *buf, struct cpumask *dstp) { - return bitmap_parselist(buf, cpumask_bits(dstp), nr_cpumask_bits); + return bitmap_parselist(buf, cpumask_bits(dstp), nr_cpu_ids); } /** @@ -817,7 +817,7 @@ static inline ssize_t cpumap_print_to_pagebuf(bool list, char *buf, const struct cpumask *mask) { return bitmap_print_to_pagebuf(list, buf, cpumask_bits(mask), - nr_cpumask_bits); + nr_cpu_ids); } /* -- cgit v1.2.3 From 143bbc314f82c0be58d0fc119431f48c6fb92212 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 30 Jan 2015 13:12:22 +1100 Subject: cpumask, nodemask: implement cpumask/nodemask_pr_args() printf family of functions can now format bitmaps using '%*pb[l]' and all cpumask and nodemask formatting will be converted to use it. To ease printing these masks with '%*pb[l]' which require two params - the number of bits and the actual bitmap, this patch implement cpumask_pr_args() and nodemask_pr_args() which can be used to provide arguments for '%*pb[l]' Signed-off-by: Tejun Heo Cc: Rusty Russell Cc: "David S. Miller" Cc: "James E.J. Bottomley" Cc: "John W. Linville" Cc: "Paul E. 
McKenney" Cc: Benjamin Herrenschmidt Cc: Chris Metcalf Cc: Chris Zankel Cc: Christoph Lameter Cc: Dmitry Torokhov Cc: Fenghua Yu Cc: Greg Kroah-Hartman Cc: Ingo Molnar Cc: Li Zefan Cc: Max Filippov Cc: Mike Travis Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Russell King Cc: Steffen Klassert Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Tony Luck Signed-off-by: Andrew Morton --- include/linux/cpumask.h | 8 ++++++++ include/linux/nodemask.h | 8 ++++++++ 2 files changed, 16 insertions(+) (limited to 'include') diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index ee9acb0ce542..a9b3d00915a0 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -22,6 +22,14 @@ typedef struct cpumask { DECLARE_BITMAP(bits, NR_CPUS); } cpumask_t; */ #define cpumask_bits(maskp) ((maskp)->bits) +/** + * cpumask_pr_args - printf args to output a cpumask + * @maskp: cpumask to be printed + * + * Can be used to provide arguments for '%*pb[l]' when printing a cpumask. + */ +#define cpumask_pr_args(maskp) nr_cpu_ids, cpumask_bits(maskp) + #if NR_CPUS == 1 #define nr_cpu_ids 1 #else diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h index 21cef483dc1b..10f8e556ba07 100644 --- a/include/linux/nodemask.h +++ b/include/linux/nodemask.h @@ -98,6 +98,14 @@ typedef struct { DECLARE_BITMAP(bits, MAX_NUMNODES); } nodemask_t; extern nodemask_t _unused_nodemask_arg_; +/** + * nodemask_pr_args - printf args to output a nodemask + * @maskp: nodemask to be printed + * + * Can be used to provide arguments for '%*pb[l]' when printing a nodemask. + */ +#define nodemask_pr_args(maskp) MAX_NUMNODES, (maskp)->bits + /* * The inline keyword gives the compiler room to decide to inline, or * not inline a function as it sees best. However, as these functions -- cgit v1.2.3 From 32c720d8ef6faad6a46e8580b2032c5eb39cd6cd Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 30 Jan 2015 13:12:28 +1100 Subject: bitmap, cpumask, nodemask: remove dedicated formatting functions Now that all bitmap formatting usages have been converted to '%*pb[l]', the separate formatting functions are unnecessary. The following functions are removed. 
* bitmap_scn[list]printf() * cpumask_scnprintf(), cpulist_scnprintf() * [__]nodemask_scnprintf(), [__]nodelist_scnprintf() * seq_bitmap[_list](), seq_cpumask[_list](), seq_nodemask[_list]() * seq_buf_bitmask() Signed-off-by: Tejun Heo Cc: Rusty Russell Signed-off-by: Andrew Morton --- fs/seq_file.c | 32 -------------------------------- include/linux/bitmap.h | 7 ------- include/linux/cpumask.h | 31 ------------------------------- include/linux/nodemask.h | 33 +++++++-------------------------- include/linux/seq_buf.h | 3 --- include/linux/seq_file.h | 25 ------------------------- lib/bitmap.c | 41 ----------------------------------------- lib/seq_buf.c | 36 ------------------------------------ 8 files changed, 7 insertions(+), 201 deletions(-) (limited to 'include') diff --git a/fs/seq_file.c b/fs/seq_file.c index dbf3a59c86bb..555f82155be8 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -539,38 +539,6 @@ int seq_dentry(struct seq_file *m, struct dentry *dentry, const char *esc) return res; } -int seq_bitmap(struct seq_file *m, const unsigned long *bits, - unsigned int nr_bits) -{ - if (m->count < m->size) { - int len = bitmap_scnprintf(m->buf + m->count, - m->size - m->count, bits, nr_bits); - if (m->count + len < m->size) { - m->count += len; - return 0; - } - } - seq_set_overflow(m); - return -1; -} -EXPORT_SYMBOL(seq_bitmap); - -int seq_bitmap_list(struct seq_file *m, const unsigned long *bits, - unsigned int nr_bits) -{ - if (m->count < m->size) { - int len = bitmap_scnlistprintf(m->buf + m->count, - m->size - m->count, bits, nr_bits); - if (m->count + len < m->size) { - m->count += len; - return 0; - } - } - seq_set_overflow(m); - return -1; -} -EXPORT_SYMBOL(seq_bitmap_list); - static void *single_start(struct seq_file *p, loff_t *pos) { return NULL + (*pos == 0); diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index 5e7f75a6d7d0..dbfbf4990005 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -52,16 +52,13 @@ * bitmap_bitremap(oldbit, old, new, nbits) newbit = map(old, new)(oldbit) * bitmap_onto(dst, orig, relmap, nbits) *dst = orig relative to relmap * bitmap_fold(dst, orig, sz, nbits) dst bits = orig bits mod sz - * bitmap_scnprintf(buf, len, src, nbits) Print bitmap src to buf * bitmap_parse(buf, buflen, dst, nbits) Parse bitmap dst from kernel buf * bitmap_parse_user(ubuf, ulen, dst, nbits) Parse bitmap dst from user buf - * bitmap_scnlistprintf(buf, len, src, nbits) Print bitmap src as list to buf * bitmap_parselist(buf, dst, nbits) Parse bitmap dst from kernel buf * bitmap_parselist_user(buf, dst, nbits) Parse bitmap dst from user buf * bitmap_find_free_region(bitmap, bits, order) Find and allocate bit region * bitmap_release_region(bitmap, pos, order) Free specified bit region * bitmap_allocate_region(bitmap, pos, order) Allocate specified bit region - * bitmap_print_to_pagebuf(list, buf, mask, nbits) Print bitmap src as list/hex */ /* @@ -147,14 +144,10 @@ bitmap_find_next_zero_area(unsigned long *map, align_mask, 0); } -extern int bitmap_scnprintf(char *buf, unsigned int len, - const unsigned long *src, int nbits); extern int __bitmap_parse(const char *buf, unsigned int buflen, int is_user, unsigned long *dst, int nbits); extern int bitmap_parse_user(const char __user *ubuf, unsigned int ulen, unsigned long *dst, int nbits); -extern int bitmap_scnlistprintf(char *buf, unsigned int len, - const unsigned long *src, int nbits); extern int bitmap_parselist(const char *buf, unsigned long *maskp, int nmaskbits); extern int 
bitmap_parselist_user(const char __user *ubuf, unsigned int ulen, diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index a9b3d00915a0..086549a665e2 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -546,21 +546,6 @@ static inline void cpumask_copy(struct cpumask *dstp, */ #define cpumask_of(cpu) (get_cpu_mask(cpu)) -/** - * cpumask_scnprintf - print a cpumask into a string as comma-separated hex - * @buf: the buffer to sprintf into - * @len: the length of the buffer - * @srcp: the cpumask to print - * - * If len is zero, returns zero. Otherwise returns the length of the - * (nul-terminated) @buf string. - */ -static inline int cpumask_scnprintf(char *buf, int len, - const struct cpumask *srcp) -{ - return bitmap_scnprintf(buf, len, cpumask_bits(srcp), nr_cpu_ids); -} - /** * cpumask_parse_user - extract a cpumask from a user string * @buf: the buffer to extract from @@ -590,22 +575,6 @@ static inline int cpumask_parselist_user(const char __user *buf, int len, nr_cpu_ids); } -/** - * cpulist_scnprintf - print a cpumask into a string as comma-separated list - * @buf: the buffer to sprintf into - * @len: the length of the buffer - * @srcp: the cpumask to print - * - * If len is zero, returns zero. Otherwise returns the length of the - * (nul-terminated) @buf string. - */ -static inline int cpulist_scnprintf(char *buf, int len, - const struct cpumask *srcp) -{ - return bitmap_scnlistprintf(buf, len, cpumask_bits(srcp), - nr_cpu_ids); -} - /** * cpumask_parse - extract a cpumask from from a string * @buf: the buffer to extract from diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h index 10f8e556ba07..6e85889cf9ab 100644 --- a/include/linux/nodemask.h +++ b/include/linux/nodemask.h @@ -8,14 +8,13 @@ * See detailed comments in the file linux/bitmap.h describing the * data type on which these nodemasks are based. * - * For details of nodemask_scnprintf() and nodemask_parse_user(), - * see bitmap_scnprintf() and bitmap_parse_user() in lib/bitmap.c. - * For details of nodelist_scnprintf() and nodelist_parse(), see - * bitmap_scnlistprintf() and bitmap_parselist(), also in bitmap.c. - * For details of node_remap(), see bitmap_bitremap in lib/bitmap.c. - * For details of nodes_remap(), see bitmap_remap in lib/bitmap.c. - * For details of nodes_onto(), see bitmap_onto in lib/bitmap.c. - * For details of nodes_fold(), see bitmap_fold in lib/bitmap.c. + * For details of nodemask_parse_user(), see bitmap_parse_user() in + * lib/bitmap.c. For details of nodelist_parse(), see bitmap_parselist(), + * also in bitmap.c. For details of node_remap(), see bitmap_bitremap in + * lib/bitmap.c. For details of nodes_remap(), see bitmap_remap in + * lib/bitmap.c. For details of nodes_onto(), see bitmap_onto in + * lib/bitmap.c. For details of nodes_fold(), see bitmap_fold in + * lib/bitmap.c. 
* * The available nodemask operations are: * @@ -52,9 +51,7 @@ * NODE_MASK_NONE Initializer - no bits set * unsigned long *nodes_addr(mask) Array of unsigned long's in mask * - * int nodemask_scnprintf(buf, len, mask) Format nodemask for printing * int nodemask_parse_user(ubuf, ulen, mask) Parse ascii string as nodemask - * int nodelist_scnprintf(buf, len, mask) Format nodemask as list for printing * int nodelist_parse(buf, map) Parse ascii string as nodelist * int node_remap(oldbit, old, new) newbit = map(old, new)(oldbit) * void nodes_remap(dst, src, old, new) *dst = map(old, new)(src) @@ -312,14 +309,6 @@ static inline int __first_unset_node(const nodemask_t *maskp) #define nodes_addr(src) ((src).bits) -#define nodemask_scnprintf(buf, len, src) \ - __nodemask_scnprintf((buf), (len), &(src), MAX_NUMNODES) -static inline int __nodemask_scnprintf(char *buf, int len, - const nodemask_t *srcp, int nbits) -{ - return bitmap_scnprintf(buf, len, srcp->bits, nbits); -} - #define nodemask_parse_user(ubuf, ulen, dst) \ __nodemask_parse_user((ubuf), (ulen), &(dst), MAX_NUMNODES) static inline int __nodemask_parse_user(const char __user *buf, int len, @@ -328,14 +317,6 @@ static inline int __nodemask_parse_user(const char __user *buf, int len, return bitmap_parse_user(buf, len, dstp->bits, nbits); } -#define nodelist_scnprintf(buf, len, src) \ - __nodelist_scnprintf((buf), (len), &(src), MAX_NUMNODES) -static inline int __nodelist_scnprintf(char *buf, int len, - const nodemask_t *srcp, int nbits) -{ - return bitmap_scnlistprintf(buf, len, srcp->bits, nbits); -} - #define nodelist_parse(buf, dst) __nodelist_parse((buf), &(dst), MAX_NUMNODES) static inline int __nodelist_parse(const char *buf, nodemask_t *dstp, int nbits) { diff --git a/include/linux/seq_buf.h b/include/linux/seq_buf.h index 9aafe0e24c68..fb7eb9ccb1cd 100644 --- a/include/linux/seq_buf.h +++ b/include/linux/seq_buf.h @@ -125,9 +125,6 @@ extern int seq_buf_putmem_hex(struct seq_buf *s, const void *mem, unsigned int len); extern int seq_buf_path(struct seq_buf *s, const struct path *path, const char *esc); -extern int seq_buf_bitmask(struct seq_buf *s, const unsigned long *maskp, - int nmaskbits); - #ifdef CONFIG_BINARY_PRINTF extern int seq_buf_bprintf(struct seq_buf *s, const char *fmt, const u32 *binary); diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h index cf6a9daaaf6d..afbb1fd77c77 100644 --- a/include/linux/seq_file.h +++ b/include/linux/seq_file.h @@ -126,31 +126,6 @@ int seq_path(struct seq_file *, const struct path *, const char *); int seq_dentry(struct seq_file *, struct dentry *, const char *); int seq_path_root(struct seq_file *m, const struct path *path, const struct path *root, const char *esc); -int seq_bitmap(struct seq_file *m, const unsigned long *bits, - unsigned int nr_bits); -static inline int seq_cpumask(struct seq_file *m, const struct cpumask *mask) -{ - return seq_bitmap(m, cpumask_bits(mask), nr_cpu_ids); -} - -static inline int seq_nodemask(struct seq_file *m, nodemask_t *mask) -{ - return seq_bitmap(m, mask->bits, MAX_NUMNODES); -} - -int seq_bitmap_list(struct seq_file *m, const unsigned long *bits, - unsigned int nr_bits); - -static inline int seq_cpumask_list(struct seq_file *m, - const struct cpumask *mask) -{ - return seq_bitmap_list(m, cpumask_bits(mask), nr_cpu_ids); -} - -static inline int seq_nodemask_list(struct seq_file *m, nodemask_t *mask) -{ - return seq_bitmap_list(m, mask->bits, MAX_NUMNODES); -} int single_open(struct file *, int (*)(struct seq_file *, void *), void *); int 
single_open_size(struct file *, int (*)(struct seq_file *, void *), void *, size_t); diff --git a/lib/bitmap.c b/lib/bitmap.c index 088adbdcbad9..d456f4c15a9f 100644 --- a/lib/bitmap.c +++ b/lib/bitmap.c @@ -369,24 +369,6 @@ EXPORT_SYMBOL(bitmap_find_next_zero_area_off); #define nbits_to_hold_value(val) fls(val) #define BASEDEC 10 /* fancier cpuset lists input in decimal */ -/** - * bitmap_scnprintf - convert bitmap to an ASCII hex string. - * @buf: byte buffer into which string is placed - * @buflen: reserved size of @buf, in bytes - * @maskp: pointer to bitmap to convert - * @nmaskbits: size of bitmap, in bits - * - * Exactly @nmaskbits bits are displayed. Hex digits are grouped into - * comma-separated sets of eight digits per set. Returns the number of - * characters which were written to *buf, excluding the trailing \0. - */ -int bitmap_scnprintf(char *buf, unsigned int buflen, - const unsigned long *maskp, int nmaskbits) -{ - return scnprintf(buf, buflen, "%*pb", nmaskbits, maskp); -} -EXPORT_SYMBOL(bitmap_scnprintf); - /** * __bitmap_parse - convert an ASCII hex string into a bitmap. * @buf: pointer to buffer containing string. @@ -500,29 +482,6 @@ int bitmap_parse_user(const char __user *ubuf, } EXPORT_SYMBOL(bitmap_parse_user); -/** - * bitmap_scnlistprintf - convert bitmap to list format ASCII string - * @buf: byte buffer into which string is placed - * @buflen: reserved size of @buf, in bytes - * @maskp: pointer to bitmap to convert - * @nmaskbits: size of bitmap, in bits - * - * Output format is a comma-separated list of decimal numbers and - * ranges. Consecutively set bits are shown as two hyphen-separated - * decimal numbers, the smallest and largest bit numbers set in - * the range. Output format is compatible with the format - * accepted as input by bitmap_parselist(). - * - * The return value is the number of characters which were written to *buf - * excluding the trailing '\0', as per ISO C99's scnprintf. - */ -int bitmap_scnlistprintf(char *buf, unsigned int buflen, - const unsigned long *maskp, int nmaskbits) -{ - return scnprintf(buf, buflen, "%*pbl", nmaskbits, maskp); -} -EXPORT_SYMBOL(bitmap_scnlistprintf); - /** * bitmap_print_to_pagebuf - convert bitmap to list or hex format ASCII string * @list: indicates whether the bitmap must be list diff --git a/lib/seq_buf.c b/lib/seq_buf.c index 4eedfedb9e31..88c0854bd752 100644 --- a/lib/seq_buf.c +++ b/lib/seq_buf.c @@ -91,42 +91,6 @@ int seq_buf_printf(struct seq_buf *s, const char *fmt, ...) return ret; } -/** - * seq_buf_bitmask - write a bitmask array in its ASCII representation - * @s: seq_buf descriptor - * @maskp: points to an array of unsigned longs that represent a bitmask - * @nmaskbits: The number of bits that are valid in @maskp - * - * Writes a ASCII representation of a bitmask string into @s. - * - * Returns zero on success, -1 on overflow. - */ -int seq_buf_bitmask(struct seq_buf *s, const unsigned long *maskp, - int nmaskbits) -{ - unsigned int len = seq_buf_buffer_left(s); - int ret; - - WARN_ON(s->size == 0); - - /* - * Note, because bitmap_scnprintf() only returns the number of bytes - * written and not the number that would be written, we use the last - * byte of the buffer to let us know if we overflowed. There's a small - * chance that the bitmap could have fit exactly inside the buffer, but - * it's not that critical if that does happen. 
- */ - if (len > 1) { - ret = bitmap_scnprintf(s->buffer + s->len, len, maskp, nmaskbits); - if (ret < len) { - s->len += ret; - return 0; - } - } - seq_buf_set_overflow(s); - return -1; -} - #ifdef CONFIG_BINARY_PRINTF /** * seq_buf_bprintf - Write the printf string from binary arguments -- cgit v1.2.3 From 3770abd60d3555a6e97f5e2c56bd30c14042995e Mon Sep 17 00:00:00 2001 From: Sebastian Capella Date: Fri, 30 Jan 2015 13:12:28 +1100 Subject: mm/util.c: add kstrimdup() kstrimdup() creates a whitespace-trimmed duplicate of the passed in null-terminated string. This is useful for strings coming from sysfs that often include trailing whitespace due to user input. Thanks to Joe Perches for this implementation. Signed-off-by: Sebastian Capella Cc: Joe Perches Acked-by: David Rientjes Signed-off-by: Andrew Morton --- include/linux/string.h | 1 + mm/util.c | 30 ++++++++++++++++++++++++++++++ 2 files changed, 31 insertions(+) (limited to 'include') diff --git a/include/linux/string.h b/include/linux/string.h index e40099e585c9..12a5a60f1f3b 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -117,6 +117,7 @@ extern void kfree_const(const void *x); extern char *kstrdup(const char *s, gfp_t gfp); extern const char *kstrdup_const(const char *s, gfp_t gfp); extern char *kstrndup(const char *s, size_t len, gfp_t gfp); +extern char *kstrimdup(const char *s, gfp_t gfp); extern void *kmemdup(const void *src, size_t len, gfp_t gfp); extern char **argv_split(gfp_t gfp, const char *str, int *argcp); diff --git a/mm/util.c b/mm/util.c index 3981ae9d1b15..d68339206100 100644 --- a/mm/util.c +++ b/mm/util.c @@ -3,6 +3,7 @@ #include #include #include +#include #include #include #include @@ -99,6 +100,35 @@ char *kstrndup(const char *s, size_t max, gfp_t gfp) } EXPORT_SYMBOL(kstrndup); +/** + * kstrimdup - Trim and copy a %NUL terminated string. + * @s: the string to trim and duplicate + * @gfp: the GFP mask used in the kmalloc() call when allocating memory + * + * Returns an address, which the caller must kfree, containing + * a duplicate of the passed string with leading and/or trailing + * whitespace (as defined by isspace) removed. + */ +char *kstrimdup(const char *s, gfp_t gfp) +{ + char *buf; + char *begin = skip_spaces(s); + size_t len = strlen(begin); + + while (len && isspace(begin[len - 1])) + len--; + + buf = kmalloc_track_caller(len + 1, gfp); + if (!buf) + return NULL; + + memcpy(buf, begin, len); + buf[len] = '\0'; + + return buf; +} +EXPORT_SYMBOL(kstrimdup); + /** * kmemdup - duplicate region of memory * -- cgit v1.2.3 From 5fb9e153154d04015452efba1982f3c7a15f30b2 Mon Sep 17 00:00:00 2001 From: Marian Chereji Date: Fri, 30 Jan 2015 13:12:29 +1100 Subject: lib: Add CRC64 ECMA module Add implementation of CRC64 ECMA checksum. We have an IP Acceleration driver for Freescale network processors which is using this CRC64. However, it still needs some work in order for it to become upstreamable. 
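[Until that driver lands, a minimal usage sketch of the new interface; the frame-checksum wrapper below is hypothetical, and a real user would select CRC64_ECMA from its Kconfig entry as the help text suggests.]

    #include <linux/types.h>
    #include <linux/crc64_ecma.h>

    /* Hypothetical wrapper: checksum one buffer starting from the default seed. */
    static u64 checksum_frame(const u8 *data, u32 len)
    {
            return crc64_ecma(data, len, crc64_ecma_seed());
    }

Because the seed argument is simply the running CRC value, split buffers can be chained by passing the previous return value back in as the seed of the next call.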
Signed-off-by: Marian Chereji Reviewed-by: Varvara Andrei-B21317 Reviewed-by: Fleming Andrew-AFLEMING Signed-off-by: Andrew Morton --- include/linux/crc64_ecma.h | 56 ++++++++ lib/Kconfig | 7 + lib/Makefile | 1 + lib/crc64_ecma.c | 341 +++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 405 insertions(+) create mode 100644 include/linux/crc64_ecma.h create mode 100644 lib/crc64_ecma.c (limited to 'include') diff --git a/include/linux/crc64_ecma.h b/include/linux/crc64_ecma.h new file mode 100644 index 000000000000..bba7a4d692b3 --- /dev/null +++ b/include/linux/crc64_ecma.h @@ -0,0 +1,56 @@ +/* + * Copyright 2013 Freescale Semiconductor Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Freescale Semiconductor nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * + * ALTERNATIVELY, this software may be distributed under the terms of the + * GNU General Public License ("GPL") as published by the Free Software + * Foundation, either version 2 of that License or (at your option) any + * later version. + * + * THIS SOFTWARE IS PROVIDED BY Freescale Semiconductor ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL Freescale Semiconductor BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __CRC64_ECMA_H_ +#define __CRC64_ECMA_H_ + +#include + + +#define CRC64_DEFAULT_INITVAL 0xFFFFFFFFFFFFFFFFULL + + +/* + * crc64_ecma_seed - Initializes the CRC64 ECMA seed. + */ +u64 crc64_ecma_seed(void); + +/* + * crc64_ecma - Computes the 64 bit ECMA CRC. + * + * @pdata: pointer to the data to compute checksum for. + * @nbytes: number of bytes in data buffer. + * @seed: CRC seed. + */ +u64 crc64_ecma(u8 const *pdata, u32 nbytes, u64 seed); + +#endif /* __CRC64_ECMA_H_ */ diff --git a/lib/Kconfig b/lib/Kconfig index 54cf309a92a5..2faf7b2de5b3 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -180,6 +180,13 @@ config CRC8 when they need to do cyclic redundancy check according CRC8 algorithm. Module will be called crc8. +config CRC64_ECMA + tristate "CRC64 ECMA function" + help + This option provides CRC64 ECMA function. Drivers may select this + when they need to do cyclic redundancy check according to the CRC64 + ECMA algorithm. 
+ config AUDIT_GENERIC bool depends on AUDIT && !AUDIT_ARCH diff --git a/lib/Makefile b/lib/Makefile index 2ca019ec931c..3dca86fa7dc9 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -74,6 +74,7 @@ obj-$(CONFIG_CRC32) += crc32.o obj-$(CONFIG_CRC7) += crc7.o obj-$(CONFIG_LIBCRC32C) += libcrc32c.o obj-$(CONFIG_CRC8) += crc8.o +obj-$(CONFIG_CRC64_ECMA) += crc64_ecma.o obj-$(CONFIG_GENERIC_ALLOCATOR) += genalloc.o obj-$(CONFIG_ZLIB_INFLATE) += zlib_inflate/ diff --git a/lib/crc64_ecma.c b/lib/crc64_ecma.c new file mode 100644 index 000000000000..41629ea5a60c --- /dev/null +++ b/lib/crc64_ecma.c @@ -0,0 +1,341 @@ +/* + * Copyright 2013 Freescale Semiconductor Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Freescale Semiconductor nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * + * ALTERNATIVELY, this software may be distributed under the terms of the + * GNU General Public License ("GPL") as published by the Free Software + * Foundation, either version 2 of that License or (at your option) any + * later version. + * + * THIS SOFTWARE IS PROVIDED BY Freescale Semiconductor ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL Freescale Semiconductor BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include +#include + + +#define CRC64_BYTE_MASK 0xFF +#define CRC64_TABLE_SIZE 256 + + +struct crc64_table { + u64 seed; + u64 table[CRC64_TABLE_SIZE]; +}; + + +static struct crc64_table CRC64_ECMA_182 = { + CRC64_DEFAULT_INITVAL, + { + 0x0000000000000000ULL, + 0xb32e4cbe03a75f6fULL, + 0xf4843657a840a05bULL, + 0x47aa7ae9abe7ff34ULL, + 0x7bd0c384ff8f5e33ULL, + 0xc8fe8f3afc28015cULL, + 0x8f54f5d357cffe68ULL, + 0x3c7ab96d5468a107ULL, + 0xf7a18709ff1ebc66ULL, + 0x448fcbb7fcb9e309ULL, + 0x0325b15e575e1c3dULL, + 0xb00bfde054f94352ULL, + 0x8c71448d0091e255ULL, + 0x3f5f08330336bd3aULL, + 0x78f572daa8d1420eULL, + 0xcbdb3e64ab761d61ULL, + 0x7d9ba13851336649ULL, + 0xceb5ed8652943926ULL, + 0x891f976ff973c612ULL, + 0x3a31dbd1fad4997dULL, + 0x064b62bcaebc387aULL, + 0xb5652e02ad1b6715ULL, + 0xf2cf54eb06fc9821ULL, + 0x41e11855055bc74eULL, + 0x8a3a2631ae2dda2fULL, + 0x39146a8fad8a8540ULL, + 0x7ebe1066066d7a74ULL, + 0xcd905cd805ca251bULL, + 0xf1eae5b551a2841cULL, + 0x42c4a90b5205db73ULL, + 0x056ed3e2f9e22447ULL, + 0xb6409f5cfa457b28ULL, + 0xfb374270a266cc92ULL, + 0x48190ecea1c193fdULL, + 0x0fb374270a266cc9ULL, + 0xbc9d3899098133a6ULL, + 0x80e781f45de992a1ULL, + 0x33c9cd4a5e4ecdceULL, + 0x7463b7a3f5a932faULL, + 0xc74dfb1df60e6d95ULL, + 0x0c96c5795d7870f4ULL, + 0xbfb889c75edf2f9bULL, + 0xf812f32ef538d0afULL, + 0x4b3cbf90f69f8fc0ULL, + 0x774606fda2f72ec7ULL, + 0xc4684a43a15071a8ULL, + 0x83c230aa0ab78e9cULL, + 0x30ec7c140910d1f3ULL, + 0x86ace348f355aadbULL, + 0x3582aff6f0f2f5b4ULL, + 0x7228d51f5b150a80ULL, + 0xc10699a158b255efULL, + 0xfd7c20cc0cdaf4e8ULL, + 0x4e526c720f7dab87ULL, + 0x09f8169ba49a54b3ULL, + 0xbad65a25a73d0bdcULL, + 0x710d64410c4b16bdULL, + 0xc22328ff0fec49d2ULL, + 0x85895216a40bb6e6ULL, + 0x36a71ea8a7ace989ULL, + 0x0adda7c5f3c4488eULL, + 0xb9f3eb7bf06317e1ULL, + 0xfe5991925b84e8d5ULL, + 0x4d77dd2c5823b7baULL, + 0x64b62bcaebc387a1ULL, + 0xd7986774e864d8ceULL, + 0x90321d9d438327faULL, + 0x231c512340247895ULL, + 0x1f66e84e144cd992ULL, + 0xac48a4f017eb86fdULL, + 0xebe2de19bc0c79c9ULL, + 0x58cc92a7bfab26a6ULL, + 0x9317acc314dd3bc7ULL, + 0x2039e07d177a64a8ULL, + 0x67939a94bc9d9b9cULL, + 0xd4bdd62abf3ac4f3ULL, + 0xe8c76f47eb5265f4ULL, + 0x5be923f9e8f53a9bULL, + 0x1c4359104312c5afULL, + 0xaf6d15ae40b59ac0ULL, + 0x192d8af2baf0e1e8ULL, + 0xaa03c64cb957be87ULL, + 0xeda9bca512b041b3ULL, + 0x5e87f01b11171edcULL, + 0x62fd4976457fbfdbULL, + 0xd1d305c846d8e0b4ULL, + 0x96797f21ed3f1f80ULL, + 0x2557339fee9840efULL, + 0xee8c0dfb45ee5d8eULL, + 0x5da24145464902e1ULL, + 0x1a083bacedaefdd5ULL, + 0xa9267712ee09a2baULL, + 0x955cce7fba6103bdULL, + 0x267282c1b9c65cd2ULL, + 0x61d8f8281221a3e6ULL, + 0xd2f6b4961186fc89ULL, + 0x9f8169ba49a54b33ULL, + 0x2caf25044a02145cULL, + 0x6b055fede1e5eb68ULL, + 0xd82b1353e242b407ULL, + 0xe451aa3eb62a1500ULL, + 0x577fe680b58d4a6fULL, + 0x10d59c691e6ab55bULL, + 0xa3fbd0d71dcdea34ULL, + 0x6820eeb3b6bbf755ULL, + 0xdb0ea20db51ca83aULL, + 0x9ca4d8e41efb570eULL, + 0x2f8a945a1d5c0861ULL, + 0x13f02d374934a966ULL, + 0xa0de61894a93f609ULL, + 0xe7741b60e174093dULL, + 0x545a57dee2d35652ULL, + 0xe21ac88218962d7aULL, + 0x5134843c1b317215ULL, + 0x169efed5b0d68d21ULL, + 0xa5b0b26bb371d24eULL, + 0x99ca0b06e7197349ULL, + 0x2ae447b8e4be2c26ULL, + 0x6d4e3d514f59d312ULL, + 0xde6071ef4cfe8c7dULL, + 0x15bb4f8be788911cULL, + 0xa6950335e42fce73ULL, + 0xe13f79dc4fc83147ULL, + 0x521135624c6f6e28ULL, + 0x6e6b8c0f1807cf2fULL, + 0xdd45c0b11ba09040ULL, + 0x9aefba58b0476f74ULL, + 0x29c1f6e6b3e0301bULL, + 0xc96c5795d7870f42ULL, + 0x7a421b2bd420502dULL, + 0x3de861c27fc7af19ULL, + 0x8ec62d7c7c60f076ULL, + 
0xb2bc941128085171ULL, + 0x0192d8af2baf0e1eULL, + 0x4638a2468048f12aULL, + 0xf516eef883efae45ULL, + 0x3ecdd09c2899b324ULL, + 0x8de39c222b3eec4bULL, + 0xca49e6cb80d9137fULL, + 0x7967aa75837e4c10ULL, + 0x451d1318d716ed17ULL, + 0xf6335fa6d4b1b278ULL, + 0xb199254f7f564d4cULL, + 0x02b769f17cf11223ULL, + 0xb4f7f6ad86b4690bULL, + 0x07d9ba1385133664ULL, + 0x4073c0fa2ef4c950ULL, + 0xf35d8c442d53963fULL, + 0xcf273529793b3738ULL, + 0x7c0979977a9c6857ULL, + 0x3ba3037ed17b9763ULL, + 0x888d4fc0d2dcc80cULL, + 0x435671a479aad56dULL, + 0xf0783d1a7a0d8a02ULL, + 0xb7d247f3d1ea7536ULL, + 0x04fc0b4dd24d2a59ULL, + 0x3886b22086258b5eULL, + 0x8ba8fe9e8582d431ULL, + 0xcc0284772e652b05ULL, + 0x7f2cc8c92dc2746aULL, + 0x325b15e575e1c3d0ULL, + 0x8175595b76469cbfULL, + 0xc6df23b2dda1638bULL, + 0x75f16f0cde063ce4ULL, + 0x498bd6618a6e9de3ULL, + 0xfaa59adf89c9c28cULL, + 0xbd0fe036222e3db8ULL, + 0x0e21ac88218962d7ULL, + 0xc5fa92ec8aff7fb6ULL, + 0x76d4de52895820d9ULL, + 0x317ea4bb22bfdfedULL, + 0x8250e80521188082ULL, + 0xbe2a516875702185ULL, + 0x0d041dd676d77eeaULL, + 0x4aae673fdd3081deULL, + 0xf9802b81de97deb1ULL, + 0x4fc0b4dd24d2a599ULL, + 0xfceef8632775faf6ULL, + 0xbb44828a8c9205c2ULL, + 0x086ace348f355aadULL, + 0x34107759db5dfbaaULL, + 0x873e3be7d8faa4c5ULL, + 0xc094410e731d5bf1ULL, + 0x73ba0db070ba049eULL, + 0xb86133d4dbcc19ffULL, + 0x0b4f7f6ad86b4690ULL, + 0x4ce50583738cb9a4ULL, + 0xffcb493d702be6cbULL, + 0xc3b1f050244347ccULL, + 0x709fbcee27e418a3ULL, + 0x3735c6078c03e797ULL, + 0x841b8ab98fa4b8f8ULL, + 0xadda7c5f3c4488e3ULL, + 0x1ef430e13fe3d78cULL, + 0x595e4a08940428b8ULL, + 0xea7006b697a377d7ULL, + 0xd60abfdbc3cbd6d0ULL, + 0x6524f365c06c89bfULL, + 0x228e898c6b8b768bULL, + 0x91a0c532682c29e4ULL, + 0x5a7bfb56c35a3485ULL, + 0xe955b7e8c0fd6beaULL, + 0xaeffcd016b1a94deULL, + 0x1dd181bf68bdcbb1ULL, + 0x21ab38d23cd56ab6ULL, + 0x9285746c3f7235d9ULL, + 0xd52f0e859495caedULL, + 0x6601423b97329582ULL, + 0xd041dd676d77eeaaULL, + 0x636f91d96ed0b1c5ULL, + 0x24c5eb30c5374ef1ULL, + 0x97eba78ec690119eULL, + 0xab911ee392f8b099ULL, + 0x18bf525d915feff6ULL, + 0x5f1528b43ab810c2ULL, + 0xec3b640a391f4fadULL, + 0x27e05a6e926952ccULL, + 0x94ce16d091ce0da3ULL, + 0xd3646c393a29f297ULL, + 0x604a2087398eadf8ULL, + 0x5c3099ea6de60cffULL, + 0xef1ed5546e415390ULL, + 0xa8b4afbdc5a6aca4ULL, + 0x1b9ae303c601f3cbULL, + 0x56ed3e2f9e224471ULL, + 0xe5c372919d851b1eULL, + 0xa26908783662e42aULL, + 0x114744c635c5bb45ULL, + 0x2d3dfdab61ad1a42ULL, + 0x9e13b115620a452dULL, + 0xd9b9cbfcc9edba19ULL, + 0x6a978742ca4ae576ULL, + 0xa14cb926613cf817ULL, + 0x1262f598629ba778ULL, + 0x55c88f71c97c584cULL, + 0xe6e6c3cfcadb0723ULL, + 0xda9c7aa29eb3a624ULL, + 0x69b2361c9d14f94bULL, + 0x2e184cf536f3067fULL, + 0x9d36004b35545910ULL, + 0x2b769f17cf112238ULL, + 0x9858d3a9ccb67d57ULL, + 0xdff2a94067518263ULL, + 0x6cdce5fe64f6dd0cULL, + 0x50a65c93309e7c0bULL, + 0xe388102d33392364ULL, + 0xa4226ac498dedc50ULL, + 0x170c267a9b79833fULL, + 0xdcd7181e300f9e5eULL, + 0x6ff954a033a8c131ULL, + 0x28532e49984f3e05ULL, + 0x9b7d62f79be8616aULL, + 0xa707db9acf80c06dULL, + 0x14299724cc279f02ULL, + 0x5383edcd67c06036ULL, + 0xe0ada17364673f59ULL + } +}; + + +/* + * crc64_ecma_seed - Initializes the CRC64 ECMA seed. + */ +u64 crc64_ecma_seed(void) +{ + return CRC64_ECMA_182.seed; +} +EXPORT_SYMBOL(crc64_ecma_seed); + +/* + * crc64_ecma - Computes the 64 bit ECMA CRC. + * + * pdata: pointer to the data to compute checksum for. + * nbytes: number of bytes in data buffer. + * seed: CRC seed. 
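+ *
+ * Usage sketch (editorial addition, not part of the original submission):
+ *
+ *	u64 crc = crc64_ecma(buf, buf_len, crc64_ecma_seed());
+ *
+ * The returned value may be passed back in as the seed to continue the
+ * checksum over a further buffer.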
+ */ +u64 crc64_ecma(u8 const *pdata, u32 nbytes, u64 seed) +{ + unsigned int i; + u64 crc = seed; + + for (i = 0; i < nbytes; i++) + crc = CRC64_ECMA_182.table[(crc ^ pdata[i]) & CRC64_BYTE_MASK] ^ + (crc >> 8); + + return crc; +} +EXPORT_SYMBOL(crc64_ecma); + +MODULE_DESCRIPTION("CRC64 ECMA function"); +MODULE_AUTHOR("Freescale Semiconductor Inc."); +MODULE_LICENSE("GPL"); -- cgit v1.2.3 From db5ded059da6ee1d47748272d4dae4fbebc069a5 Mon Sep 17 00:00:00 2001 From: Matthew Garrett Date: Fri, 30 Jan 2015 13:12:35 +1100 Subject: rtc: restore alarm after resume Some platform firmware may interfere with the RTC alarm over suspend, resulting in the kernel and hardware having different ideas about system state but also potentially causing problems with firmware that assumes the OS will clean this case up. This patch saves the RTC alarm state on suspend and will restore it on resume if the alarm has not yet fired - if it has, it will clear the RTC alarm. The case we've seen is Intel Rapid Start, which is a firmware-mediated feature that automatically transitions systems from suspend-to-RAM to suspend-to-disk without OS involvement. It does this by setting the RTC alarm and a flag that indicates that on wake it should perform the transition rather than re-starting the OS. However, if the OS has set a wakeup alarm that would wake the machine earlier, it refuses to overwrite it and allows the system to wake instead. This fails in the following situation: 1) User configures Intel Rapid Start to transition after (say) 15 minutes 2) User suspends to RAM. Firmware sets the wakeup alarm for 15 minutes in the future 3) User resumes after 5 minutes. Firmware does not reset the alarm, and as such it is still set for 10 minutes in the future 4) User suspends after 5 minutes. Firmware notices that the alarm is set for 5 minutes in the future, which is less than the 15 minute transition threshold. It therefore assumes that the user wants the machine to wake in 5 minutes 5) System resumes after 5 minutes The worst case scenario here is that the user may have put the system in a bag between (4) and (5), resulting in it running in a confined space and potentially overheating. This seems reasonably important. The Rapid Start support code got added in 3.11, but it can be configured in the firmware regardless of kernel support. 
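The decision the patch adds on the resume path is small enough to model in isolation: if the alarm that was pending at suspend time is now in the past, it either fired or was consumed by firmware, so clear it; otherwise re-program it so that any firmware meddling is undone. Below is a compilable userspace sketch of just that comparison; the struct and helper names are invented for illustration, while the real code in the diff that follows works on struct rtc_device via rtc_read_time(), rtc_tm_to_time() and the driver's ->set_alarm() hook.

#include <stdbool.h>
#include <stdio.h>
#include <time.h>

struct model_rtc {
	bool   valid_alarm;	/* an alarm was pending when we suspended */
	time_t alarm_time;	/* when it was due to fire */
	bool   alarm_enabled;
};

/* Mirror of the logic added to rtc_resume() in the diff below. */
static void model_resume(struct model_rtc *rtc, time_t now)
{
	if (!rtc->valid_alarm)
		return;				/* nothing to restore */

	if (rtc->alarm_time <= now) {
		/* Alarm came due while asleep: clear it. */
		rtc->alarm_time = 0;
		rtc->alarm_enabled = false;
	}
	/* Either way, write the saved state back to the hardware so the
	 * kernel and the platform firmware agree again. */
	printf("set_alarm(enabled=%d, time=%ld)\n",
	       rtc->alarm_enabled, (long)rtc->alarm_time);
}

int main(void)
{
	struct model_rtc rtc = {
		.valid_alarm = true, .alarm_time = 1000, .alarm_enabled = true
	};

	model_resume(&rtc, 900);	/* woke early: alarm re-armed as saved */
	model_resume(&rtc, 1100);	/* alarm already due: cleared */
	return 0;
}
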
Signed-off-by: Matthew Garrett Tested-by: Gabriele Mazzotta Cc: Alessandro Zummo Signed-off-by: Andrew Morton --- drivers/rtc/class.c | 24 ++++++++++++++++++++++++ include/linux/rtc.h | 4 ++++ 2 files changed, 28 insertions(+) (limited to 'include') diff --git a/drivers/rtc/class.c b/drivers/rtc/class.c index 472a5adc4642..c7e09e27ae5d 100644 --- a/drivers/rtc/class.c +++ b/drivers/rtc/class.c @@ -55,6 +55,8 @@ static int rtc_suspend(struct device *dev) struct timespec64 delta, delta_delta; int err; + rtc->valid_alarm = !rtc_read_alarm(rtc, &rtc->alarm); + if (has_persistent_clock()) return 0; @@ -102,6 +104,27 @@ static int rtc_resume(struct device *dev) struct timespec64 sleep_time; int err; + /* + * Ensure that the platform hasn't overwritten a pending alarm while + * suspended + */ + if (rtc->valid_alarm) { + long now, scheduled; + + rtc_read_time(rtc, &tm); + rtc_tm_to_time(&rtc->alarm.time, &scheduled); + rtc_tm_to_time(&tm, &now); + + /* Clear the alarm registers if it went off during suspend */ + if (scheduled <= now) { + rtc_time_to_tm(0, &rtc->alarm.time); + rtc->alarm.enabled = 0; + } + + if (rtc->ops && rtc->ops->set_alarm) + rtc->ops->set_alarm(rtc->dev.parent, &rtc->alarm); + } + if (has_persistent_clock()) return 0; @@ -145,6 +168,7 @@ static int rtc_resume(struct device *dev) if (sleep_time.tv_sec >= 0) timekeeping_inject_sleeptime64(&sleep_time); rtc_hctosys_ret = 0; + return 0; } diff --git a/include/linux/rtc.h b/include/linux/rtc.h index 6d6be09a2fe5..bc805fff00dc 100644 --- a/include/linux/rtc.h +++ b/include/linux/rtc.h @@ -133,6 +133,10 @@ struct rtc_device /* Some hardware can't support UIE mode */ int uie_unsupported; +#ifdef CONFIG_PM_SLEEP + struct rtc_wkalrm alarm; + bool valid_alarm; +#endif #ifdef CONFIG_RTC_INTF_DEV_UIE_EMUL struct work_struct uie_task; struct timer_list uie_timer; -- cgit v1.2.3 From 90885db8552c28df2f9d523bbfb127f492e94a86 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Fri, 30 Jan 2015 13:12:37 +1100 Subject: kexec: remove never used member destination in kimage struct kimage has a member destination which is used to store the real destination address of each page when load segment from user space buffer to kernel. But we never retrieve the value stored in kimage->destination, so this member variable in kimage and its assignment operation are redundent code. I guess for_each_kimage_entry just does the work that kimage->destination is expected to do. So in this patch just make a cleanup to remove it. Signed-off-by: Baoquan He Cc: "Eric W. 
Biederman" Cc: Vivek Goyal Signed-off-by: Andrew Morton --- include/linux/kexec.h | 2 -- kernel/kexec.c | 4 ---- 2 files changed, 6 deletions(-) (limited to 'include') diff --git a/include/linux/kexec.h b/include/linux/kexec.h index 9d957b7ae095..10da8e246317 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -122,8 +122,6 @@ struct kimage { kimage_entry_t *entry; kimage_entry_t *last_entry; - unsigned long destination; - unsigned long start; struct page *control_code_page; struct page *swap_page; diff --git a/kernel/kexec.c b/kernel/kexec.c index 9a8a01abbaed..7a897ac5a75c 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -856,8 +856,6 @@ static int kimage_set_destination(struct kimage *image, destination &= PAGE_MASK; result = kimage_add_entry(image, destination | IND_DESTINATION); - if (result == 0) - image->destination = destination; return result; } @@ -869,8 +867,6 @@ static int kimage_add_page(struct kimage *image, unsigned long page) page &= PAGE_MASK; result = kimage_add_entry(image, page | IND_SOURCE); - if (result == 0) - image->destination += PAGE_SIZE; return result; } -- cgit v1.2.3 From af6f986d8a195663a062145780cf0d599ee4c8da Mon Sep 17 00:00:00 2001 From: Geoff Levand Date: Fri, 30 Jan 2015 13:12:38 +1100 Subject: kexec: Fix make headers_check Remove the unneded declaration for a kexec_load() routine. Fixes errors like these when running 'make headers_check': include/uapi/linux/kexec.h: userspace cannot reference function or variable defined in the kernel Paul said: : The kexec_load declaration isn't very useful for userspace, see the patch : I submitted in http://lkml.kernel.org/r/1389791824.17407.9.camel@x220 . : And After my attempt the export of that declaration has also been : discussed in : http://lkml.kernel.org/r/115373b6ac68ee7a305975896e1c4971e8e51d4c.1408731991.git.geoff@infradead.org : : In that last discussion no one has been able to point to an actual user of : it. So, as far as I can tell, no one actually uses it. Which makes : sense, because including this header by itself doesn't give one access to : a useful definition of kexec_load. So why bother with the declaration? Signed-off-by: Geoff Levand Acked-by: Paul Bolle Cc: H. Peter Anvin Cc: Vivek Goyal Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Maximilian Attems Cc: Michal Marek Signed-off-by: Andrew Morton --- include/uapi/linux/kexec.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/kexec.h b/include/uapi/linux/kexec.h index 6925f5b42f89..99048e501b88 100644 --- a/include/uapi/linux/kexec.h +++ b/include/uapi/linux/kexec.h @@ -55,12 +55,6 @@ struct kexec_segment { size_t memsz; }; -/* Load a new kernel image as described by the kexec_segment array - * consisting of passed number of segments at the entry-point address. - * The flags allow different useage types. - */ -extern int kexec_load(void *, size_t, struct kexec_segment *, - unsigned long int); #endif /* __KERNEL__ */ #endif /* _UAPILINUX_KEXEC_H */ -- cgit v1.2.3 From e9b7d3d7ea50ee229ff6ffda5f406f80332b0e25 Mon Sep 17 00:00:00 2001 From: Geoff Levand Date: Fri, 30 Jan 2015 13:12:38 +1100 Subject: kexec: add bit definitions for kimage entry flags Define new kexec preprocessor macros IND_*_BIT that define the bit position of the kimage entry flags. Change the existing IND_* flag macros to be defined as bit shifts of the corresponding IND_*_BIT macros. 
Also wrap all C language code in kexec.h with #if !defined(__ASSEMBLY__) so assembly files can include kexec.h to get the IND_* and IND_*_BIT macros. Some CPU instruction sets have tests for bit position which are convenient in implementing routines that operate on the kimage entry list. The addition of these bit position macros in a common location will avoid duplicate definitions and the chance that changes to the IND_* flags will not be propagated to assembly files. Signed-off-by: Geoff Levand Acked-by: Vivek Goyal Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: H. Peter Anvin Cc: Maximilian Attems Cc: Michal Marek Cc: Paul Bolle Signed-off-by: Andrew Morton --- include/linux/kexec.h | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/kexec.h b/include/linux/kexec.h index 10da8e246317..1fd980cc481b 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -1,6 +1,18 @@ #ifndef LINUX_KEXEC_H #define LINUX_KEXEC_H +#define IND_DESTINATION_BIT 0 +#define IND_INDIRECTION_BIT 1 +#define IND_DONE_BIT 2 +#define IND_SOURCE_BIT 3 + +#define IND_DESTINATION (1 << IND_DESTINATION_BIT) +#define IND_INDIRECTION (1 << IND_INDIRECTION_BIT) +#define IND_DONE (1 << IND_DONE_BIT) +#define IND_SOURCE (1 << IND_SOURCE_BIT) + +#if !defined(__ASSEMBLY__) + #include #ifdef CONFIG_KEXEC @@ -64,10 +76,6 @@ */ typedef unsigned long kimage_entry_t; -#define IND_DESTINATION 0x1 -#define IND_INDIRECTION 0x2 -#define IND_DONE 0x4 -#define IND_SOURCE 0x8 struct kexec_segment { /* @@ -311,4 +319,7 @@ struct task_struct; static inline void crash_kexec(struct pt_regs *regs) { } static inline int kexec_should_crash(struct task_struct *p) { return 0; } #endif /* CONFIG_KEXEC */ + +#endif /* !defined(__ASSEBMLY__) */ + #endif /* LINUX_KEXEC_H */ -- cgit v1.2.3 From 1671a417cf8345aa32e6e8d850c5583979bf2e5d Mon Sep 17 00:00:00 2001 From: Geoff Levand Date: Fri, 30 Jan 2015 13:12:38 +1100 Subject: kexec: add IND_FLAGS macro Add a new kexec preprocessor macro IND_FLAGS, which is the bitwise OR of all the possible kexec IND_ kimage_entry indirection flags. Having this macro allows for simplified code in the prosessing of the kexec kimage_entry items. Also, remove the local powerpc definition and use the generic one. Signed-off-by: Geoff Levand Acked-by: Benjamin Herrenschmidt Acked-by: Vivek Goyal Cc: Arnd Bergmann Cc: Maximilian Attems Cc: Michal Marek Cc: H. 
Peter Anvin Cc: Paul Bolle Signed-off-by: Andrew Morton --- arch/powerpc/kernel/machine_kexec_64.c | 2 -- include/linux/kexec.h | 1 + 2 files changed, 1 insertion(+), 2 deletions(-) (limited to 'include') diff --git a/arch/powerpc/kernel/machine_kexec_64.c b/arch/powerpc/kernel/machine_kexec_64.c index f96d1ec24189..1a74446fd9e5 100644 --- a/arch/powerpc/kernel/machine_kexec_64.c +++ b/arch/powerpc/kernel/machine_kexec_64.c @@ -96,8 +96,6 @@ int default_machine_kexec_prepare(struct kimage *image) return 0; } -#define IND_FLAGS (IND_DESTINATION | IND_INDIRECTION | IND_DONE | IND_SOURCE) - static void copy_segments(unsigned long ind) { unsigned long entry; diff --git a/include/linux/kexec.h b/include/linux/kexec.h index 1fd980cc481b..e60a745ac198 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -10,6 +10,7 @@ #define IND_INDIRECTION (1 << IND_INDIRECTION_BIT) #define IND_DONE (1 << IND_DONE_BIT) #define IND_SOURCE (1 << IND_SOURCE_BIT) +#define IND_FLAGS (IND_DESTINATION | IND_INDIRECTION | IND_DONE | IND_SOURCE) #if !defined(__ASSEMBLY__) -- cgit v1.2.3 From 0ac6cd004514d47cc8fca56e9ea8015aa4d9b9d0 Mon Sep 17 00:00:00 2001 From: HATAYAMA Daisuke Date: Fri, 30 Jan 2015 13:12:39 +1100 Subject: kdump, vmcoreinfo: report actual value of phys_base Currently, VMCOREINFO note information reports the virtual address of phys_base that is assigned to symbol phys_base. But this doesn't make sense because to refer to phys_base, it's necessary to get the value of phys_base itself we are now about to refer to. Userland tools related to kdump such as makedumpfile and crash utility so far have made some efforts to calculate phys_base on crash dump formats generated by mechanisms running outside Linux kernel, such as virtual machine hypervisor such as qemu dump, which ordinary users use via virsh dump, or ones implemented on vendor specific firmware. That is, find a kernel data whose virtual and physical addresses are available via its note information and calculate phys_base from it. However, such data structure is not the one prepared for phys_base purpose. There's no guarantee that other crash dump mechanisms include such information that can be used to calculate phys_base similarly. To get VMCOREINFO in vmcore, it's easy to use strings and grep commands like this; VMCOREINFO consists of simple string: $ strings vmcore-3.10.0-121.el7.x86_64 | grep -E ".*VMCOREINFO.*" -A 100 VMCOREINFO OSRELEASE=3.10.0-121.el7.x86_64 PAGESIZE=4096 ... This is also useful to get value of phys_base in kdump 2nd kernel contained in vmcore using the above-mentioned external crash dump mechanism; kdump 2nd kernel is an inherently relocated kernel. This commit doesn't remove VMCOREINFO_SYMBOL(phys_base) line because makedumpfile refers to it and if removing it, old versions makedumpfile doesn't work well. Signed-off-by: HATAYAMA Daisuke Cc: Eric W. 
Biederman Cc: Vivek Goyal Cc: Atsushi Kumagai Cc: Dave Anderson Signed-off-by: Andrew Morton --- arch/x86/kernel/machine_kexec_64.c | 1 + include/linux/kexec.h | 2 ++ 2 files changed, 3 insertions(+) (limited to 'include') diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index 415480d3ea84..ab48c252e661 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -335,6 +335,7 @@ void arch_crash_save_vmcoreinfo(void) #endif vmcoreinfo_append_str("KERNELOFFSET=%lx\n", (unsigned long)&_text - __START_KERNEL); + VMCOREINFO_PHYS_BASE(phys_base); } /* arch-dependent functionality related to kexec file-based syscall */ diff --git a/include/linux/kexec.h b/include/linux/kexec.h index e60a745ac198..87e37264d464 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -265,6 +265,8 @@ unsigned long paddr_vmcoreinfo_note(void); vmcoreinfo_append_str("NUMBER(%s)=%ld\n", #name, (long)name) #define VMCOREINFO_CONFIG(name) \ vmcoreinfo_append_str("CONFIG_%s=y\n", #name) +#define VMCOREINFO_PHYS_BASE(value) \ + vmcoreinfo_append_str("PHYS_BASE=%lx\n", (unsigned long)value) extern struct kimage *kexec_image; extern struct kimage *kexec_crash_image; -- cgit v1.2.3 From 27a419fd0e0a2297607450871e2fdc5e7f29ad3a Mon Sep 17 00:00:00 2001 From: John de la Garza Date: Fri, 30 Jan 2015 13:12:39 +1100 Subject: lib/rbtree.c: fix typo in comment Signed-off-by: John de la Garza Signed-off-by: Andrew Morton --- include/linux/rbtree.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/rbtree.h b/include/linux/rbtree.h index 57e75ae9910f..fb31765e935a 100644 --- a/include/linux/rbtree.h +++ b/include/linux/rbtree.h @@ -51,7 +51,7 @@ struct rb_root { #define RB_EMPTY_ROOT(root) ((root)->rb_node == NULL) -/* 'empty' nodes are nodes that are known not to be inserted in an rbree */ +/* 'empty' nodes are nodes that are known not to be inserted in an rbtree */ #define RB_EMPTY_NODE(node) \ ((node)->__rb_parent_color == (unsigned long)(node)) #define RB_CLEAR_NODE(node) \ -- cgit v1.2.3 From 78ccbbfb50bf6ad3415760fec43c5808ac9b3b26 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 30 Jan 2015 13:46:54 -0800 Subject: mm: remove remaining references to NUMA hinting bits and helpers -fix The patch "mm: remove remaining references to NUMA hinting bits and helpers" was meant to remove NUMA hinting bits and helpers but broke is_swap_pte. >From the original bug report For at least the past couple of days tests of libhugetlbfs have been hanging on mustang in the mlock test running ARMv8 defconfig with both 32 bit and 64 bit userspace - after the mprotect test (the one before it) we get no console output for several hours so it appears that the test has deadlocked. 
The tail of the log is: | find_path (2M: 64): PASS | unlinked_fd (2M: 64): PASS | readback (2M: 64): PASS | truncate (2M: 64): PASS | shared (2M: 64): PASS | mprotect (2M: 64): PASS This patch should address the problem and is a fix to the mmotm patch mm-remove-remaining-references-to-numa-hinting-bits-and-helpers.patch Signed-off-by: Mel Gorman Reported-by: Mark Brown Cc: Stephen Rothwell Signed-off-by: Andrew Morton --- include/linux/swapops.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/swapops.h b/include/linux/swapops.h index 73ca28070a92..cedf3d3c373f 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -54,7 +54,7 @@ static inline pgoff_t swp_offset(swp_entry_t entry) /* check whether a pte points to a swap entry */ static inline int is_swap_pte(pte_t pte) { - return !pte_none(pte); + return !pte_none(pte) && !pte_present(pte); } #endif -- cgit v1.2.3
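
The one-line swapops.h fix above is easy to sanity-check in isolation: a page table entry is in one of three states (empty, present, or a non-present swap-type entry), and only the last of these should satisfy is_swap_pte(). Here is a small userspace model of the predicate, using mock accessors rather than the real pte_t helpers (editorial sketch, not kernel code):

#include <stdbool.h>
#include <stdio.h>

struct mock_pte {
	bool none;	/* stands in for pte_none() */
	bool present;	/* stands in for pte_present() */
};

static bool pte_none(struct mock_pte p)    { return p.none; }
static bool pte_present(struct mock_pte p) { return p.present; }

/* The fixed predicate: a swap entry is neither empty nor present. */
static bool is_swap_pte(struct mock_pte p)
{
	return !pte_none(p) && !pte_present(p);
}

int main(void)
{
	struct mock_pte empty   = { .none = true,  .present = false };
	struct mock_pte mapped  = { .none = false, .present = true  };
	struct mock_pte swapped = { .none = false, .present = false };

	/* Prints "0 0 1": only the swap entry qualifies.  The broken
	 * "!pte_none(pte)" version also returned 1 for the mapped case,
	 * which is what sent the mlock test into its apparent deadlock. */
	printf("%d %d %d\n", is_swap_pte(empty), is_swap_pte(mapped),
	       is_swap_pte(swapped));
	return 0;
}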