diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig | 29 | ||||
-rw-r--r-- | mm/Makefile | 14 | ||||
-rw-r--r-- | mm/backing-dev.c | 6 | ||||
-rw-r--r-- | mm/bootmem.c | 138 | ||||
-rw-r--r-- | mm/bounce.c | 8 | ||||
-rw-r--r-- | mm/cleancache.c | 6 | ||||
-rw-r--r-- | mm/compaction.c | 432 | ||||
-rw-r--r-- | mm/filemap.c | 108 | ||||
-rw-r--r-- | mm/filemap_xip.c | 4 | ||||
-rw-r--r-- | mm/frontswap.c | 344 | ||||
-rw-r--r-- | mm/huge_memory.c | 29 | ||||
-rw-r--r-- | mm/hugetlb.c | 34 | ||||
-rw-r--r-- | mm/internal.h | 42 | ||||
-rw-r--r-- | mm/madvise.c | 29 | ||||
-rw-r--r-- | mm/memblock.c | 155 | ||||
-rw-r--r-- | mm/memcontrol.c | 763 | ||||
-rw-r--r-- | mm/memory-failure.c | 30 | ||||
-rw-r--r-- | mm/memory.c | 70 | ||||
-rw-r--r-- | mm/memory_hotplug.c | 22 | ||||
-rw-r--r-- | mm/mempolicy.c | 91 | ||||
-rw-r--r-- | mm/migrate.c | 9 | ||||
-rw-r--r-- | mm/mmap.c | 152 | ||||
-rw-r--r-- | mm/mmzone.c | 14 | ||||
-rw-r--r-- | mm/mremap.c | 26 | ||||
-rw-r--r-- | mm/nobootmem.c | 150 | ||||
-rw-r--r-- | mm/nommu.c | 35 | ||||
-rw-r--r-- | mm/oom_kill.c | 59 | ||||
-rw-r--r-- | mm/page-writeback.c | 110 | ||||
-rw-r--r-- | mm/page_alloc.c | 495 | ||||
-rw-r--r-- | mm/page_cgroup.c | 4 | ||||
-rw-r--r-- | mm/page_io.c | 12 | ||||
-rw-r--r-- | mm/page_isolation.c | 15 | ||||
-rw-r--r-- | mm/pagewalk.c | 1 | ||||
-rw-r--r-- | mm/percpu-vm.c | 1 | ||||
-rw-r--r-- | mm/pgtable-generic.c | 4 | ||||
-rw-r--r-- | mm/process_vm_access.c | 16 | ||||
-rw-r--r-- | mm/readahead.c | 40 | ||||
-rw-r--r-- | mm/rmap.c | 6 | ||||
-rw-r--r-- | mm/shmem.c | 566 | ||||
-rw-r--r-- | mm/slab.c | 406 | ||||
-rw-r--r-- | mm/slab.h | 33 | ||||
-rw-r--r-- | mm/slab_common.c | 120 | ||||
-rw-r--r-- | mm/slob.c | 152 | ||||
-rw-r--r-- | mm/slub.c | 451 | ||||
-rw-r--r-- | mm/sparse.c | 33 | ||||
-rw-r--r-- | mm/swap.c | 129 | ||||
-rw-r--r-- | mm/swapfile.c | 99 | ||||
-rw-r--r-- | mm/thrash.c | 155 | ||||
-rw-r--r-- | mm/truncate.c | 25 | ||||
-rw-r--r-- | mm/util.c | 30 | ||||
-rw-r--r-- | mm/vmalloc.c | 43 | ||||
-rw-r--r-- | mm/vmscan.c | 755 | ||||
-rw-r--r-- | mm/vmstat.c | 13 |
53 files changed, 3604 insertions, 2909 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index e338407f1225..82fed4eb2b6f 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -198,7 +198,7 @@ config COMPACTION config MIGRATION bool "Page migration" def_bool y - depends on NUMA || ARCH_ENABLE_MEMORY_HOTREMOVE || COMPACTION + depends on NUMA || ARCH_ENABLE_MEMORY_HOTREMOVE || COMPACTION || CMA help Allows the migration of the physical location of pages of processes while the virtual addresses are not changed. This is useful in @@ -349,6 +349,16 @@ choice benefit. endchoice +config CROSS_MEMORY_ATTACH + bool "Cross Memory Support" + depends on MMU + default y + help + Enabling this option adds the system calls process_vm_readv and + process_vm_writev which allow a process with the correct privileges + to directly read from or write to another process's address space. + See the man page for more details. + # # UP and nommu archs use km based percpu allocator # @@ -379,3 +389,20 @@ config CLEANCACHE in a negligible performance hit. If unsure, say Y to enable cleancache + +config FRONTSWAP + bool "Enable frontswap to cache swap pages if tmem is present" + depends on SWAP + default n + help + Frontswap is so named because it can be thought of as the opposite + of a "backing" store for a swap device. The data is stored into + "transcendent memory", memory that is not directly accessible or + addressable by the kernel and is of unknown and possibly + time-varying size. When space in transcendent memory is available, + a significant swap I/O reduction may be achieved. When none is + available, all frontswap calls are reduced to a single pointer- + compare-against-NULL resulting in a negligible performance hit + and swap data is stored as normal on the matching swap device. + + If unsure, say Y to enable frontswap. 
diff --git a/mm/Makefile b/mm/Makefile index 50ec00ef2a0e..8e81fe263c94 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -5,15 +5,19 @@ mmu-y := nommu.o mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \ mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \ - vmalloc.o pagewalk.o pgtable-generic.o \ - process_vm_access.o + vmalloc.o pagewalk.o pgtable-generic.o + +ifdef CONFIG_CROSS_MEMORY_ATTACH +mmu-$(CONFIG_MMU) += process_vm_access.o +endif obj-y := filemap.o mempool.o oom_kill.o fadvise.o \ maccess.o page_alloc.o page-writeback.o \ readahead.o swap.o truncate.o vmscan.o shmem.o \ prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \ page_isolation.o mm_init.o mmu_context.o percpu.o \ - $(mmu-y) + compaction.o slab_common.o $(mmu-y) + obj-y += init-mm.o ifdef CONFIG_NO_BOOTMEM @@ -25,14 +29,14 @@ endif obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o obj-$(CONFIG_BOUNCE) += bounce.o -obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o +obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o +obj-$(CONFIG_FRONTSWAP) += frontswap.o obj-$(CONFIG_HAS_DMA) += dmapool.o obj-$(CONFIG_HUGETLBFS) += hugetlb.o obj-$(CONFIG_NUMA) += mempolicy.o obj-$(CONFIG_SPARSEMEM) += sparse.o obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o obj-$(CONFIG_SLOB) += slob.o -obj-$(CONFIG_COMPACTION) += compaction.o obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o obj-$(CONFIG_KSM) += ksm.o obj-$(CONFIG_PAGE_POISONING) += debug-pagealloc.o diff --git a/mm/backing-dev.c b/mm/backing-dev.c index dd8e2aafb07e..3387aea11209 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -677,7 +677,7 @@ int bdi_init(struct backing_dev_info *bdi) bdi->min_ratio = 0; bdi->max_ratio = 100; - bdi->max_prop_frac = PROP_FRAC_BASE; + bdi->max_prop_frac = FPROP_FRAC_BASE; spin_lock_init(&bdi->wb_lock); INIT_LIST_HEAD(&bdi->bdi_list); INIT_LIST_HEAD(&bdi->work_list); @@ -700,7 +700,7 @@ int bdi_init(struct backing_dev_info *bdi) bdi->write_bandwidth = INIT_BW; bdi->avg_write_bandwidth = 
INIT_BW; - err = prop_local_init_percpu(&bdi->completions); + err = fprop_local_init_percpu(&bdi->completions); if (err) { err: @@ -744,7 +744,7 @@ void bdi_destroy(struct backing_dev_info *bdi) for (i = 0; i < NR_BDI_STAT_ITEMS; i++) percpu_counter_destroy(&bdi->bdi_stat[i]); - prop_local_destroy_percpu(&bdi->completions); + fprop_local_destroy_percpu(&bdi->completions); } EXPORT_SYMBOL(bdi_destroy); diff --git a/mm/bootmem.c b/mm/bootmem.c index 0131170c9d54..bcb63ac48cc5 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -77,16 +77,16 @@ unsigned long __init bootmem_bootmap_pages(unsigned long pages) */ static void __init link_bootmem(bootmem_data_t *bdata) { - struct list_head *iter; + bootmem_data_t *ent; - list_for_each(iter, &bdata_list) { - bootmem_data_t *ent; - - ent = list_entry(iter, bootmem_data_t, list); - if (bdata->node_min_pfn < ent->node_min_pfn) - break; + list_for_each_entry(ent, &bdata_list, list) { + if (bdata->node_min_pfn < ent->node_min_pfn) { + list_add_tail(&bdata->list, &ent->list); + return; + } } - list_add_tail(&bdata->list, iter); + + list_add_tail(&bdata->list, &bdata_list); } /* @@ -203,7 +203,8 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) } else { unsigned long off = 0; - while (vec && off < BITS_PER_LONG) { + vec >>= start & (BITS_PER_LONG - 1); + while (vec) { if (vec & 1) { page = pfn_to_page(start + off); __free_pages_bootmem(page, 0); @@ -467,7 +468,7 @@ static unsigned long __init align_off(struct bootmem_data *bdata, return ALIGN(base + off, align) - base; } -static void * __init alloc_bootmem_core(struct bootmem_data *bdata, +static void * __init alloc_bootmem_bdata(struct bootmem_data *bdata, unsigned long size, unsigned long align, unsigned long goal, unsigned long limit) { @@ -588,14 +589,14 @@ static void * __init alloc_arch_preferred_bootmem(bootmem_data_t *bdata, p_bdata = bootmem_arch_preferred_node(bdata, size, align, goal, limit); if (p_bdata) - return alloc_bootmem_core(p_bdata, size, 
align, + return alloc_bootmem_bdata(p_bdata, size, align, goal, limit); } #endif return NULL; } -static void * __init ___alloc_bootmem_nopanic(unsigned long size, +static void * __init alloc_bootmem_core(unsigned long size, unsigned long align, unsigned long goal, unsigned long limit) @@ -603,7 +604,6 @@ static void * __init ___alloc_bootmem_nopanic(unsigned long size, bootmem_data_t *bdata; void *region; -restart: region = alloc_arch_preferred_bootmem(NULL, size, align, goal, limit); if (region) return region; @@ -614,11 +614,25 @@ restart: if (limit && bdata->node_min_pfn >= PFN_DOWN(limit)) break; - region = alloc_bootmem_core(bdata, size, align, goal, limit); + region = alloc_bootmem_bdata(bdata, size, align, goal, limit); if (region) return region; } + return NULL; +} + +static void * __init ___alloc_bootmem_nopanic(unsigned long size, + unsigned long align, + unsigned long goal, + unsigned long limit) +{ + void *ptr; + +restart: + ptr = alloc_bootmem_core(size, align, goal, limit); + if (ptr) + return ptr; if (goal) { goal = 0; goto restart; @@ -684,21 +698,60 @@ void * __init __alloc_bootmem(unsigned long size, unsigned long align, return ___alloc_bootmem(size, align, goal, limit); } -static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata, +void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size, unsigned long align, unsigned long goal, unsigned long limit) { void *ptr; - ptr = alloc_arch_preferred_bootmem(bdata, size, align, goal, limit); +again: + ptr = alloc_arch_preferred_bootmem(pgdat->bdata, size, + align, goal, limit); if (ptr) return ptr; - ptr = alloc_bootmem_core(bdata, size, align, goal, limit); + /* do not panic in alloc_bootmem_bdata() */ + if (limit && goal + size > limit) + limit = 0; + + ptr = alloc_bootmem_bdata(pgdat->bdata, size, align, goal, limit); if (ptr) return ptr; - return ___alloc_bootmem(size, align, goal, limit); + ptr = alloc_bootmem_core(size, align, goal, limit); + if (ptr) + return ptr; + + 
if (goal) { + goal = 0; + goto again; + } + + return NULL; +} + +void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size, + unsigned long align, unsigned long goal) +{ + if (WARN_ON_ONCE(slab_is_available())) + return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); + + return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0); +} + +void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, + unsigned long align, unsigned long goal, + unsigned long limit) +{ + void *ptr; + + ptr = ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0); + if (ptr) + return ptr; + + printk(KERN_ALERT "bootmem alloc of %lu bytes failed!\n", size); + panic("Out of memory"); + return NULL; } /** @@ -722,7 +775,7 @@ void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, if (WARN_ON_ONCE(slab_is_available())) return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); - return ___alloc_bootmem_node(pgdat->bdata, size, align, goal, 0); + return ___alloc_bootmem_node(pgdat, size, align, goal, 0); } void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size, @@ -743,7 +796,7 @@ void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size, unsigned long new_goal; new_goal = MAX_DMA32_PFN << PAGE_SHIFT; - ptr = alloc_bootmem_core(pgdat->bdata, size, align, + ptr = alloc_bootmem_bdata(pgdat->bdata, size, align, new_goal, 0); if (ptr) return ptr; @@ -754,47 +807,6 @@ void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size, } -#ifdef CONFIG_SPARSEMEM -/** - * alloc_bootmem_section - allocate boot memory from a specific section - * @size: size of the request in bytes - * @section_nr: sparse map section to allocate from - * - * Return NULL on failure. 
- */ -void * __init alloc_bootmem_section(unsigned long size, - unsigned long section_nr) -{ - bootmem_data_t *bdata; - unsigned long pfn, goal; - - pfn = section_nr_to_pfn(section_nr); - goal = pfn << PAGE_SHIFT; - bdata = &bootmem_node_data[early_pfn_to_nid(pfn)]; - - return alloc_bootmem_core(bdata, size, SMP_CACHE_BYTES, goal, 0); -} -#endif - -void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size, - unsigned long align, unsigned long goal) -{ - void *ptr; - - if (WARN_ON_ONCE(slab_is_available())) - return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); - - ptr = alloc_arch_preferred_bootmem(pgdat->bdata, size, align, goal, 0); - if (ptr) - return ptr; - - ptr = alloc_bootmem_core(pgdat->bdata, size, align, goal, 0); - if (ptr) - return ptr; - - return __alloc_bootmem_nopanic(size, align, goal); -} - #ifndef ARCH_LOW_ADDRESS_LIMIT #define ARCH_LOW_ADDRESS_LIMIT 0xffffffffUL #endif @@ -839,6 +851,6 @@ void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size, if (WARN_ON_ONCE(slab_is_available())) return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); - return ___alloc_bootmem_node(pgdat->bdata, size, align, - goal, ARCH_LOW_ADDRESS_LIMIT); + return ___alloc_bootmem_node(pgdat, size, align, + goal, ARCH_LOW_ADDRESS_LIMIT); } diff --git a/mm/bounce.c b/mm/bounce.c index d1be02ca1889..042086775561 100644 --- a/mm/bounce.c +++ b/mm/bounce.c @@ -24,23 +24,25 @@ static mempool_t *page_pool, *isa_page_pool; -#ifdef CONFIG_HIGHMEM +#if defined(CONFIG_HIGHMEM) || defined(CONFIG_NEED_BOUNCE_POOL) static __init int init_emergency_pool(void) { -#ifndef CONFIG_MEMORY_HOTPLUG +#if defined(CONFIG_HIGHMEM) && !defined(CONFIG_MEMORY_HOTPLUG) if (max_pfn <= max_low_pfn) return 0; #endif page_pool = mempool_create_page_pool(POOL_SIZE, 0); BUG_ON(!page_pool); - printk("highmem bounce pool size: %d pages\n", POOL_SIZE); + printk("bounce pool size: %d pages\n", POOL_SIZE); return 0; } __initcall(init_emergency_pool); +#endif +#ifdef 
CONFIG_HIGHMEM /* * highmem version, map in to vec */ diff --git a/mm/cleancache.c b/mm/cleancache.c index 5646c740f613..32e6f4136fa2 100644 --- a/mm/cleancache.c +++ b/mm/cleancache.c @@ -80,7 +80,7 @@ EXPORT_SYMBOL(__cleancache_init_shared_fs); static int cleancache_get_key(struct inode *inode, struct cleancache_filekey *key) { - int (*fhfn)(struct dentry *, __u32 *fh, int *, int); + int (*fhfn)(struct inode *, __u32 *fh, int *, struct inode *); int len = 0, maxlen = CLEANCACHE_KEY_MAX; struct super_block *sb = inode->i_sb; @@ -88,9 +88,7 @@ static int cleancache_get_key(struct inode *inode, if (sb->s_export_op != NULL) { fhfn = sb->s_export_op->encode_fh; if (fhfn) { - struct dentry d; - d.d_inode = inode; - len = (*fhfn)(&d, &key->u.fh[0], &maxlen, 0); + len = (*fhfn)(inode, &key->u.fh[0], &maxlen, NULL); if (len <= 0 || len == 255) return -1; if (maxlen > CLEANCACHE_KEY_MAX) diff --git a/mm/compaction.c b/mm/compaction.c index 74a8c825ff28..2f42d9528539 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -16,30 +16,11 @@ #include <linux/sysfs.h> #include "internal.h" +#if defined CONFIG_COMPACTION || defined CONFIG_CMA + #define CREATE_TRACE_POINTS #include <trace/events/compaction.h> -/* - * compact_control is used to track pages being migrated and the free pages - * they are being migrated to during memory compaction. The free_pfn starts - * at the end of a zone and migrate_pfn begins at the start. 
Movable pages - * are moved to the end of a zone during a compaction run and the run - * completes when free_pfn <= migrate_pfn - */ -struct compact_control { - struct list_head freepages; /* List of free pages to migrate to */ - struct list_head migratepages; /* List of pages being migrated */ - unsigned long nr_freepages; /* Number of isolated free pages */ - unsigned long nr_migratepages; /* Number of pages to migrate */ - unsigned long free_pfn; /* isolate_freepages search base */ - unsigned long migrate_pfn; /* isolate_migratepages search base */ - bool sync; /* Synchronous migration */ - - int order; /* order a direct compactor needs */ - int migratetype; /* MOVABLE, RECLAIMABLE etc */ - struct zone *zone; -}; - static unsigned long release_freepages(struct list_head *freelist) { struct page *page, *next; @@ -54,24 +35,35 @@ static unsigned long release_freepages(struct list_head *freelist) return count; } -/* Isolate free pages onto a private freelist. Must hold zone->lock */ -static unsigned long isolate_freepages_block(struct zone *zone, - unsigned long blockpfn, - struct list_head *freelist) +static void map_pages(struct list_head *list) +{ + struct page *page; + + list_for_each_entry(page, list, lru) { + arch_alloc_page(page, 0); + kernel_map_pages(page, 1, 1); + } +} + +static inline bool migrate_async_suitable(int migratetype) +{ + return is_migrate_cma(migratetype) || migratetype == MIGRATE_MOVABLE; +} + +/* + * Isolate free pages onto a private freelist. Caller must hold zone->lock. + * If @strict is true, will abort returning 0 on any invalid PFNs or non-free + * pages inside of the pageblock (even though it may still end up isolating + * some pages). 
+ */ +static unsigned long isolate_freepages_block(unsigned long blockpfn, + unsigned long end_pfn, + struct list_head *freelist, + bool strict) { - unsigned long zone_end_pfn, end_pfn; int nr_scanned = 0, total_isolated = 0; struct page *cursor; - /* Get the last PFN we should scan for free pages at */ - zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages; - end_pfn = min(blockpfn + pageblock_nr_pages, zone_end_pfn); - - /* Find the first usable PFN in the block to initialse page cursor */ - for (; blockpfn < end_pfn; blockpfn++) { - if (pfn_valid_within(blockpfn)) - break; - } cursor = pfn_to_page(blockpfn); /* Isolate free pages. This assumes the block is valid */ @@ -79,15 +71,23 @@ static unsigned long isolate_freepages_block(struct zone *zone, int isolated, i; struct page *page = cursor; - if (!pfn_valid_within(blockpfn)) + if (!pfn_valid_within(blockpfn)) { + if (strict) + return 0; continue; + } nr_scanned++; - if (!PageBuddy(page)) + if (!PageBuddy(page)) { + if (strict) + return 0; continue; + } /* Found a free page, break it into order-0 pages */ isolated = split_free_page(page); + if (!isolated && strict) + return 0; total_isolated += isolated; for (i = 0; i < isolated; i++) { list_add(&page->lru, freelist); @@ -105,114 +105,71 @@ static unsigned long isolate_freepages_block(struct zone *zone, return total_isolated; } -/* Returns true if the page is within a block suitable for migration to */ -static bool suitable_migration_target(struct page *page) -{ - - int migratetype = get_pageblock_migratetype(page); - - /* Don't interfere with memory hot-remove or the min_free_kbytes blocks */ - if (migratetype == MIGRATE_ISOLATE || migratetype == MIGRATE_RESERVE) - return false; - - /* If the page is a large free page, then allow migration */ - if (PageBuddy(page) && page_order(page) >= pageblock_order) - return true; - - /* If the block is MIGRATE_MOVABLE, allow migration */ - if (migratetype == MIGRATE_MOVABLE) - return true; - - /* Otherwise skip the 
block */ - return false; -} - -/* - * Based on information in the current compact_control, find blocks - * suitable for isolating free pages from and then isolate them. +/** + * isolate_freepages_range() - isolate free pages. + * @start_pfn: The first PFN to start isolating. + * @end_pfn: The one-past-last PFN. + * + * Non-free pages, invalid PFNs, or zone boundaries within the + * [start_pfn, end_pfn) range are considered errors, cause function to + * undo its actions and return zero. + * + * Otherwise, function returns one-past-the-last PFN of isolated page + * (which may be greater than end_pfn if end fell in the middle of + * a free page). */ -static void isolate_freepages(struct zone *zone, - struct compact_control *cc) +unsigned long +isolate_freepages_range(unsigned long start_pfn, unsigned long end_pfn) { - struct page *page; - unsigned long high_pfn, low_pfn, pfn; - unsigned long flags; - int nr_freepages = cc->nr_freepages; - struct list_head *freelist = &cc->freepages; - - /* - * Initialise the free scanner. The starting point is where we last - * scanned from (or the end of the zone if starting). The low point - * is the end of the pageblock the migration scanner is using. - */ - pfn = cc->free_pfn; - low_pfn = cc->migrate_pfn + pageblock_nr_pages; + unsigned long isolated, pfn, block_end_pfn, flags; + struct zone *zone = NULL; + LIST_HEAD(freelist); - /* - * Take care that if the migration scanner is at the end of the zone - * that the free scanner does not accidentally move to the next zone - * in the next isolation cycle. - */ - high_pfn = min(low_pfn, pfn); + if (pfn_valid(start_pfn)) + zone = page_zone(pfn_to_page(start_pfn)); - /* - * Isolate free pages until enough are available to migrate the - * pages on cc->migratepages. We stop searching if the migrate - * and free page scanners meet or enough free pages are isolated. 
- */ - for (; pfn > low_pfn && cc->nr_migratepages > nr_freepages; - pfn -= pageblock_nr_pages) { - unsigned long isolated; - - if (!pfn_valid(pfn)) - continue; + for (pfn = start_pfn; pfn < end_pfn; pfn += isolated) { + if (!pfn_valid(pfn) || zone != page_zone(pfn_to_page(pfn))) + break; /* - * Check for overlapping nodes/zones. It's possible on some - * configurations to have a setup like - * node0 node1 node0 - * i.e. it's possible that all pages within a zones range of - * pages do not belong to a single zone. + * On subsequent iterations ALIGN() is actually not needed, + * but we keep it so as not to complicate the code. */ - page = pfn_to_page(pfn); - if (page_zone(page) != zone) - continue; + block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages); + block_end_pfn = min(block_end_pfn, end_pfn); - /* Check the block is suitable for migration */ - if (!suitable_migration_target(page)) - continue; + spin_lock_irqsave(&zone->lock, flags); + isolated = isolate_freepages_block(pfn, block_end_pfn, + &freelist, true); + spin_unlock_irqrestore(&zone->lock, flags); /* - * Found a block suitable for isolating free pages from. Now - * we disabled interrupts, double check things are ok and - * isolate the pages. This is to minimise the time IRQs - * are disabled + * In strict mode, isolate_freepages_block() returns 0 if + * there are any holes in the block (ie. invalid PFNs or + * non-free pages). */ - isolated = 0; - spin_lock_irqsave(&zone->lock, flags); - if (suitable_migration_target(page)) { - isolated = isolate_freepages_block(zone, pfn, freelist); - nr_freepages += isolated; - } - spin_unlock_irqrestore(&zone->lock, flags); + if (!isolated) + break; /* - * Record the highest PFN we isolated pages from. When next - * looking for free pages, the search will restart here as - * page migration may have returned some pages to the allocator + * If we managed to isolate pages, it is always (1 << n) * + * pageblock_nr_pages for some non-negative n. 
(Max order + * page may span two pageblocks). */ - if (isolated) - high_pfn = max(high_pfn, pfn); } /* split_free_page does not map the pages */ - list_for_each_entry(page, freelist, lru) { - arch_alloc_page(page, 0); - kernel_map_pages(page, 1, 1); + map_pages(&freelist); + + if (pfn < end_pfn) { + /* Loop terminated early, cleanup. */ + release_freepages(&freelist); + return 0; } - cc->free_pfn = high_pfn; - cc->nr_freepages = nr_freepages; + /* We don't use freelists for anything. */ + return pfn; } /* Update the number of anon and file isolated pages in the zone */ @@ -243,37 +200,34 @@ static bool too_many_isolated(struct zone *zone) return isolated > (inactive + active) / 2; } -/* possible outcome of isolate_migratepages */ -typedef enum { - ISOLATE_ABORT, /* Abort compaction now */ - ISOLATE_NONE, /* No pages isolated, continue scanning */ - ISOLATE_SUCCESS, /* Pages isolated, migrate */ -} isolate_migrate_t; - -/* - * Isolate all pages that can be migrated from the block pointed to by - * the migrate scanner within compact_control. +/** + * isolate_migratepages_range() - isolate all migrate-able pages in range. + * @zone: Zone pages are in. + * @cc: Compaction control structure. + * @low_pfn: The first PFN of the range. + * @end_pfn: The one-past-the-last PFN of the range. + * + * Isolate all pages that can be migrated from the range specified by + * [low_pfn, end_pfn). Returns zero if there is a fatal signal + * pending, otherwise PFN of the first page that was not scanned + * (which may be less than, equal to, or more than end_pfn). + * + * Assumes that cc->migratepages is empty and cc->nr_migratepages is + * zero. + * + * Apart from cc->migratepages and cc->nr_migratepages this function + * does not modify any cc's fields, in particular it does not modify + * (or read for that matter) cc->migrate_pfn. 
*/ -static isolate_migrate_t isolate_migratepages(struct zone *zone, - struct compact_control *cc) +unsigned long +isolate_migratepages_range(struct zone *zone, struct compact_control *cc, + unsigned long low_pfn, unsigned long end_pfn) { - unsigned long low_pfn, end_pfn; unsigned long last_pageblock_nr = 0, pageblock_nr; unsigned long nr_scanned = 0, nr_isolated = 0; struct list_head *migratelist = &cc->migratepages; - isolate_mode_t mode = ISOLATE_ACTIVE|ISOLATE_INACTIVE; - - /* Do not scan outside zone boundaries */ - low_pfn = max(cc->migrate_pfn, zone->zone_start_pfn); - - /* Only scan within a pageblock boundary */ - end_pfn = ALIGN(low_pfn + pageblock_nr_pages, pageblock_nr_pages); - - /* Do not cross the free scanner or scan within a memory hole */ - if (end_pfn > cc->free_pfn || !pfn_valid(low_pfn)) { - cc->migrate_pfn = end_pfn; - return ISOLATE_NONE; - } + isolate_mode_t mode = 0; + struct lruvec *lruvec; /* * Ensure that there are not too many pages isolated from the LRU @@ -283,12 +237,12 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, while (unlikely(too_many_isolated(zone))) { /* async migration should just abort */ if (!cc->sync) - return ISOLATE_ABORT; + return 0; congestion_wait(BLK_RW_ASYNC, HZ/10); if (fatal_signal_pending(current)) - return ISOLATE_ABORT; + return 0; } /* Time to isolate some pages for migration */ @@ -351,7 +305,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, */ pageblock_nr = low_pfn >> pageblock_order; if (!cc->sync && last_pageblock_nr != pageblock_nr && - get_pageblock_migratetype(page) != MIGRATE_MOVABLE) { + !migrate_async_suitable(get_pageblock_migratetype(page))) { low_pfn += pageblock_nr_pages; low_pfn = ALIGN(low_pfn, pageblock_nr_pages) - 1; last_pageblock_nr = pageblock_nr; @@ -374,14 +328,16 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, if (!cc->sync) mode |= ISOLATE_ASYNC_MIGRATE; + lruvec = mem_cgroup_page_lruvec(page, zone); + /* Try isolate the 
page */ - if (__isolate_lru_page(page, mode, 0) != 0) + if (__isolate_lru_page(page, mode) != 0) continue; VM_BUG_ON(PageTransCompound(page)); /* Successfully isolated */ - del_page_from_lru_list(zone, page, page_lru(page)); + del_page_from_lru_list(page, lruvec, page_lru(page)); list_add(&page->lru, migratelist); cc->nr_migratepages++; nr_isolated++; @@ -396,11 +352,124 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, acct_isolated(zone, cc); spin_unlock_irq(&zone->lru_lock); - cc->migrate_pfn = low_pfn; trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated); - return ISOLATE_SUCCESS; + return low_pfn; +} + +#endif /* CONFIG_COMPACTION || CONFIG_CMA */ +#ifdef CONFIG_COMPACTION + +/* Returns true if the page is within a block suitable for migration to */ +static bool suitable_migration_target(struct page *page) +{ + + int migratetype = get_pageblock_migratetype(page); + + /* Don't interfere with memory hot-remove or the min_free_kbytes blocks */ + if (migratetype == MIGRATE_ISOLATE || migratetype == MIGRATE_RESERVE) + return false; + + /* If the page is a large free page, then allow migration */ + if (PageBuddy(page) && page_order(page) >= pageblock_order) + return true; + + /* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */ + if (migrate_async_suitable(migratetype)) + return true; + + /* Otherwise skip the block */ + return false; +} + +/* + * Based on information in the current compact_control, find blocks + * suitable for isolating free pages from and then isolate them. + */ +static void isolate_freepages(struct zone *zone, + struct compact_control *cc) +{ + struct page *page; + unsigned long high_pfn, low_pfn, pfn, zone_end_pfn, end_pfn; + unsigned long flags; + int nr_freepages = cc->nr_freepages; + struct list_head *freelist = &cc->freepages; + + /* + * Initialise the free scanner. The starting point is where we last + * scanned from (or the end of the zone if starting). 
The low point + * is the end of the pageblock the migration scanner is using. + */ + pfn = cc->free_pfn; + low_pfn = cc->migrate_pfn + pageblock_nr_pages; + + /* + * Take care that if the migration scanner is at the end of the zone + * that the free scanner does not accidentally move to the next zone + * in the next isolation cycle. + */ + high_pfn = min(low_pfn, pfn); + + zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages; + + /* + * Isolate free pages until enough are available to migrate the + * pages on cc->migratepages. We stop searching if the migrate + * and free page scanners meet or enough free pages are isolated. + */ + for (; pfn > low_pfn && cc->nr_migratepages > nr_freepages; + pfn -= pageblock_nr_pages) { + unsigned long isolated; + + if (!pfn_valid(pfn)) + continue; + + /* + * Check for overlapping nodes/zones. It's possible on some + * configurations to have a setup like + * node0 node1 node0 + * i.e. it's possible that all pages within a zones range of + * pages do not belong to a single zone. + */ + page = pfn_to_page(pfn); + if (page_zone(page) != zone) + continue; + + /* Check the block is suitable for migration */ + if (!suitable_migration_target(page)) + continue; + + /* + * Found a block suitable for isolating free pages from. Now + * we disabled interrupts, double check things are ok and + * isolate the pages. This is to minimise the time IRQs + * are disabled + */ + isolated = 0; + spin_lock_irqsave(&zone->lock, flags); + if (suitable_migration_target(page)) { + end_pfn = min(pfn + pageblock_nr_pages, zone_end_pfn); + isolated = isolate_freepages_block(pfn, end_pfn, + freelist, false); + nr_freepages += isolated; + } + spin_unlock_irqrestore(&zone->lock, flags); + + /* + * Record the highest PFN we isolated pages from. 
When next + * looking for free pages, the search will restart here as + * page migration may have returned some pages to the allocator + */ + if (isolated) + high_pfn = max(high_pfn, pfn); + } + + /* split_free_page does not map the pages */ + map_pages(freelist); + + cc->free_pfn = high_pfn; + cc->nr_freepages = nr_freepages; } /* @@ -449,6 +518,44 @@ static void update_nr_listpages(struct compact_control *cc) cc->nr_freepages = nr_freepages; } +/* possible outcome of isolate_migratepages */ +typedef enum { + ISOLATE_ABORT, /* Abort compaction now */ + ISOLATE_NONE, /* No pages isolated, continue scanning */ + ISOLATE_SUCCESS, /* Pages isolated, migrate */ +} isolate_migrate_t; + +/* + * Isolate all pages that can be migrated from the block pointed to by + * the migrate scanner within compact_control. + */ +static isolate_migrate_t isolate_migratepages(struct zone *zone, + struct compact_control *cc) +{ + unsigned long low_pfn, end_pfn; + + /* Do not scan outside zone boundaries */ + low_pfn = max(cc->migrate_pfn, zone->zone_start_pfn); + + /* Only scan within a pageblock boundary */ + end_pfn = ALIGN(low_pfn + pageblock_nr_pages, pageblock_nr_pages); + + /* Do not cross the free scanner or scan within a memory hole */ + if (end_pfn > cc->free_pfn || !pfn_valid(low_pfn)) { + cc->migrate_pfn = end_pfn; + return ISOLATE_NONE; + } + + /* Perform the isolation */ + low_pfn = isolate_migratepages_range(zone, cc, low_pfn, end_pfn); + if (!low_pfn) + return ISOLATE_ABORT; + + cc->migrate_pfn = low_pfn; + + return ISOLATE_SUCCESS; +} + static int compact_finished(struct zone *zone, struct compact_control *cc) { @@ -594,8 +701,11 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) if (err) { putback_lru_pages(&cc->migratepages); cc->nr_migratepages = 0; + if (err == -ENOMEM) { + ret = COMPACT_PARTIAL; + goto out; + } } - } out: @@ -795,3 +905,5 @@ void compaction_unregister_node(struct node *node) return device_remove_file(&node->dev, 
&dev_attr_compact); } #endif /* CONFIG_SYSFS && CONFIG_NUMA */ + +#endif /* CONFIG_COMPACTION */ diff --git a/mm/filemap.c b/mm/filemap.c index 79c4b2b0b14e..a4a5260b0279 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -29,7 +29,6 @@ #include <linux/pagevec.h> #include <linux/blkdev.h> #include <linux/security.h> -#include <linux/syscalls.h> #include <linux/cpuset.h> #include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */ #include <linux/memcontrol.h> @@ -1478,44 +1477,6 @@ out: } EXPORT_SYMBOL(generic_file_aio_read); -static ssize_t -do_readahead(struct address_space *mapping, struct file *filp, - pgoff_t index, unsigned long nr) -{ - if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage) - return -EINVAL; - - force_page_cache_readahead(mapping, filp, index, nr); - return 0; -} - -SYSCALL_DEFINE(readahead)(int fd, loff_t offset, size_t count) -{ - ssize_t ret; - struct file *file; - - ret = -EBADF; - file = fget(fd); - if (file) { - if (file->f_mode & FMODE_READ) { - struct address_space *mapping = file->f_mapping; - pgoff_t start = offset >> PAGE_CACHE_SHIFT; - pgoff_t end = (offset + count - 1) >> PAGE_CACHE_SHIFT; - unsigned long len = end - start + 1; - ret = do_readahead(mapping, file, start, len); - } - fput(file); - } - return ret; -} -#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS -asmlinkage long SyS_readahead(long fd, loff_t offset, long count) -{ - return SYSC_readahead((int) fd, offset, (size_t) count); -} -SYSCALL_ALIAS(sys_readahead, SyS_readahead); -#endif - #ifdef CONFIG_MMU /** * page_cache_read - adds requested page to the page cache if not already there @@ -1938,71 +1899,6 @@ struct page *read_cache_page(struct address_space *mapping, } EXPORT_SYMBOL(read_cache_page); -/* - * The logic we want is - * - * if suid or (sgid and xgrp) - * remove privs - */ -int should_remove_suid(struct dentry *dentry) -{ - umode_t mode = dentry->d_inode->i_mode; - int kill = 0; - - /* suid always must be killed */ - if (unlikely(mode & S_ISUID)) - kill = 
ATTR_KILL_SUID; - - /* - * sgid without any exec bits is just a mandatory locking mark; leave - * it alone. If some exec bits are set, it's a real sgid; kill it. - */ - if (unlikely((mode & S_ISGID) && (mode & S_IXGRP))) - kill |= ATTR_KILL_SGID; - - if (unlikely(kill && !capable(CAP_FSETID) && S_ISREG(mode))) - return kill; - - return 0; -} -EXPORT_SYMBOL(should_remove_suid); - -static int __remove_suid(struct dentry *dentry, int kill) -{ - struct iattr newattrs; - - newattrs.ia_valid = ATTR_FORCE | kill; - return notify_change(dentry, &newattrs); -} - -int file_remove_suid(struct file *file) -{ - struct dentry *dentry = file->f_path.dentry; - struct inode *inode = dentry->d_inode; - int killsuid; - int killpriv; - int error = 0; - - /* Fast path for nothing security related */ - if (IS_NOSEC(inode)) - return 0; - - killsuid = should_remove_suid(dentry); - killpriv = security_inode_need_killpriv(dentry); - - if (killpriv < 0) - return killpriv; - if (killpriv) - error = security_inode_killpriv(dentry); - if (!error && killsuid) - error = __remove_suid(dentry, killsuid); - if (!error && (inode->i_sb->s_flags & MS_NOSEC)) - inode->i_flags |= S_NOSEC; - - return error; -} -EXPORT_SYMBOL(file_remove_suid); - static size_t __iovec_copy_from_user_inatomic(char *vaddr, const struct iovec *iov, size_t base, size_t bytes) { @@ -2528,7 +2424,9 @@ ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, if (err) goto out; - file_update_time(file); + err = file_update_time(file); + if (err) + goto out; /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */ if (unlikely(file->f_flags & O_DIRECT)) { diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c index a4eb31132229..213ca1f53409 100644 --- a/mm/filemap_xip.c +++ b/mm/filemap_xip.c @@ -426,7 +426,9 @@ xip_file_write(struct file *filp, const char __user *buf, size_t len, if (ret) goto out_backing; - file_update_time(filp); + ret = file_update_time(filp); + if (ret) + goto out_backing; ret = 
__xip_file_write (filp, buf, count, pos, ppos); diff --git a/mm/frontswap.c b/mm/frontswap.c new file mode 100644 index 000000000000..6b3e71a2cd48 --- /dev/null +++ b/mm/frontswap.c @@ -0,0 +1,344 @@ +/* + * Frontswap frontend + * + * This code provides the generic "frontend" layer to call a matching + * "backend" driver implementation of frontswap. See + * Documentation/vm/frontswap.txt for more information. + * + * Copyright (C) 2009-2012 Oracle Corp. All rights reserved. + * Author: Dan Magenheimer + * + * This work is licensed under the terms of the GNU GPL, version 2. + */ + +#include <linux/mman.h> +#include <linux/swap.h> +#include <linux/swapops.h> +#include <linux/security.h> +#include <linux/module.h> +#include <linux/debugfs.h> +#include <linux/frontswap.h> +#include <linux/swapfile.h> + +/* + * frontswap_ops is set by frontswap_register_ops to contain the pointers + * to the frontswap "backend" implementation functions. + */ +static struct frontswap_ops frontswap_ops __read_mostly; + +/* + * This global enablement flag reduces overhead on systems where frontswap_ops + * has not been registered, so is preferred to the slower alternative: a + * function call that checks a non-global. + */ +bool frontswap_enabled __read_mostly; +EXPORT_SYMBOL(frontswap_enabled); + +/* + * If enabled, frontswap_store will return failure even on success. As + * a result, the swap subsystem will always write the page to swap, in + * effect converting frontswap into a writethrough cache. In this mode, + * there is no direct reduction in swap writes, but a frontswap backend + * can unilaterally "reclaim" any pages in use with no data loss, thus + * providing increases control over maximum memory usage due to frontswap. + */ +static bool frontswap_writethrough_enabled __read_mostly; + +#ifdef CONFIG_DEBUG_FS +/* + * Counters available via /sys/kernel/debug/frontswap (if debugfs is + * properly configured). 
These are for information only so are not protected + * against increment races. + */ +static u64 frontswap_loads; +static u64 frontswap_succ_stores; +static u64 frontswap_failed_stores; +static u64 frontswap_invalidates; + +static inline void inc_frontswap_loads(void) { + frontswap_loads++; +} +static inline void inc_frontswap_succ_stores(void) { + frontswap_succ_stores++; +} +static inline void inc_frontswap_failed_stores(void) { + frontswap_failed_stores++; +} +static inline void inc_frontswap_invalidates(void) { + frontswap_invalidates++; +} +#else +static inline void inc_frontswap_loads(void) { } +static inline void inc_frontswap_succ_stores(void) { } +static inline void inc_frontswap_failed_stores(void) { } +static inline void inc_frontswap_invalidates(void) { } +#endif +/* + * Register operations for frontswap, returning previous thus allowing + * detection of multiple backends and possible nesting. + */ +struct frontswap_ops frontswap_register_ops(struct frontswap_ops *ops) +{ + struct frontswap_ops old = frontswap_ops; + + frontswap_ops = *ops; + frontswap_enabled = true; + return old; +} +EXPORT_SYMBOL(frontswap_register_ops); + +/* + * Enable/disable frontswap writethrough (see above). + */ +void frontswap_writethrough(bool enable) +{ + frontswap_writethrough_enabled = enable; +} +EXPORT_SYMBOL(frontswap_writethrough); + +/* + * Called when a swap device is swapon'd. + */ +void __frontswap_init(unsigned type) +{ + struct swap_info_struct *sis = swap_info[type]; + + BUG_ON(sis == NULL); + if (sis->frontswap_map == NULL) + return; + frontswap_ops.init(type); +} +EXPORT_SYMBOL(__frontswap_init); + +static inline void __frontswap_clear(struct swap_info_struct *sis, pgoff_t offset) +{ + frontswap_clear(sis, offset); + atomic_dec(&sis->frontswap_pages); +} + +/* + * "Store" data from a page to frontswap and associate it with the page's + * swaptype and offset. Page must be locked and in the swap cache. 
+ * If frontswap already contains a page with matching swaptype and + * offset, the frontswap implementation may either overwrite the data and + * return success or invalidate the page from frontswap and return failure. + */ +int __frontswap_store(struct page *page) +{ + int ret = -1, dup = 0; + swp_entry_t entry = { .val = page_private(page), }; + int type = swp_type(entry); + struct swap_info_struct *sis = swap_info[type]; + pgoff_t offset = swp_offset(entry); + + BUG_ON(!PageLocked(page)); + BUG_ON(sis == NULL); + if (frontswap_test(sis, offset)) + dup = 1; + ret = frontswap_ops.store(type, offset, page); + if (ret == 0) { + frontswap_set(sis, offset); + inc_frontswap_succ_stores(); + if (!dup) + atomic_inc(&sis->frontswap_pages); + } else { + /* + failed dup always results in automatic invalidate of + the (older) page from frontswap + */ + inc_frontswap_failed_stores(); + if (dup) + __frontswap_clear(sis, offset); + } + if (frontswap_writethrough_enabled) + /* report failure so swap also writes to swap device */ + ret = -1; + return ret; +} +EXPORT_SYMBOL(__frontswap_store); + +/* + * "Get" data from frontswap associated with swaptype and offset that were + * specified when the data was put to frontswap and use it to fill the + * specified page with data. Page must be locked and in the swap cache. + */ +int __frontswap_load(struct page *page) +{ + int ret = -1; + swp_entry_t entry = { .val = page_private(page), }; + int type = swp_type(entry); + struct swap_info_struct *sis = swap_info[type]; + pgoff_t offset = swp_offset(entry); + + BUG_ON(!PageLocked(page)); + BUG_ON(sis == NULL); + if (frontswap_test(sis, offset)) + ret = frontswap_ops.load(type, offset, page); + if (ret == 0) + inc_frontswap_loads(); + return ret; +} +EXPORT_SYMBOL(__frontswap_load); + +/* + * Invalidate any data from frontswap associated with the specified swaptype + * and offset so that a subsequent "get" will fail. 
+ */ +void __frontswap_invalidate_page(unsigned type, pgoff_t offset) +{ + struct swap_info_struct *sis = swap_info[type]; + + BUG_ON(sis == NULL); + if (frontswap_test(sis, offset)) { + frontswap_ops.invalidate_page(type, offset); + __frontswap_clear(sis, offset); + inc_frontswap_invalidates(); + } +} +EXPORT_SYMBOL(__frontswap_invalidate_page); + +/* + * Invalidate all data from frontswap associated with all offsets for the + * specified swaptype. + */ +void __frontswap_invalidate_area(unsigned type) +{ + struct swap_info_struct *sis = swap_info[type]; + + BUG_ON(sis == NULL); + if (sis->frontswap_map == NULL) + return; + frontswap_ops.invalidate_area(type); + atomic_set(&sis->frontswap_pages, 0); + memset(sis->frontswap_map, 0, sis->max / sizeof(long)); +} +EXPORT_SYMBOL(__frontswap_invalidate_area); + +static unsigned long __frontswap_curr_pages(void) +{ + int type; + unsigned long totalpages = 0; + struct swap_info_struct *si = NULL; + + assert_spin_locked(&swap_lock); + for (type = swap_list.head; type >= 0; type = si->next) { + si = swap_info[type]; + totalpages += atomic_read(&si->frontswap_pages); + } + return totalpages; +} + +static int __frontswap_unuse_pages(unsigned long total, unsigned long *unused, + int *swapid) +{ + int ret = -EINVAL; + struct swap_info_struct *si = NULL; + int si_frontswap_pages; + unsigned long total_pages_to_unuse = total; + unsigned long pages = 0, pages_to_unuse = 0; + int type; + + assert_spin_locked(&swap_lock); + for (type = swap_list.head; type >= 0; type = si->next) { + si = swap_info[type]; + si_frontswap_pages = atomic_read(&si->frontswap_pages); + if (total_pages_to_unuse < si_frontswap_pages) { + pages = pages_to_unuse = total_pages_to_unuse; + } else { + pages = si_frontswap_pages; + pages_to_unuse = 0; /* unuse all */ + } + /* ensure there is enough RAM to fetch pages from frontswap */ + if (security_vm_enough_memory_mm(current->mm, pages)) { + ret = -ENOMEM; + continue; + } + vm_unacct_memory(pages); + *unused = 
pages_to_unuse; + *swapid = type; + ret = 0; + break; + } + + return ret; +} + +static int __frontswap_shrink(unsigned long target_pages, + unsigned long *pages_to_unuse, + int *type) +{ + unsigned long total_pages = 0, total_pages_to_unuse; + + assert_spin_locked(&swap_lock); + + total_pages = __frontswap_curr_pages(); + if (total_pages <= target_pages) { + /* Nothing to do */ + *pages_to_unuse = 0; + return 0; + } + total_pages_to_unuse = total_pages - target_pages; + return __frontswap_unuse_pages(total_pages_to_unuse, pages_to_unuse, type); +} + +/* + * Frontswap, like a true swap device, may unnecessarily retain pages + * under certain circumstances; "shrink" frontswap is essentially a + * "partial swapoff" and works by calling try_to_unuse to attempt to + * unuse enough frontswap pages to attempt to -- subject to memory + * constraints -- reduce the number of pages in frontswap to the + * number given in the parameter target_pages. + */ +void frontswap_shrink(unsigned long target_pages) +{ + unsigned long pages_to_unuse = 0; + int type, ret; + + /* + * we don't want to hold swap_lock while doing a very + * lengthy try_to_unuse, but swap_list may change + * so restart scan from swap_list.head each time + */ + spin_lock(&swap_lock); + ret = __frontswap_shrink(target_pages, &pages_to_unuse, &type); + spin_unlock(&swap_lock); + if (ret == 0 && pages_to_unuse) + try_to_unuse(type, true, pages_to_unuse); + return; +} +EXPORT_SYMBOL(frontswap_shrink); + +/* + * Count and return the number of frontswap pages across all + * swap devices. This is exported so that backend drivers can + * determine current usage without reading debugfs. 
+ */ +unsigned long frontswap_curr_pages(void) +{ + unsigned long totalpages = 0; + + spin_lock(&swap_lock); + totalpages = __frontswap_curr_pages(); + spin_unlock(&swap_lock); + + return totalpages; +} +EXPORT_SYMBOL(frontswap_curr_pages); + +static int __init init_frontswap(void) +{ +#ifdef CONFIG_DEBUG_FS + struct dentry *root = debugfs_create_dir("frontswap", NULL); + if (root == NULL) + return -ENXIO; + debugfs_create_u64("loads", S_IRUGO, root, &frontswap_loads); + debugfs_create_u64("succ_stores", S_IRUGO, root, &frontswap_succ_stores); + debugfs_create_u64("failed_stores", S_IRUGO, root, + &frontswap_failed_stores); + debugfs_create_u64("invalidates", S_IRUGO, + root, &frontswap_invalidates); +#endif + return 0; +} + +module_init(init_frontswap); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index f0e5306eeb55..57c4b9309015 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -636,16 +636,12 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, unsigned long haddr, pmd_t *pmd, struct page *page) { - int ret = 0; pgtable_t pgtable; VM_BUG_ON(!PageCompound(page)); pgtable = pte_alloc_one(mm, haddr); - if (unlikely(!pgtable)) { - mem_cgroup_uncharge_page(page); - put_page(page); + if (unlikely(!pgtable)) return VM_FAULT_OOM; - } clear_huge_page(page, haddr, HPAGE_PMD_NR); __SetPageUptodate(page); @@ -675,7 +671,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, spin_unlock(&mm->page_table_lock); } - return ret; + return 0; } static inline gfp_t alloc_hugepage_gfpmask(int defrag, gfp_t extra_gfp) @@ -724,8 +720,14 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, put_page(page); goto out; } + if (unlikely(__do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, + page))) { + mem_cgroup_uncharge_page(page); + put_page(page); + goto out; + } - return __do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, page); + return 0; } out: /* @@ -950,6 +952,8 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct 
vm_area_struct *vma, count_vm_event(THP_FAULT_FALLBACK); ret = do_huge_pmd_wp_page_fallback(mm, vma, address, pmd, orig_pmd, page, haddr); + if (ret & VM_FAULT_OOM) + split_huge_page(page); put_page(page); goto out; } @@ -957,6 +961,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) { put_page(new_page); + split_huge_page(page); put_page(page); ret |= VM_FAULT_OOM; goto out; @@ -968,8 +973,10 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, spin_lock(&mm->page_table_lock); put_page(page); if (unlikely(!pmd_same(*pmd, orig_pmd))) { + spin_unlock(&mm->page_table_lock); mem_cgroup_uncharge_page(new_page); put_page(new_page); + goto out; } else { pmd_t entry; VM_BUG_ON(!PageHead(page)); @@ -1224,10 +1231,13 @@ static void __split_huge_page_refcount(struct page *page) { int i; struct zone *zone = page_zone(page); + struct lruvec *lruvec; int tail_count = 0; /* prevent PageLRU to go away from under us, and freeze lru stats */ spin_lock_irq(&zone->lru_lock); + lruvec = mem_cgroup_page_lruvec(page, zone); + compound_lock(page); /* complete memcg works before add pages to LRU */ mem_cgroup_split_huge_fixup(page); @@ -1302,13 +1312,12 @@ static void __split_huge_page_refcount(struct page *page) BUG_ON(!PageDirty(page_tail)); BUG_ON(!PageSwapBacked(page_tail)); - - lru_add_page_tail(zone, page, page_tail); + lru_add_page_tail(page, page_tail, lruvec); } atomic_sub(tail_count, &page->_count); BUG_ON(atomic_read(&page->_count) <= 0); - __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); + __mod_zone_page_state(zone, NR_ANON_TRANSPARENT_HUGEPAGES, -1); __mod_zone_page_state(zone, NR_ANON_PAGES, HPAGE_PMD_NR); ClearPageCompound(page); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index ae8f708e3d75..e198831276a3 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -273,8 +273,8 @@ static long region_count(struct list_head *head, long f, long t) /* Locate each 
segment we overlap with, and count that overlap. */ list_for_each_entry(rg, head, link) { - int seg_from; - int seg_to; + long seg_from; + long seg_to; if (rg->to <= f) continue; @@ -2157,6 +2157,15 @@ static void hugetlb_vm_op_open(struct vm_area_struct *vma) kref_get(&reservations->refs); } +static void resv_map_put(struct vm_area_struct *vma) +{ + struct resv_map *reservations = vma_resv_map(vma); + + if (!reservations) + return; + kref_put(&reservations->refs, resv_map_release); +} + static void hugetlb_vm_op_close(struct vm_area_struct *vma) { struct hstate *h = hstate_vma(vma); @@ -2173,7 +2182,7 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma) reserve = (end - start) - region_count(&reservations->regions, start, end); - kref_put(&reservations->refs, resv_map_release); + resv_map_put(vma); if (reserve) { hugetlb_acct_memory(h, -reserve); @@ -2213,6 +2222,7 @@ static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page, } entry = pte_mkyoung(entry); entry = pte_mkhuge(entry); + entry = arch_make_huge_pte(entry, vma, page, writable); return entry; } @@ -2990,12 +3000,16 @@ int hugetlb_reserve_pages(struct inode *inode, set_vma_resv_flags(vma, HPAGE_RESV_OWNER); } - if (chg < 0) - return chg; + if (chg < 0) { + ret = chg; + goto out_err; + } /* There must be enough pages in the subpool for the mapping */ - if (hugepage_subpool_get_pages(spool, chg)) - return -ENOSPC; + if (hugepage_subpool_get_pages(spool, chg)) { + ret = -ENOSPC; + goto out_err; + } /* * Check enough hugepages are available for the reservation. 
@@ -3004,7 +3018,7 @@ int hugetlb_reserve_pages(struct inode *inode, ret = hugetlb_acct_memory(h, chg); if (ret < 0) { hugepage_subpool_put_pages(spool, chg); - return ret; + goto out_err; } /* @@ -3021,6 +3035,10 @@ int hugetlb_reserve_pages(struct inode *inode, if (!vma || vma->vm_flags & VM_MAYSHARE) region_add(&inode->i_mapping->private_list, from, to); return 0; +out_err: + if (vma) + resv_map_put(vma); + return ret; } void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed) diff --git a/mm/internal.h b/mm/internal.h index 2189af491783..2ba87fbfb75b 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -100,6 +100,39 @@ extern void prep_compound_page(struct page *page, unsigned long order); extern bool is_free_buddy_page(struct page *page); #endif +#if defined CONFIG_COMPACTION || defined CONFIG_CMA + +/* + * in mm/compaction.c + */ +/* + * compact_control is used to track pages being migrated and the free pages + * they are being migrated to during memory compaction. The free_pfn starts + * at the end of a zone and migrate_pfn begins at the start. 
Movable pages + * are moved to the end of a zone during a compaction run and the run + * completes when free_pfn <= migrate_pfn + */ +struct compact_control { + struct list_head freepages; /* List of free pages to migrate to */ + struct list_head migratepages; /* List of pages being migrated */ + unsigned long nr_freepages; /* Number of isolated free pages */ + unsigned long nr_migratepages; /* Number of pages to migrate */ + unsigned long free_pfn; /* isolate_freepages search base */ + unsigned long migrate_pfn; /* isolate_migratepages search base */ + bool sync; /* Synchronous migration */ + + int order; /* order a direct compactor needs */ + int migratetype; /* MOVABLE, RECLAIMABLE etc */ + struct zone *zone; +}; + +unsigned long +isolate_freepages_range(unsigned long start_pfn, unsigned long end_pfn); +unsigned long +isolate_migratepages_range(struct zone *zone, struct compact_control *cc, + unsigned long low_pfn, unsigned long end_pfn); + +#endif /* * function for dealing with page's order in buddy system. @@ -131,7 +164,8 @@ static inline void munlock_vma_pages_all(struct vm_area_struct *vma) * to determine if it's being mapped into a LOCKED vma. * If so, mark page as mlocked. 
*/ -static inline int is_mlocked_vma(struct vm_area_struct *vma, struct page *page) +static inline int mlocked_vma_newpage(struct vm_area_struct *vma, + struct page *page) { VM_BUG_ON(PageLRU(page)); @@ -189,7 +223,7 @@ extern unsigned long vma_address(struct page *page, struct vm_area_struct *vma); #endif #else /* !CONFIG_MMU */ -static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p) +static inline int mlocked_vma_newpage(struct vm_area_struct *v, struct page *p) { return 0; } @@ -309,3 +343,7 @@ extern u64 hwpoison_filter_flags_mask; extern u64 hwpoison_filter_flags_value; extern u64 hwpoison_filter_memcg; extern u32 hwpoison_filter_enable; + +extern unsigned long vm_mmap_pgoff(struct file *, unsigned long, + unsigned long, unsigned long, + unsigned long, unsigned long); diff --git a/mm/madvise.c b/mm/madvise.c index 1ccbba5b6674..14d260fa0d17 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -11,8 +11,11 @@ #include <linux/mempolicy.h> #include <linux/page-isolation.h> #include <linux/hugetlb.h> +#include <linux/falloc.h> #include <linux/sched.h> #include <linux/ksm.h> +#include <linux/fs.h> +#include <linux/file.h> /* * Any behaviour which results in changes to the vma->vm_flags needs to @@ -200,33 +203,39 @@ static long madvise_remove(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end) { - struct address_space *mapping; - loff_t offset, endoff; + loff_t offset; int error; + struct file *f; *prev = NULL; /* tell sys_madvise we drop mmap_sem */ if (vma->vm_flags & (VM_LOCKED|VM_NONLINEAR|VM_HUGETLB)) return -EINVAL; - if (!vma->vm_file || !vma->vm_file->f_mapping - || !vma->vm_file->f_mapping->host) { + f = vma->vm_file; + + if (!f || !f->f_mapping || !f->f_mapping->host) { return -EINVAL; } if ((vma->vm_flags & (VM_SHARED|VM_WRITE)) != (VM_SHARED|VM_WRITE)) return -EACCES; - mapping = vma->vm_file->f_mapping; - offset = (loff_t)(start - vma->vm_start) + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); - 
endoff = (loff_t)(end - vma->vm_start - 1) - + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); - /* vmtruncate_range needs to take i_mutex */ + /* + * Filesystem's fallocate may need to take i_mutex. We need to + * explicitly grab a reference because the vma (and hence the + * vma's reference to the file) can go away as soon as we drop + * mmap_sem. + */ + get_file(f); up_read(&current->mm->mmap_sem); - error = vmtruncate_range(mapping->host, offset, endoff); + error = do_fallocate(f, + FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, + offset, end - start); + fput(f); down_read(&current->mm->mmap_sem); return error; } diff --git a/mm/memblock.c b/mm/memblock.c index a44eab3157f8..5cc6731b00cc 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -37,6 +37,8 @@ struct memblock memblock __initdata_memblock = { int memblock_debug __initdata_memblock; static int memblock_can_resize __initdata_memblock; +static int memblock_memory_in_slab __initdata_memblock = 0; +static int memblock_reserved_in_slab __initdata_memblock = 0; /* inline so we don't get a warning when pr_debug is compiled out */ static inline const char *memblock_type_name(struct memblock_type *type) @@ -141,30 +143,6 @@ phys_addr_t __init_memblock memblock_find_in_range(phys_addr_t start, MAX_NUMNODES); } -/* - * Free memblock.reserved.regions - */ -int __init_memblock memblock_free_reserved_regions(void) -{ - if (memblock.reserved.regions == memblock_reserved_init_regions) - return 0; - - return memblock_free(__pa(memblock.reserved.regions), - sizeof(struct memblock_region) * memblock.reserved.max); -} - -/* - * Reserve memblock.reserved.regions - */ -int __init_memblock memblock_reserve_reserved_regions(void) -{ - if (memblock.reserved.regions == memblock_reserved_init_regions) - return 0; - - return memblock_reserve(__pa(memblock.reserved.regions), - sizeof(struct memblock_region) * memblock.reserved.max); -} - static void __init_memblock memblock_remove_region(struct memblock_type *type, unsigned long r) { type->total_size -= 
type->regions[r].size; @@ -182,11 +160,42 @@ static void __init_memblock memblock_remove_region(struct memblock_type *type, u } } -static int __init_memblock memblock_double_array(struct memblock_type *type) +phys_addr_t __init_memblock get_allocated_memblock_reserved_regions_info( + phys_addr_t *addr) +{ + if (memblock.reserved.regions == memblock_reserved_init_regions) + return 0; + + *addr = __pa(memblock.reserved.regions); + + return PAGE_ALIGN(sizeof(struct memblock_region) * + memblock.reserved.max); +} + +/** + * memblock_double_array - double the size of the memblock regions array + * @type: memblock type of the regions array being doubled + * @new_area_start: starting address of memory range to avoid overlap with + * @new_area_size: size of memory range to avoid overlap with + * + * Double the size of the @type regions array. If memblock is being used to + * allocate memory for a new reserved regions array and there is a previously + * allocated memory range [@new_area_start,@new_area_start+@new_area_size] + * waiting to be reserved, ensure the memory used by the new array does + * not overlap. + * + * RETURNS: + * 0 on success, -1 on failure. + */ +static int __init_memblock memblock_double_array(struct memblock_type *type, + phys_addr_t new_area_start, + phys_addr_t new_area_size) { struct memblock_region *new_array, *old_array; + phys_addr_t old_alloc_size, new_alloc_size; phys_addr_t old_size, new_size, addr; int use_slab = slab_is_available(); + int *in_slab; /* We don't allow resizing until we know about the reserved regions * of memory that aren't suitable for allocation @@ -197,6 +206,18 @@ static int __init_memblock memblock_double_array(struct memblock_type *type) /* Calculate new doubled size */ old_size = type->max * sizeof(struct memblock_region); new_size = old_size << 1; + /* + * We need to allocated new one align to PAGE_SIZE, + * so we can free them completely later. 
+ */ + old_alloc_size = PAGE_ALIGN(old_size); + new_alloc_size = PAGE_ALIGN(new_size); + + /* Retrieve the slab flag */ + if (type == &memblock.memory) + in_slab = &memblock_memory_in_slab; + else + in_slab = &memblock_reserved_in_slab; /* Try to find some space for it. * @@ -212,14 +233,26 @@ static int __init_memblock memblock_double_array(struct memblock_type *type) if (use_slab) { new_array = kmalloc(new_size, GFP_KERNEL); addr = new_array ? __pa(new_array) : 0; - } else - addr = memblock_find_in_range(0, MEMBLOCK_ALLOC_ACCESSIBLE, new_size, sizeof(phys_addr_t)); + } else { + /* only exclude range when trying to double reserved.regions */ + if (type != &memblock.reserved) + new_area_start = new_area_size = 0; + + addr = memblock_find_in_range(new_area_start + new_area_size, + memblock.current_limit, + new_alloc_size, PAGE_SIZE); + if (!addr && new_area_size) + addr = memblock_find_in_range(0, + min(new_area_start, memblock.current_limit), + new_alloc_size, PAGE_SIZE); + + new_array = addr ? __va(addr) : 0; + } if (!addr) { pr_err("memblock: Failed to double %s array from %ld to %ld entries !\n", memblock_type_name(type), type->max, type->max * 2); return -1; } - new_array = __va(addr); memblock_dbg("memblock: %s array is doubled to %ld at [%#010llx-%#010llx]", memblock_type_name(type), type->max * 2, (u64)addr, (u64)addr + new_size - 1); @@ -234,21 +267,23 @@ static int __init_memblock memblock_double_array(struct memblock_type *type) type->regions = new_array; type->max <<= 1; - /* If we use SLAB that's it, we are done */ - if (use_slab) - return 0; - - /* Add the new reserved region now. Should not fail ! */ - BUG_ON(memblock_reserve(addr, new_size)); - - /* If the array wasn't our static init one, then free it. We only do - * that before SLAB is available as later on, we don't know whether - * to use kfree or free_bootmem_pages(). Shouldn't be a big deal - * anyways + /* Free old array. 
We needn't free it if the array is the + * static one */ - if (old_array != memblock_memory_init_regions && - old_array != memblock_reserved_init_regions) - memblock_free(__pa(old_array), old_size); + if (*in_slab) + kfree(old_array); + else if (old_array != memblock_memory_init_regions && + old_array != memblock_reserved_init_regions) + memblock_free(__pa(old_array), old_alloc_size); + + /* Reserve the new array if that comes from the memblock. + * Otherwise, we needn't do it + */ + if (!use_slab) + BUG_ON(memblock_reserve(addr, new_alloc_size)); + + /* Update slab flag */ + *in_slab = use_slab; return 0; } @@ -387,7 +422,7 @@ repeat: */ if (!insert) { while (type->cnt + nr_new > type->max) - if (memblock_double_array(type) < 0) + if (memblock_double_array(type, obase, size) < 0) return -ENOMEM; insert = true; goto repeat; @@ -438,7 +473,7 @@ static int __init_memblock memblock_isolate_range(struct memblock_type *type, /* we'll create at most two more regions */ while (type->cnt + 2 > type->max) - if (memblock_double_array(type) < 0) + if (memblock_double_array(type, base, size) < 0) return -ENOMEM; for (i = 0; i < type->cnt; i++) { @@ -528,9 +563,9 @@ int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size) * __next_free_mem_range - next function for for_each_free_mem_range() * @idx: pointer to u64 loop variable * @nid: nid: node selector, %MAX_NUMNODES for all nodes - * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL - * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL - * @p_nid: ptr to int for nid of the range, can be %NULL + * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL + * @out_end: ptr to phys_addr_t for end address of the range, can be %NULL + * @out_nid: ptr to int for nid of the range, can be %NULL * * Find the first free area from *@idx which matches @nid, fill the out * parameters, and update *@idx for the next iteration. 
The lower 32bit of @@ -604,9 +639,9 @@ void __init_memblock __next_free_mem_range(u64 *idx, int nid, * __next_free_mem_range_rev - next function for for_each_free_mem_range_reverse() * @idx: pointer to u64 loop variable * @nid: nid: node selector, %MAX_NUMNODES for all nodes - * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL - * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL - * @p_nid: ptr to int for nid of the range, can be %NULL + * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL + * @out_end: ptr to phys_addr_t for end address of the range, can be %NULL + * @out_nid: ptr to int for nid of the range, can be %NULL * * Reverse of __next_free_mem_range(). */ @@ -855,6 +890,16 @@ int __init_memblock memblock_is_memory(phys_addr_t addr) return memblock_search(&memblock.memory, addr) != -1; } +/** + * memblock_is_region_memory - check if a region is a subset of memory + * @base: base of region to check + * @size: size of region to check + * + * Check if the region [@base, @base+@size) is a subset of a memory block. + * + * RETURNS: + * 0 if false, non-zero if true + */ int __init_memblock memblock_is_region_memory(phys_addr_t base, phys_addr_t size) { int idx = memblock_search(&memblock.memory, base); @@ -867,6 +912,16 @@ int __init_memblock memblock_is_region_memory(phys_addr_t base, phys_addr_t size memblock.memory.regions[idx].size) >= end; } +/** + * memblock_is_region_reserved - check if a region intersects reserved memory + * @base: base of region to check + * @size: size of region to check + * + * Check if the region [@base, @base+@size) intersects a reserved memory block. 
+ * + * RETURNS: + * 0 if false, non-zero if true + */ int __init_memblock memblock_is_region_reserved(phys_addr_t base, phys_addr_t size) { memblock_cap_size(base, &size); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index b659260c56ad..f72b5e52451a 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -59,7 +59,7 @@ struct cgroup_subsys mem_cgroup_subsys __read_mostly; #define MEM_CGROUP_RECLAIM_RETRIES 5 -struct mem_cgroup *root_mem_cgroup __read_mostly; +static struct mem_cgroup *root_mem_cgroup __read_mostly; #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP /* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */ @@ -73,7 +73,7 @@ static int really_do_swap_account __initdata = 0; #endif #else -#define do_swap_account (0) +#define do_swap_account 0 #endif @@ -88,18 +88,31 @@ enum mem_cgroup_stat_index { MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */ MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */ MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */ - MEM_CGROUP_STAT_DATA, /* end of data requires synchronization */ MEM_CGROUP_STAT_NSTATS, }; +static const char * const mem_cgroup_stat_names[] = { + "cache", + "rss", + "mapped_file", + "swap", +}; + enum mem_cgroup_events_index { MEM_CGROUP_EVENTS_PGPGIN, /* # of pages paged in */ MEM_CGROUP_EVENTS_PGPGOUT, /* # of pages paged out */ - MEM_CGROUP_EVENTS_COUNT, /* # of pages paged in/out */ MEM_CGROUP_EVENTS_PGFAULT, /* # of page-faults */ MEM_CGROUP_EVENTS_PGMAJFAULT, /* # of major page-faults */ MEM_CGROUP_EVENTS_NSTATS, }; + +static const char * const mem_cgroup_events_names[] = { + "pgpgin", + "pgpgout", + "pgfault", + "pgmajfault", +}; + /* * Per memcg event counter is incremented at every pagein/pageout. With THP, * it will be incremated by the number of pages. 
This counter is used for @@ -112,13 +125,14 @@ enum mem_cgroup_events_target { MEM_CGROUP_TARGET_NUMAINFO, MEM_CGROUP_NTARGETS, }; -#define THRESHOLDS_EVENTS_TARGET (128) -#define SOFTLIMIT_EVENTS_TARGET (1024) -#define NUMAINFO_EVENTS_TARGET (1024) +#define THRESHOLDS_EVENTS_TARGET 128 +#define SOFTLIMIT_EVENTS_TARGET 1024 +#define NUMAINFO_EVENTS_TARGET 1024 struct mem_cgroup_stat_cpu { long count[MEM_CGROUP_STAT_NSTATS]; unsigned long events[MEM_CGROUP_EVENTS_NSTATS]; + unsigned long nr_page_events; unsigned long targets[MEM_CGROUP_NTARGETS]; }; @@ -138,7 +152,6 @@ struct mem_cgroup_per_zone { struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1]; - struct zone_reclaim_stat reclaim_stat; struct rb_node tree_node; /* RB tree node */ unsigned long long usage_in_excess;/* Set to the value by which */ /* the soft limit is exceeded*/ @@ -182,7 +195,7 @@ struct mem_cgroup_threshold { /* For threshold */ struct mem_cgroup_threshold_ary { - /* An array index points to threshold just below usage. */ + /* An array index points to threshold just below or equal to usage. */ int current_threshold; /* Size of entries[] */ unsigned int size; @@ -245,8 +258,8 @@ struct mem_cgroup { */ struct rcu_head rcu_freeing; /* - * But when using vfree(), that cannot be done at - * interrupt time, so we must then queue the work. + * We also need some space for a worker in deferred freeing. + * By the time we call it, rcu_freeing is no longer in use. */ struct work_struct work_freeing; }; @@ -305,7 +318,7 @@ struct mem_cgroup { /* * percpu counter. */ - struct mem_cgroup_stat_cpu *stat; + struct mem_cgroup_stat_cpu __percpu *stat; /* * used when a cpu is offlined or other synchronizations * See mem_cgroup_read_stat(). @@ -360,8 +373,8 @@ static bool move_file(void) * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft * limit reclaim to prevent infinite loops, if they ever occur. 
*/ -#define MEM_CGROUP_MAX_RECLAIM_LOOPS (100) -#define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS (2) +#define MEM_CGROUP_MAX_RECLAIM_LOOPS 100 +#define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS 2 enum charge_type { MEM_CGROUP_CHARGE_TYPE_CACHE = 0, @@ -377,8 +390,8 @@ enum charge_type { #define _MEM (0) #define _MEMSWAP (1) #define _OOM_TYPE (2) -#define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val)) -#define MEMFILE_TYPE(val) (((val) >> 16) & 0xffff) +#define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val)) +#define MEMFILE_TYPE(val) ((val) >> 16 & 0xffff) #define MEMFILE_ATTR(val) ((val) & 0xffff) /* Used for OOM nofiier */ #define OOM_CONTROL (0) @@ -404,6 +417,7 @@ void sock_update_memcg(struct sock *sk) { if (mem_cgroup_sockets_enabled) { struct mem_cgroup *memcg; + struct cg_proto *cg_proto; BUG_ON(!sk->sk_prot->proto_cgroup); @@ -423,9 +437,10 @@ void sock_update_memcg(struct sock *sk) rcu_read_lock(); memcg = mem_cgroup_from_task(current); - if (!mem_cgroup_is_root(memcg)) { + cg_proto = sk->sk_prot->proto_cgroup(memcg); + if (!mem_cgroup_is_root(memcg) && memcg_proto_active(cg_proto)) { mem_cgroup_get(memcg); - sk->sk_cgrp = sk->sk_prot->proto_cgroup(memcg); + sk->sk_cgrp = cg_proto; } rcu_read_unlock(); } @@ -454,6 +469,19 @@ EXPORT_SYMBOL(tcp_proto_cgroup); #endif /* CONFIG_INET */ #endif /* CONFIG_CGROUP_MEM_RES_CTLR_KMEM */ +#if defined(CONFIG_INET) && defined(CONFIG_CGROUP_MEM_RES_CTLR_KMEM) +static void disarm_sock_keys(struct mem_cgroup *memcg) +{ + if (!memcg_proto_activated(&memcg->tcp_mem.cg_proto)) + return; + static_key_slow_dec(&memcg_socket_limit_enabled); +} +#else +static void disarm_sock_keys(struct mem_cgroup *memcg) +{ +} +#endif + static void drain_all_stock_async(struct mem_cgroup *memcg); static struct mem_cgroup_per_zone * @@ -718,12 +746,21 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, nr_pages = -nr_pages; /* for event */ } - __this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_COUNT], nr_pages); + 
__this_cpu_add(memcg->stat->nr_page_events, nr_pages); preempt_enable(); } unsigned long +mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru) +{ + struct mem_cgroup_per_zone *mz; + + mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec); + return mz->lru_size[lru]; +} + +static unsigned long mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg, int nid, int zid, unsigned int lru_mask) { @@ -770,7 +807,7 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg, { unsigned long val, next; - val = __this_cpu_read(memcg->stat->events[MEM_CGROUP_EVENTS_COUNT]); + val = __this_cpu_read(memcg->stat->nr_page_events); next = __this_cpu_read(memcg->stat->targets[target]); /* from time_after() in jiffies.h */ if ((long)next - (long)val < 0) { @@ -1013,7 +1050,7 @@ EXPORT_SYMBOL(mem_cgroup_count_vm_event); /** * mem_cgroup_zone_lruvec - get the lru list vector for a zone and memcg * @zone: zone of the wanted lruvec - * @mem: memcg of the wanted lruvec + * @memcg: memcg of the wanted lruvec * * Returns the lru list vector holding pages for the given @zone and * @mem. This can be the global zone lruvec, if the memory controller @@ -1046,19 +1083,11 @@ struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone, */ /** - * mem_cgroup_lru_add_list - account for adding an lru page and return lruvec - * @zone: zone of the page + * mem_cgroup_page_lruvec - return lruvec for adding an lru page * @page: the page - * @lru: current lru - * - * This function accounts for @page being added to @lru, and returns - * the lruvec for the given @zone and the memcg @page is charged to. - * - * The callsite is then responsible for physically linking the page to - * the returned lruvec->lists[@lru]. 
+ * @zone: zone of the page */ -struct lruvec *mem_cgroup_lru_add_list(struct zone *zone, struct page *page, - enum lru_list lru) +struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone) { struct mem_cgroup_per_zone *mz; struct mem_cgroup *memcg; @@ -1071,7 +1100,7 @@ struct lruvec *mem_cgroup_lru_add_list(struct zone *zone, struct page *page, memcg = pc->mem_cgroup; /* - * Surreptitiously switch any uncharged page to root: + * Surreptitiously switch any uncharged offlist page to root: * an uncharged page off lru does nothing to secure * its former mem_cgroup from sudden removal. * @@ -1079,85 +1108,60 @@ struct lruvec *mem_cgroup_lru_add_list(struct zone *zone, struct page *page, * under page_cgroup lock: between them, they make all uses * of pc->mem_cgroup safe. */ - if (!PageCgroupUsed(pc) && memcg != root_mem_cgroup) + if (!PageLRU(page) && !PageCgroupUsed(pc) && memcg != root_mem_cgroup) pc->mem_cgroup = memcg = root_mem_cgroup; mz = page_cgroup_zoneinfo(memcg, page); - /* compound_order() is stabilized through lru_lock */ - mz->lru_size[lru] += 1 << compound_order(page); return &mz->lruvec; } /** - * mem_cgroup_lru_del_list - account for removing an lru page - * @page: the page - * @lru: target lru - * - * This function accounts for @page being removed from @lru. + * mem_cgroup_update_lru_size - account for adding or removing an lru page + * @lruvec: mem_cgroup per zone lru vector + * @lru: index of lru list the page is sitting on + * @nr_pages: positive when adding or negative when removing * - * The callsite is then responsible for physically unlinking - * @page->lru. + * This function must be called when a page is added to or removed from an + * lru list. 
*/ -void mem_cgroup_lru_del_list(struct page *page, enum lru_list lru) +void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru, + int nr_pages) { struct mem_cgroup_per_zone *mz; - struct mem_cgroup *memcg; - struct page_cgroup *pc; + unsigned long *lru_size; if (mem_cgroup_disabled()) return; - pc = lookup_page_cgroup(page); - memcg = pc->mem_cgroup; - VM_BUG_ON(!memcg); - mz = page_cgroup_zoneinfo(memcg, page); - /* huge page split is done under lru_lock. so, we have no races. */ - VM_BUG_ON(mz->lru_size[lru] < (1 << compound_order(page))); - mz->lru_size[lru] -= 1 << compound_order(page); -} - -void mem_cgroup_lru_del(struct page *page) -{ - mem_cgroup_lru_del_list(page, page_lru(page)); -} - -/** - * mem_cgroup_lru_move_lists - account for moving a page between lrus - * @zone: zone of the page - * @page: the page - * @from: current lru - * @to: target lru - * - * This function accounts for @page being moved between the lrus @from - * and @to, and returns the lruvec for the given @zone and the memcg - * @page is charged to. - * - * The callsite is then responsible for physically relinking - * @page->lru to the returned lruvec->lists[@to]. 
- */ -struct lruvec *mem_cgroup_lru_move_lists(struct zone *zone, - struct page *page, - enum lru_list from, - enum lru_list to) -{ - /* XXX: Optimize this, especially for @from == @to */ - mem_cgroup_lru_del_list(page, from); - return mem_cgroup_lru_add_list(zone, page, to); + mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec); + lru_size = mz->lru_size + lru; + *lru_size += nr_pages; + VM_BUG_ON((long)(*lru_size) < 0); } /* * Checks whether given mem is same or in the root_mem_cgroup's * hierarchy subtree */ +bool __mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg, + struct mem_cgroup *memcg) +{ + if (root_memcg == memcg) + return true; + if (!root_memcg->use_hierarchy || !memcg) + return false; + return css_is_ancestor(&memcg->css, &root_memcg->css); +} + static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg, - struct mem_cgroup *memcg) + struct mem_cgroup *memcg) { - if (root_memcg != memcg) { - return (root_memcg->use_hierarchy && - css_is_ancestor(&memcg->css, &root_memcg->css)); - } + bool ret; - return true; + rcu_read_lock(); + ret = __mem_cgroup_same_or_subtree(root_memcg, memcg); + rcu_read_unlock(); + return ret; } int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg) @@ -1195,19 +1199,15 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg) return ret; } -int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg, struct zone *zone) +int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec) { unsigned long inactive_ratio; - int nid = zone_to_nid(zone); - int zid = zone_idx(zone); unsigned long inactive; unsigned long active; unsigned long gb; - inactive = mem_cgroup_zone_nr_lru_pages(memcg, nid, zid, - BIT(LRU_INACTIVE_ANON)); - active = mem_cgroup_zone_nr_lru_pages(memcg, nid, zid, - BIT(LRU_ACTIVE_ANON)); + inactive = mem_cgroup_get_lru_size(lruvec, LRU_INACTIVE_ANON); + active = mem_cgroup_get_lru_size(lruvec, LRU_ACTIVE_ANON); gb = (inactive + 
active) >> (30 - PAGE_SHIFT); if (gb) @@ -1218,55 +1218,23 @@ int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg, struct zone *zone) return inactive * inactive_ratio < active; } -int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg, struct zone *zone) +int mem_cgroup_inactive_file_is_low(struct lruvec *lruvec) { unsigned long active; unsigned long inactive; - int zid = zone_idx(zone); - int nid = zone_to_nid(zone); - inactive = mem_cgroup_zone_nr_lru_pages(memcg, nid, zid, - BIT(LRU_INACTIVE_FILE)); - active = mem_cgroup_zone_nr_lru_pages(memcg, nid, zid, - BIT(LRU_ACTIVE_FILE)); + inactive = mem_cgroup_get_lru_size(lruvec, LRU_INACTIVE_FILE); + active = mem_cgroup_get_lru_size(lruvec, LRU_ACTIVE_FILE); return (active > inactive); } -struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg, - struct zone *zone) -{ - int nid = zone_to_nid(zone); - int zid = zone_idx(zone); - struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid); - - return &mz->reclaim_stat; -} - -struct zone_reclaim_stat * -mem_cgroup_get_reclaim_stat_from_page(struct page *page) -{ - struct page_cgroup *pc; - struct mem_cgroup_per_zone *mz; - - if (mem_cgroup_disabled()) - return NULL; - - pc = lookup_page_cgroup(page); - if (!PageCgroupUsed(pc)) - return NULL; - /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */ - smp_rmb(); - mz = page_cgroup_zoneinfo(pc->mem_cgroup, page); - return &mz->reclaim_stat; -} - #define mem_cgroup_from_res_counter(counter, member) \ container_of(counter, struct mem_cgroup, member) /** * mem_cgroup_margin - calculate chargeable space of a memory cgroup - * @mem: the memory cgroup + * @memcg: the memory cgroup * * Returns the maximum amount of memory @mem can be charged with, in * pages. @@ -1540,7 +1508,7 @@ static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg, /** * test_mem_cgroup_node_reclaimable - * @mem: the target memcg + * @memcg: the target memcg * @nid: the node ID to be checked. 
* @noswap : specify true here if the user wants flle only information. * @@ -1634,7 +1602,7 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *memcg) * unused nodes. But scan_nodes is lazily updated and may not cotain * enough new information. We need to do double check. */ -bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap) +static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap) { int nid; @@ -1669,7 +1637,7 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *memcg) return 0; } -bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap) +static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap) { return test_mem_cgroup_node_reclaimable(memcg, 0, noswap); } @@ -1843,7 +1811,8 @@ static void memcg_oom_recover(struct mem_cgroup *memcg) /* * try to call OOM killer. returns false if we should exit memory-reclaim loop. */ -bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask, int order) +static bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask, + int order) { struct oom_wait_info owait; bool locked, need_to_kill; @@ -1992,7 +1961,7 @@ struct memcg_stock_pcp { unsigned int nr_pages; struct work_struct work; unsigned long flags; -#define FLUSHING_CACHED_CHARGE (0) +#define FLUSHING_CACHED_CHARGE 0 }; static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock); static DEFINE_MUTEX(percpu_charge_mutex); @@ -2139,7 +2108,7 @@ static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *memcg, int cpu) int i; spin_lock(&memcg->pcp_counter_lock); - for (i = 0; i < MEM_CGROUP_STAT_DATA; i++) { + for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { long x = per_cpu(memcg->stat->count[i], cpu); per_cpu(memcg->stat->count[i], cpu) = 0; @@ -2427,6 +2396,24 @@ static void __mem_cgroup_cancel_charge(struct mem_cgroup *memcg, } /* + * Cancel chrages in this cgroup....doesn't propagate to parent cgroup. + * This is useful when moving usage to parent cgroup. 
+ */ +static void __mem_cgroup_cancel_local_charge(struct mem_cgroup *memcg, + unsigned int nr_pages) +{ + unsigned long bytes = nr_pages * PAGE_SIZE; + + if (mem_cgroup_is_root(memcg)) + return; + + res_counter_uncharge_until(&memcg->res, memcg->res.parent, bytes); + if (do_swap_account) + res_counter_uncharge_until(&memcg->memsw, + memcg->memsw.parent, bytes); +} + +/* * A helper function to get mem_cgroup from ID. must be called under * rcu_read_lock(). The caller must check css_is_removed() or some if * it's concern. (dropping refcnt from swap can be called against removed @@ -2481,6 +2468,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, { struct page_cgroup *pc = lookup_page_cgroup(page); struct zone *uninitialized_var(zone); + struct lruvec *lruvec; bool was_on_lru = false; bool anon; @@ -2503,8 +2491,9 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, zone = page_zone(page); spin_lock_irq(&zone->lru_lock); if (PageLRU(page)) { + lruvec = mem_cgroup_zone_lruvec(zone, pc->mem_cgroup); ClearPageLRU(page); - del_page_from_lru_list(zone, page, page_lru(page)); + del_page_from_lru_list(page, lruvec, page_lru(page)); was_on_lru = true; } } @@ -2522,9 +2511,10 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, if (lrucare) { if (was_on_lru) { + lruvec = mem_cgroup_zone_lruvec(zone, pc->mem_cgroup); VM_BUG_ON(PageLRU(page)); SetPageLRU(page); - add_page_to_lru_list(zone, page, page_lru(page)); + add_page_to_lru_list(page, lruvec, page_lru(page)); } spin_unlock_irq(&zone->lru_lock); } @@ -2547,7 +2537,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, #ifdef CONFIG_TRANSPARENT_HUGEPAGE -#define PCGF_NOCOPY_AT_SPLIT ((1 << PCG_LOCK) | (1 << PCG_MIGRATION)) +#define PCGF_NOCOPY_AT_SPLIT (1 << PCG_LOCK | 1 << PCG_MIGRATION) /* * Because tail pages are not marked as "used", set it. We're under * zone->lru_lock, 'splitting on pmd' and compound_lock. 
@@ -2578,23 +2568,19 @@ void mem_cgroup_split_huge_fixup(struct page *head) * @pc: page_cgroup of the page. * @from: mem_cgroup which the page is moved from. * @to: mem_cgroup which the page is moved to. @from != @to. - * @uncharge: whether we should call uncharge and css_put against @from. * * The caller must confirm following. * - page is not on LRU (isolate_page() is useful.) * - compound_lock is held when nr_pages > 1 * - * This function doesn't do "charge" nor css_get to new cgroup. It should be - * done by a caller(__mem_cgroup_try_charge would be useful). If @uncharge is - * true, this function does "uncharge" from old cgroup, but it doesn't if - * @uncharge is false, so a caller should do "uncharge". + * This function doesn't do "charge" to new cgroup and doesn't do "uncharge" + * from old cgroup. */ static int mem_cgroup_move_account(struct page *page, unsigned int nr_pages, struct page_cgroup *pc, struct mem_cgroup *from, - struct mem_cgroup *to, - bool uncharge) + struct mem_cgroup *to) { unsigned long flags; int ret; @@ -2628,9 +2614,6 @@ static int mem_cgroup_move_account(struct page *page, preempt_enable(); } mem_cgroup_charge_statistics(from, anon, -nr_pages); - if (uncharge) - /* This is not "cancel", but cancel_charge does all we need. */ - __mem_cgroup_cancel_charge(from, nr_pages); /* caller should have done css_get */ pc->mem_cgroup = to; @@ -2664,15 +2647,13 @@ static int mem_cgroup_move_parent(struct page *page, struct mem_cgroup *child, gfp_t gfp_mask) { - struct cgroup *cg = child->css.cgroup; - struct cgroup *pcg = cg->parent; struct mem_cgroup *parent; unsigned int nr_pages; unsigned long uninitialized_var(flags); int ret; /* Is ROOT ? 
*/ - if (!pcg) + if (mem_cgroup_is_root(child)) return -EINVAL; ret = -EBUSY; @@ -2683,21 +2664,23 @@ static int mem_cgroup_move_parent(struct page *page, nr_pages = hpage_nr_pages(page); - parent = mem_cgroup_from_cont(pcg); - ret = __mem_cgroup_try_charge(NULL, gfp_mask, nr_pages, &parent, false); - if (ret) - goto put_back; + parent = parent_mem_cgroup(child); + /* + * If no parent, move charges to root cgroup. + */ + if (!parent) + parent = root_mem_cgroup; if (nr_pages > 1) flags = compound_lock_irqsave(page); - ret = mem_cgroup_move_account(page, nr_pages, pc, child, parent, true); - if (ret) - __mem_cgroup_cancel_charge(parent, nr_pages); + ret = mem_cgroup_move_account(page, nr_pages, + pc, child, parent); + if (!ret) + __mem_cgroup_cancel_local_charge(child, nr_pages); if (nr_pages > 1) compound_unlock_irqrestore(page, flags); -put_back: putback_lru_page(page); put: put_page(page); @@ -2845,24 +2828,7 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg, */ if (do_swap_account && PageSwapCache(page)) { swp_entry_t ent = {.val = page_private(page)}; - struct mem_cgroup *swap_memcg; - unsigned short id; - - id = swap_cgroup_record(ent, 0); - rcu_read_lock(); - swap_memcg = mem_cgroup_lookup(id); - if (swap_memcg) { - /* - * This recorded memcg can be obsolete one. So, avoid - * calling css_tryget - */ - if (!mem_cgroup_is_root(swap_memcg)) - res_counter_uncharge(&swap_memcg->memsw, - PAGE_SIZE); - mem_cgroup_swap_statistics(swap_memcg, false); - mem_cgroup_put(swap_memcg); - } - rcu_read_unlock(); + mem_cgroup_uncharge_swap(ent); } /* * At swapin, we may charge account against cgroup which has no tasks. @@ -3155,7 +3121,6 @@ void mem_cgroup_uncharge_swap(swp_entry_t ent) * @entry: swap entry to be moved * @from: mem_cgroup which the entry is moved from * @to: mem_cgroup which the entry is moved to - * @need_fixup: whether we should fixup res_counters and refcounts. 
* * It succeeds only when the swap_cgroup's record for this entry is the same * as the mem_cgroup's id of @from. @@ -3166,7 +3131,7 @@ void mem_cgroup_uncharge_swap(swp_entry_t ent) * both res and memsw, and called css_get(). */ static int mem_cgroup_move_swap_account(swp_entry_t entry, - struct mem_cgroup *from, struct mem_cgroup *to, bool need_fixup) + struct mem_cgroup *from, struct mem_cgroup *to) { unsigned short old_id, new_id; @@ -3185,24 +3150,13 @@ static int mem_cgroup_move_swap_account(swp_entry_t entry, * swap-in, the refcount of @to might be decreased to 0. */ mem_cgroup_get(to); - if (need_fixup) { - if (!mem_cgroup_is_root(from)) - res_counter_uncharge(&from->memsw, PAGE_SIZE); - mem_cgroup_put(from); - /* - * we charged both to->res and to->memsw, so we should - * uncharge to->res. - */ - if (!mem_cgroup_is_root(to)) - res_counter_uncharge(&to->res, PAGE_SIZE); - } return 0; } return -EINVAL; } #else static inline int mem_cgroup_move_swap_account(swp_entry_t entry, - struct mem_cgroup *from, struct mem_cgroup *to, bool need_fixup) + struct mem_cgroup *from, struct mem_cgroup *to) { return -EINVAL; } @@ -3363,7 +3317,7 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg, void mem_cgroup_replace_page_cache(struct page *oldpage, struct page *newpage) { - struct mem_cgroup *memcg; + struct mem_cgroup *memcg = NULL; struct page_cgroup *pc; enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE; @@ -3373,11 +3327,20 @@ void mem_cgroup_replace_page_cache(struct page *oldpage, pc = lookup_page_cgroup(oldpage); /* fix accounting on old pages */ lock_page_cgroup(pc); - memcg = pc->mem_cgroup; - mem_cgroup_charge_statistics(memcg, false, -1); - ClearPageCgroupUsed(pc); + if (PageCgroupUsed(pc)) { + memcg = pc->mem_cgroup; + mem_cgroup_charge_statistics(memcg, false, -1); + ClearPageCgroupUsed(pc); + } unlock_page_cgroup(pc); + /* + * When called from shmem_replace_page(), in some cases the + * oldpage has already been charged, and in some cases not. 
+ */ + if (!memcg) + return; + if (PageSwapBacked(oldpage)) type = MEM_CGROUP_CHARGE_TYPE_SHMEM; @@ -3793,7 +3756,7 @@ try_to_free: goto move_account; } -int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event) +static int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event) { return mem_cgroup_force_empty(mem_cgroup_from_cont(cont), true); } @@ -3873,14 +3836,21 @@ static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) return val << PAGE_SHIFT; } -static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) +static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft, + struct file *file, char __user *buf, + size_t nbytes, loff_t *ppos) { struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); + char str[64]; u64 val; - int type, name; + int type, name, len; type = MEMFILE_TYPE(cft->private); name = MEMFILE_ATTR(cft->private); + + if (!do_swap_account && type == _MEMSWAP) + return -EOPNOTSUPP; + switch (type) { case _MEM: if (name == RES_USAGE) @@ -3897,7 +3867,9 @@ static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) default: BUG(); } - return val; + + len = scnprintf(str, sizeof(str), "%llu\n", (unsigned long long)val); + return simple_read_from_buffer(buf, nbytes, ppos, str, len); } /* * The user of this function is... 
@@ -3913,6 +3885,10 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft, type = MEMFILE_TYPE(cft->private); name = MEMFILE_ATTR(cft->private); + + if (!do_swap_account && type == _MEMSWAP) + return -EOPNOTSUPP; + switch (name) { case RES_LIMIT: if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */ @@ -3978,12 +3954,15 @@ out: static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) { - struct mem_cgroup *memcg; + struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); int type, name; - memcg = mem_cgroup_from_cont(cont); type = MEMFILE_TYPE(event); name = MEMFILE_ATTR(event); + + if (!do_swap_account && type == _MEMSWAP) + return -EOPNOTSUPP; + switch (name) { case RES_MAX_USAGE: if (type == _MEM) @@ -4035,103 +4014,13 @@ static int mem_cgroup_move_charge_write(struct cgroup *cgrp, } #endif - -/* For read statistics */ -enum { - MCS_CACHE, - MCS_RSS, - MCS_FILE_MAPPED, - MCS_PGPGIN, - MCS_PGPGOUT, - MCS_SWAP, - MCS_PGFAULT, - MCS_PGMAJFAULT, - MCS_INACTIVE_ANON, - MCS_ACTIVE_ANON, - MCS_INACTIVE_FILE, - MCS_ACTIVE_FILE, - MCS_UNEVICTABLE, - NR_MCS_STAT, -}; - -struct mcs_total_stat { - s64 stat[NR_MCS_STAT]; -}; - -struct { - char *local_name; - char *total_name; -} memcg_stat_strings[NR_MCS_STAT] = { - {"cache", "total_cache"}, - {"rss", "total_rss"}, - {"mapped_file", "total_mapped_file"}, - {"pgpgin", "total_pgpgin"}, - {"pgpgout", "total_pgpgout"}, - {"swap", "total_swap"}, - {"pgfault", "total_pgfault"}, - {"pgmajfault", "total_pgmajfault"}, - {"inactive_anon", "total_inactive_anon"}, - {"active_anon", "total_active_anon"}, - {"inactive_file", "total_inactive_file"}, - {"active_file", "total_active_file"}, - {"unevictable", "total_unevictable"} -}; - - -static void -mem_cgroup_get_local_stat(struct mem_cgroup *memcg, struct mcs_total_stat *s) -{ - s64 val; - - /* per cpu stat */ - val = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_CACHE); - s->stat[MCS_CACHE] += val * PAGE_SIZE; - val = mem_cgroup_read_stat(memcg, 
MEM_CGROUP_STAT_RSS); - s->stat[MCS_RSS] += val * PAGE_SIZE; - val = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED); - s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE; - val = mem_cgroup_read_events(memcg, MEM_CGROUP_EVENTS_PGPGIN); - s->stat[MCS_PGPGIN] += val; - val = mem_cgroup_read_events(memcg, MEM_CGROUP_EVENTS_PGPGOUT); - s->stat[MCS_PGPGOUT] += val; - if (do_swap_account) { - val = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_SWAPOUT); - s->stat[MCS_SWAP] += val * PAGE_SIZE; - } - val = mem_cgroup_read_events(memcg, MEM_CGROUP_EVENTS_PGFAULT); - s->stat[MCS_PGFAULT] += val; - val = mem_cgroup_read_events(memcg, MEM_CGROUP_EVENTS_PGMAJFAULT); - s->stat[MCS_PGMAJFAULT] += val; - - /* per zone stat */ - val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_INACTIVE_ANON)); - s->stat[MCS_INACTIVE_ANON] += val * PAGE_SIZE; - val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_ACTIVE_ANON)); - s->stat[MCS_ACTIVE_ANON] += val * PAGE_SIZE; - val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_INACTIVE_FILE)); - s->stat[MCS_INACTIVE_FILE] += val * PAGE_SIZE; - val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_ACTIVE_FILE)); - s->stat[MCS_ACTIVE_FILE] += val * PAGE_SIZE; - val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_UNEVICTABLE)); - s->stat[MCS_UNEVICTABLE] += val * PAGE_SIZE; -} - -static void -mem_cgroup_get_total_stat(struct mem_cgroup *memcg, struct mcs_total_stat *s) -{ - struct mem_cgroup *iter; - - for_each_mem_cgroup_tree(iter, memcg) - mem_cgroup_get_local_stat(iter, s); -} - #ifdef CONFIG_NUMA -static int mem_control_numa_stat_show(struct seq_file *m, void *arg) +static int mem_control_numa_stat_show(struct cgroup *cont, struct cftype *cft, + struct seq_file *m) { int nid; unsigned long total_nr, file_nr, anon_nr, unevictable_nr; unsigned long node_nr; - struct cgroup *cont = m->private; struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); total_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL); @@ -4172,64 +4061,100 @@ static int mem_control_numa_stat_show(struct seq_file *m, void 
*arg) } #endif /* CONFIG_NUMA */ +static const char * const mem_cgroup_lru_names[] = { + "inactive_anon", + "active_anon", + "inactive_file", + "active_file", + "unevictable", +}; + +static inline void mem_cgroup_lru_names_not_uptodate(void) +{ + BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS); +} + static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, - struct cgroup_map_cb *cb) + struct seq_file *m) { struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); - struct mcs_total_stat mystat; - int i; - - memset(&mystat, 0, sizeof(mystat)); - mem_cgroup_get_local_stat(memcg, &mystat); + struct mem_cgroup *mi; + unsigned int i; - - for (i = 0; i < NR_MCS_STAT; i++) { - if (i == MCS_SWAP && !do_swap_account) + for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { + if (i == MEM_CGROUP_STAT_SWAPOUT && !do_swap_account) continue; - cb->fill(cb, memcg_stat_strings[i].local_name, mystat.stat[i]); + seq_printf(m, "%s %ld\n", mem_cgroup_stat_names[i], + mem_cgroup_read_stat(memcg, i) * PAGE_SIZE); } + for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) + seq_printf(m, "%s %lu\n", mem_cgroup_events_names[i], + mem_cgroup_read_events(memcg, i)); + + for (i = 0; i < NR_LRU_LISTS; i++) + seq_printf(m, "%s %lu\n", mem_cgroup_lru_names[i], + mem_cgroup_nr_lru_pages(memcg, BIT(i)) * PAGE_SIZE); + /* Hierarchical information */ { unsigned long long limit, memsw_limit; memcg_get_hierarchical_limit(memcg, &limit, &memsw_limit); - cb->fill(cb, "hierarchical_memory_limit", limit); + seq_printf(m, "hierarchical_memory_limit %llu\n", limit); if (do_swap_account) - cb->fill(cb, "hierarchical_memsw_limit", memsw_limit); + seq_printf(m, "hierarchical_memsw_limit %llu\n", + memsw_limit); } - memset(&mystat, 0, sizeof(mystat)); - mem_cgroup_get_total_stat(memcg, &mystat); - for (i = 0; i < NR_MCS_STAT; i++) { - if (i == MCS_SWAP && !do_swap_account) + for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { + long long val = 0; + + if (i == MEM_CGROUP_STAT_SWAPOUT && 
!do_swap_account) continue; - cb->fill(cb, memcg_stat_strings[i].total_name, mystat.stat[i]); + for_each_mem_cgroup_tree(mi, memcg) + val += mem_cgroup_read_stat(mi, i) * PAGE_SIZE; + seq_printf(m, "total_%s %lld\n", mem_cgroup_stat_names[i], val); + } + + for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) { + unsigned long long val = 0; + + for_each_mem_cgroup_tree(mi, memcg) + val += mem_cgroup_read_events(mi, i); + seq_printf(m, "total_%s %llu\n", + mem_cgroup_events_names[i], val); + } + + for (i = 0; i < NR_LRU_LISTS; i++) { + unsigned long long val = 0; + + for_each_mem_cgroup_tree(mi, memcg) + val += mem_cgroup_nr_lru_pages(mi, BIT(i)) * PAGE_SIZE; + seq_printf(m, "total_%s %llu\n", mem_cgroup_lru_names[i], val); } #ifdef CONFIG_DEBUG_VM { int nid, zid; struct mem_cgroup_per_zone *mz; + struct zone_reclaim_stat *rstat; unsigned long recent_rotated[2] = {0, 0}; unsigned long recent_scanned[2] = {0, 0}; for_each_online_node(nid) for (zid = 0; zid < MAX_NR_ZONES; zid++) { mz = mem_cgroup_zoneinfo(memcg, nid, zid); + rstat = &mz->lruvec.reclaim_stat; - recent_rotated[0] += - mz->reclaim_stat.recent_rotated[0]; - recent_rotated[1] += - mz->reclaim_stat.recent_rotated[1]; - recent_scanned[0] += - mz->reclaim_stat.recent_scanned[0]; - recent_scanned[1] += - mz->reclaim_stat.recent_scanned[1]; + recent_rotated[0] += rstat->recent_rotated[0]; + recent_rotated[1] += rstat->recent_rotated[1]; + recent_scanned[0] += rstat->recent_scanned[0]; + recent_scanned[1] += rstat->recent_scanned[1]; } - cb->fill(cb, "recent_rotated_anon", recent_rotated[0]); - cb->fill(cb, "recent_rotated_file", recent_rotated[1]); - cb->fill(cb, "recent_scanned_anon", recent_scanned[0]); - cb->fill(cb, "recent_scanned_file", recent_scanned[1]); + seq_printf(m, "recent_rotated_anon %lu\n", recent_rotated[0]); + seq_printf(m, "recent_rotated_file %lu\n", recent_rotated[1]); + seq_printf(m, "recent_scanned_anon %lu\n", recent_scanned[0]); + seq_printf(m, "recent_scanned_file %lu\n", recent_scanned[1]); 
} #endif @@ -4291,7 +4216,7 @@ static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap) usage = mem_cgroup_usage(memcg, swap); /* - * current_threshold points to threshold just below usage. + * current_threshold points to threshold just below or equal to usage. * If it's not true, a threshold was crossed after last * call of __mem_cgroup_threshold(). */ @@ -4417,14 +4342,15 @@ static int mem_cgroup_usage_register_event(struct cgroup *cgrp, /* Find current threshold */ new->current_threshold = -1; for (i = 0; i < size; i++) { - if (new->entries[i].threshold < usage) { + if (new->entries[i].threshold <= usage) { /* * new->current_threshold will not be used until * rcu_assign_pointer(), so it's safe to increment * it here. */ ++new->current_threshold; - } + } else + break; } /* Free old spare buffer and save old primary buffer as spare */ @@ -4493,7 +4419,7 @@ static void mem_cgroup_usage_unregister_event(struct cgroup *cgrp, continue; new->entries[j] = thresholds->primary->entries[i]; - if (new->entries[j].threshold < usage) { + if (new->entries[j].threshold <= usage) { /* * new->current_threshold will not be used * until rcu_assign_pointer(), so it's safe to increment @@ -4607,46 +4533,23 @@ static int mem_cgroup_oom_control_write(struct cgroup *cgrp, return 0; } -#ifdef CONFIG_NUMA -static const struct file_operations mem_control_numa_stat_file_operations = { - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -static int mem_control_numa_stat_open(struct inode *unused, struct file *file) -{ - struct cgroup *cont = file->f_dentry->d_parent->d_fsdata; - - file->f_op = &mem_control_numa_stat_file_operations; - return single_open(file, mem_control_numa_stat_show, cont); -} -#endif /* CONFIG_NUMA */ - #ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM -static int register_kmem_files(struct cgroup *cont, struct cgroup_subsys *ss) +static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) { - /* - * Part of this would be 
better living in a separate allocation - * function, leaving us with just the cgroup tree population work. - * We, however, depend on state such as network's proto_list that - * is only initialized after cgroup creation. I found the less - * cumbersome way to deal with it to defer it all to populate time - */ - return mem_cgroup_sockets_init(cont, ss); + return mem_cgroup_sockets_init(memcg, ss); }; -static void kmem_cgroup_destroy(struct cgroup *cont) +static void kmem_cgroup_destroy(struct mem_cgroup *memcg) { - mem_cgroup_sockets_destroy(cont); + mem_cgroup_sockets_destroy(memcg); } #else -static int register_kmem_files(struct cgroup *cont, struct cgroup_subsys *ss) +static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) { return 0; } -static void kmem_cgroup_destroy(struct cgroup *cont) +static void kmem_cgroup_destroy(struct mem_cgroup *memcg) { } #endif @@ -4655,7 +4558,7 @@ static struct cftype mem_cgroup_files[] = { { .name = "usage_in_bytes", .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), - .read_u64 = mem_cgroup_read, + .read = mem_cgroup_read, .register_event = mem_cgroup_usage_register_event, .unregister_event = mem_cgroup_usage_unregister_event, }, @@ -4663,29 +4566,29 @@ static struct cftype mem_cgroup_files[] = { .name = "max_usage_in_bytes", .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE), .trigger = mem_cgroup_reset, - .read_u64 = mem_cgroup_read, + .read = mem_cgroup_read, }, { .name = "limit_in_bytes", .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT), .write_string = mem_cgroup_write, - .read_u64 = mem_cgroup_read, + .read = mem_cgroup_read, }, { .name = "soft_limit_in_bytes", .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT), .write_string = mem_cgroup_write, - .read_u64 = mem_cgroup_read, + .read = mem_cgroup_read, }, { .name = "failcnt", .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT), .trigger = mem_cgroup_reset, - .read_u64 = mem_cgroup_read, + .read = mem_cgroup_read, }, { .name = "stat", - .read_map = mem_control_stat_show, + 
.read_seq_string = mem_control_stat_show, }, { .name = "force_empty", @@ -4717,18 +4620,14 @@ static struct cftype mem_cgroup_files[] = { #ifdef CONFIG_NUMA { .name = "numa_stat", - .open = mem_control_numa_stat_open, - .mode = S_IRUGO, + .read_seq_string = mem_control_numa_stat_show, }, #endif -}; - #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP -static struct cftype memsw_cgroup_files[] = { { .name = "memsw.usage_in_bytes", .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), - .read_u64 = mem_cgroup_read, + .read = mem_cgroup_read, .register_event = mem_cgroup_usage_register_event, .unregister_event = mem_cgroup_usage_unregister_event, }, @@ -4736,41 +4635,28 @@ static struct cftype memsw_cgroup_files[] = { .name = "memsw.max_usage_in_bytes", .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE), .trigger = mem_cgroup_reset, - .read_u64 = mem_cgroup_read, + .read = mem_cgroup_read, }, { .name = "memsw.limit_in_bytes", .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT), .write_string = mem_cgroup_write, - .read_u64 = mem_cgroup_read, + .read = mem_cgroup_read, }, { .name = "memsw.failcnt", .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT), .trigger = mem_cgroup_reset, - .read_u64 = mem_cgroup_read, + .read = mem_cgroup_read, }, -}; - -static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss) -{ - if (!do_swap_account) - return 0; - return cgroup_add_files(cont, ss, memsw_cgroup_files, - ARRAY_SIZE(memsw_cgroup_files)); -}; -#else -static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss) -{ - return 0; -} #endif + { }, /* terminate */ +}; static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) { struct mem_cgroup_per_node *pn; struct mem_cgroup_per_zone *mz; - enum lru_list lru; int zone, tmp = node; /* * This routine is called against possible nodes. 
@@ -4788,8 +4674,7 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) for (zone = 0; zone < MAX_NR_ZONES; zone++) { mz = &pn->zoneinfo[zone]; - for_each_lru(lru) - INIT_LIST_HEAD(&mz->lruvec.lists[lru]); + lruvec_init(&mz->lruvec, &NODE_DATA(node)->node_zones[zone]); mz->usage_in_excess = 0; mz->on_tree = false; mz->memcg = memcg; @@ -4832,23 +4717,40 @@ out_free: } /* - * Helpers for freeing a vzalloc()ed mem_cgroup by RCU, + * Helpers for freeing a kmalloc()ed/vzalloc()ed mem_cgroup by RCU, * but in process context. The work_freeing structure is overlaid * on the rcu_freeing structure, which itself is overlaid on memsw. */ -static void vfree_work(struct work_struct *work) +static void free_work(struct work_struct *work) { struct mem_cgroup *memcg; + int size = sizeof(struct mem_cgroup); memcg = container_of(work, struct mem_cgroup, work_freeing); - vfree(memcg); + /* + * We need to make sure that (at least for now), the jump label + * destruction code runs outside of the cgroup lock. This is because + * get_online_cpus(), which is called from the static_branch update, + * can't be called inside the cgroup_lock. cpusets are the ones + * enforcing this dependency, so if they ever change, we might as well. + * + * schedule_work() will guarantee this happens. Be careful if you need + * to move this code around, and make sure it is outside + * the cgroup_lock. 
+ */ + disarm_sock_keys(memcg); + if (size < PAGE_SIZE) + kfree(memcg); + else + vfree(memcg); } -static void vfree_rcu(struct rcu_head *rcu_head) + +static void free_rcu(struct rcu_head *rcu_head) { struct mem_cgroup *memcg; memcg = container_of(rcu_head, struct mem_cgroup, rcu_freeing); - INIT_WORK(&memcg->work_freeing, vfree_work); + INIT_WORK(&memcg->work_freeing, free_work); schedule_work(&memcg->work_freeing); } @@ -4874,10 +4776,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg) free_mem_cgroup_per_zone_info(memcg, node); free_percpu(memcg->stat); - if (sizeof(struct mem_cgroup) < PAGE_SIZE) - kfree_rcu(memcg, rcu_freeing); - else - call_rcu(&memcg->rcu_freeing, vfree_rcu); + call_rcu(&memcg->rcu_freeing, free_rcu); } static void mem_cgroup_get(struct mem_cgroup *memcg) @@ -5016,6 +4915,17 @@ mem_cgroup_create(struct cgroup *cont) memcg->move_charge_at_immigrate = 0; mutex_init(&memcg->thresholds_lock); spin_lock_init(&memcg->move_lock); + + error = memcg_init_kmem(memcg, &mem_cgroup_subsys); + if (error) { + /* + * We call put now because our (and parent's) refcnts + * are already in place. mem_cgroup_put() will internally + * call __mem_cgroup_free, so return directly + */ + mem_cgroup_put(memcg); + return ERR_PTR(error); + } return &memcg->css; free_out: __mem_cgroup_free(memcg); @@ -5033,28 +4943,11 @@ static void mem_cgroup_destroy(struct cgroup *cont) { struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); - kmem_cgroup_destroy(cont); + kmem_cgroup_destroy(memcg); mem_cgroup_put(memcg); } -static int mem_cgroup_populate(struct cgroup_subsys *ss, - struct cgroup *cont) -{ - int ret; - - ret = cgroup_add_files(cont, ss, mem_cgroup_files, - ARRAY_SIZE(mem_cgroup_files)); - - if (!ret) - ret = register_memsw_files(cont, ss); - - if (!ret) - ret = register_kmem_files(cont, ss); - - return ret; -} - #ifdef CONFIG_MMU /* Handlers for move charge at task migration. 
*/ #define PRECHARGE_COUNT_AT_ONCE 256 @@ -5147,7 +5040,7 @@ static struct page *mc_handle_present_pte(struct vm_area_struct *vma, return NULL; if (PageAnon(page)) { /* we don't move shared anon */ - if (!move_anon() || page_mapcount(page) > 2) + if (!move_anon()) return NULL; } else if (!move_file()) /* we ignore mapcount for file pages */ @@ -5158,32 +5051,37 @@ static struct page *mc_handle_present_pte(struct vm_area_struct *vma, return page; } +#ifdef CONFIG_SWAP static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, unsigned long addr, pte_t ptent, swp_entry_t *entry) { - int usage_count; struct page *page = NULL; swp_entry_t ent = pte_to_swp_entry(ptent); if (!move_anon() || non_swap_entry(ent)) return NULL; - usage_count = mem_cgroup_count_swap_user(ent, &page); - if (usage_count > 1) { /* we don't move shared anon */ - if (page) - put_page(page); - return NULL; - } + /* + * Because lookup_swap_cache() updates some statistics counter, + * we call find_get_page() with swapper_space directly. + */ + page = find_get_page(&swapper_space, ent.val); if (do_swap_account) entry->val = ent.val; return page; } +#else +static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, + unsigned long addr, pte_t ptent, swp_entry_t *entry) +{ + return NULL; +} +#endif static struct page *mc_handle_file_pte(struct vm_area_struct *vma, unsigned long addr, pte_t ptent, swp_entry_t *entry) { struct page *page = NULL; - struct inode *inode; struct address_space *mapping; pgoff_t pgoff; @@ -5192,7 +5090,6 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma, if (!move_file()) return NULL; - inode = vma->vm_file->f_path.dentry->d_inode; mapping = vma->vm_file->f_mapping; if (pte_none(ptent)) pgoff = linear_page_index(vma, addr); @@ -5481,7 +5378,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, * part of thp split is not executed yet. 
*/ if (pmd_trans_huge_lock(pmd, vma) == 1) { - if (!mc.precharge) { + if (mc.precharge < HPAGE_PMD_NR) { spin_unlock(&vma->vm_mm->page_table_lock); return 0; } @@ -5491,8 +5388,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, if (!isolate_lru_page(page)) { pc = lookup_page_cgroup(page); if (!mem_cgroup_move_account(page, HPAGE_PMD_NR, - pc, mc.from, mc.to, - false)) { + pc, mc.from, mc.to)) { mc.precharge -= HPAGE_PMD_NR; mc.moved_charge += HPAGE_PMD_NR; } @@ -5522,7 +5418,7 @@ retry: goto put; pc = lookup_page_cgroup(page); if (!mem_cgroup_move_account(page, 1, pc, - mc.from, mc.to, false)) { + mc.from, mc.to)) { mc.precharge--; /* we uncharge from mc.from later. */ mc.moved_charge++; @@ -5533,8 +5429,7 @@ put: /* get_mctgt_type() gets the page */ break; case MC_TARGET_SWAP: ent = target.ent; - if (!mem_cgroup_move_swap_account(ent, - mc.from, mc.to, false)) { + if (!mem_cgroup_move_swap_account(ent, mc.from, mc.to)) { mc.precharge--; /* we fixup refcnts and charges later. */ mc.moved_swap++; @@ -5610,7 +5505,6 @@ static void mem_cgroup_move_task(struct cgroup *cont, if (mm) { if (mc.to) mem_cgroup_move_charge(mm); - put_swap_token(mm); mmput(mm); } if (mc.to) @@ -5638,12 +5532,13 @@ struct cgroup_subsys mem_cgroup_subsys = { .create = mem_cgroup_create, .pre_destroy = mem_cgroup_pre_destroy, .destroy = mem_cgroup_destroy, - .populate = mem_cgroup_populate, .can_attach = mem_cgroup_can_attach, .cancel_attach = mem_cgroup_cancel_attach, .attach = mem_cgroup_move_task, + .base_cftypes = mem_cgroup_files, .early_init = 0, .use_id = 1, + .__DEPRECATED_clear_css_refs = true, }; #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 97cc2733551a..6de0d613bbe6 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -345,14 +345,14 @@ static void add_to_kill(struct task_struct *tsk, struct page *p, * Also when FAIL is set do a force kill because something went * wrong earlier. 
*/ -static void kill_procs(struct list_head *to_kill, int doit, int trapno, +static void kill_procs(struct list_head *to_kill, int forcekill, int trapno, int fail, struct page *page, unsigned long pfn, int flags) { struct to_kill *tk, *next; list_for_each_entry_safe (tk, next, to_kill, nd) { - if (doit) { + if (forcekill) { /* * In case something went wrong with munmapping * make sure the process doesn't catch the @@ -858,7 +858,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, struct address_space *mapping; LIST_HEAD(tokill); int ret; - int kill = 1; + int kill = 1, forcekill; struct page *hpage = compound_head(p); struct page *ppage; @@ -888,7 +888,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, * be called inside page lock (it's recommended but not enforced). */ mapping = page_mapping(hpage); - if (!PageDirty(hpage) && mapping && + if (!(flags & MF_MUST_KILL) && !PageDirty(hpage) && mapping && mapping_cap_writeback_dirty(mapping)) { if (page_mkclean(hpage)) { SetPageDirty(hpage); @@ -965,12 +965,14 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, * Now that the dirty bit has been propagated to the * struct page and all unmaps done we can decide if * killing is needed or not. Only kill when the page - * was dirty, otherwise the tokill list is merely + * was dirty or the process is not restartable, + * otherwise the tokill list is merely * freed. When there was a problem unmapping earlier * use a more force-full uncatchable kill to prevent * any accesses to the poisoned memory. 
*/ - kill_procs(&tokill, !!PageDirty(ppage), trapno, + forcekill = PageDirty(ppage) || (flags & MF_MUST_KILL); + kill_procs(&tokill, forcekill, trapno, ret != SWAP_SUCCESS, p, pfn, flags); return ret; @@ -1388,23 +1390,23 @@ static int get_any_page(struct page *p, unsigned long pfn, int flags) */ if (!get_page_unless_zero(compound_head(p))) { if (PageHuge(p)) { - pr_info("get_any_page: %#lx free huge page\n", pfn); + pr_info("%s: %#lx free huge page\n", __func__, pfn); ret = dequeue_hwpoisoned_huge_page(compound_head(p)); } else if (is_free_buddy_page(p)) { - pr_info("get_any_page: %#lx free buddy page\n", pfn); + pr_info("%s: %#lx free buddy page\n", __func__, pfn); /* Set hwpoison bit while page is still isolated */ SetPageHWPoison(p); ret = 0; } else { - pr_info("get_any_page: %#lx: unknown zero refcount page type %lx\n", - pfn, p->flags); + pr_info("%s: %#lx: unknown zero refcount page type %lx\n", + __func__, pfn, p->flags); ret = -EIO; } } else { /* Not a free page */ ret = 1; } - unset_migratetype_isolate(p); + unset_migratetype_isolate(p, MIGRATE_MOVABLE); unlock_memory_hotplug(); return ret; } @@ -1431,8 +1433,8 @@ static int soft_offline_huge_page(struct page *page, int flags) /* Keep page count to indicate a given hugepage is isolated. 
*/ list_add(&hpage->lru, &pagelist); - ret = migrate_huge_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0, - true); + ret = migrate_huge_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, false, + MIGRATE_SYNC); if (ret) { struct page *page1, *page2; list_for_each_entry_safe(page1, page2, &pagelist, lru) @@ -1561,7 +1563,7 @@ int soft_offline_page(struct page *page, int flags) page_is_file_cache(page)); list_add(&page->lru, &pagelist); ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, - 0, MIGRATE_SYNC); + false, MIGRATE_SYNC); if (ret) { putback_lru_pages(&pagelist); pr_info("soft offline: %#lx: migration failed %d, type %lx\n", diff --git a/mm/memory.c b/mm/memory.c index 6105f475fa86..91f69459d3e8 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -206,6 +206,8 @@ void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, bool fullmm) tlb->mm = mm; tlb->fullmm = fullmm; + tlb->start = -1UL; + tlb->end = 0; tlb->need_flush = 0; tlb->fast_mode = (num_possible_cpus() == 1); tlb->local.next = NULL; @@ -248,6 +250,8 @@ void tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long e { struct mmu_gather_batch *batch, *next; + tlb->start = start; + tlb->end = end; tlb_flush_mmu(tlb); /* keep the page table cache within bounds */ @@ -1204,6 +1208,11 @@ again: */ if (force_flush) { force_flush = 0; + +#ifdef HAVE_GENERIC_MMU_GATHER + tlb->start = addr; + tlb->end = end; +#endif tlb_flush_mmu(tlb); if (addr != end) goto again; @@ -1225,7 +1234,15 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, next = pmd_addr_end(addr, end); if (pmd_trans_huge(*pmd)) { if (next - addr != HPAGE_PMD_SIZE) { - VM_BUG_ON(!rwsem_is_locked(&tlb->mm->mmap_sem)); +#ifdef CONFIG_DEBUG_VM + if (!rwsem_is_locked(&tlb->mm->mmap_sem)) { + pr_err("%s: mmap_sem is unlocked! 
addr=0x%lx end=0x%lx vma->vm_start=0x%lx vma->vm_end=0x%lx\n", + __func__, addr, end, + vma->vm_start, + vma->vm_end); + BUG(); + } +#endif split_huge_page_pmd(vma->vm_mm, pmd); } else if (zap_huge_pmd(tlb, vma, pmd, addr)) goto next; @@ -1295,7 +1312,7 @@ static void unmap_page_range(struct mmu_gather *tlb, static void unmap_single_vma(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long start_addr, - unsigned long end_addr, unsigned long *nr_accounted, + unsigned long end_addr, struct zap_details *details) { unsigned long start = max(vma->vm_start, start_addr); @@ -1307,8 +1324,8 @@ static void unmap_single_vma(struct mmu_gather *tlb, if (end <= vma->vm_start) return; - if (vma->vm_flags & VM_ACCOUNT) - *nr_accounted += (end - start) >> PAGE_SHIFT; + if (vma->vm_file) + uprobe_munmap(vma, start, end); if (unlikely(is_pfn_mapping(vma))) untrack_pfn_vma(vma, 0, 0); @@ -1339,8 +1356,6 @@ static void unmap_single_vma(struct mmu_gather *tlb, * @vma: the starting vma * @start_addr: virtual address at which to start unmapping * @end_addr: virtual address at which to end unmapping - * @nr_accounted: Place number of unmapped pages in vm-accountable vma's here - * @details: details of nonlinear truncation or shared cache invalidation * * Unmap all pages in the vma list. 
* @@ -1355,40 +1370,40 @@ static void unmap_single_vma(struct mmu_gather *tlb, */ void unmap_vmas(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long start_addr, - unsigned long end_addr, unsigned long *nr_accounted, - struct zap_details *details) + unsigned long end_addr) { struct mm_struct *mm = vma->vm_mm; mmu_notifier_invalidate_range_start(mm, start_addr, end_addr); for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) - unmap_single_vma(tlb, vma, start_addr, end_addr, nr_accounted, - details); + unmap_single_vma(tlb, vma, start_addr, end_addr, NULL); mmu_notifier_invalidate_range_end(mm, start_addr, end_addr); } /** * zap_page_range - remove user pages in a given range * @vma: vm_area_struct holding the applicable pages - * @address: starting address of pages to zap + * @start: starting address of pages to zap * @size: number of bytes to zap * @details: details of nonlinear truncation or shared cache invalidation * * Caller must protect the VMA list */ -void zap_page_range(struct vm_area_struct *vma, unsigned long address, +void zap_page_range(struct vm_area_struct *vma, unsigned long start, unsigned long size, struct zap_details *details) { struct mm_struct *mm = vma->vm_mm; struct mmu_gather tlb; - unsigned long end = address + size; - unsigned long nr_accounted = 0; + unsigned long end = start + size; lru_add_drain(); tlb_gather_mmu(&tlb, mm, 0); update_hiwater_rss(mm); - unmap_vmas(&tlb, vma, address, end, &nr_accounted, details); - tlb_finish_mmu(&tlb, address, end); + mmu_notifier_invalidate_range_start(mm, start, end); + for ( ; vma && vma->vm_start < end; vma = vma->vm_next) + unmap_single_vma(&tlb, vma, start, end, details); + mmu_notifier_invalidate_range_end(mm, start, end); + tlb_finish_mmu(&tlb, start, end); } /** @@ -1406,13 +1421,12 @@ static void zap_page_range_single(struct vm_area_struct *vma, unsigned long addr struct mm_struct *mm = vma->vm_mm; struct mmu_gather tlb; unsigned long end = address + size; - unsigned long 
nr_accounted = 0; lru_add_drain(); tlb_gather_mmu(&tlb, mm, 0); update_hiwater_rss(mm); mmu_notifier_invalidate_range_start(mm, address, end); - unmap_single_vma(&tlb, vma, address, end, &nr_accounted, details); + unmap_single_vma(&tlb, vma, address, end, details); mmu_notifier_invalidate_range_end(mm, address, end); tlb_finish_mmu(&tlb, address, end); } @@ -2911,7 +2925,6 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, delayacct_set_flag(DELAYACCT_PF_SWAPIN); page = lookup_swap_cache(entry); if (!page) { - grab_swap_token(mm); /* Contend for token _before_ read-in */ page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, vma, address); if (!page) { @@ -2941,6 +2954,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, } locked = lock_page_or_retry(page, mm, flags); + delayacct_clear_flag(DELAYACCT_PF_SWAPIN); if (!locked) { ret |= VM_FAULT_RETRY; @@ -3489,6 +3503,7 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (unlikely(is_vm_hugetlb_page(vma))) return hugetlb_fault(mm, vma, address, flags); +retry: pgd = pgd_offset(mm, address); pud = pud_alloc(mm, pgd, address); if (!pud) @@ -3502,13 +3517,24 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, pmd, flags); } else { pmd_t orig_pmd = *pmd; + int ret; + barrier(); if (pmd_trans_huge(orig_pmd)) { if (flags & FAULT_FLAG_WRITE && !pmd_write(orig_pmd) && - !pmd_trans_splitting(orig_pmd)) - return do_huge_pmd_wp_page(mm, vma, address, - pmd, orig_pmd); + !pmd_trans_splitting(orig_pmd)) { + ret = do_huge_pmd_wp_page(mm, vma, address, pmd, + orig_pmd); + /* + * If COW results in an oom, the huge pmd will + * have been split, so retry the fault on the + * pte for a smaller charge. 
+ */ + if (unlikely(ret & VM_FAULT_OOM)) + goto retry; + return ret; + } return 0; } } diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 6629fafd6ce4..427bb291dd0f 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -74,8 +74,7 @@ static struct resource *register_memory_resource(u64 start, u64 size) res->end = start + size - 1; res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; if (request_resource(&iomem_resource, res) < 0) { - printk("System RAM resource %llx - %llx cannot be added\n", - (unsigned long long)res->start, (unsigned long long)res->end); + printk("System RAM resource %pR cannot be added\n", res); kfree(res); res = NULL; } @@ -502,8 +501,10 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages) online_pages_range); if (ret) { mutex_unlock(&zonelists_mutex); - printk(KERN_DEBUG "online_pages %lx at %lx failed\n", - nr_pages, pfn); + printk(KERN_DEBUG "online_pages [mem %#010llx-%#010llx] failed\n", + (unsigned long long) pfn << PAGE_SHIFT, + (((unsigned long long) pfn + nr_pages) + << PAGE_SHIFT) - 1); memory_notify(MEM_CANCEL_ONLINE, &arg); unlock_memory_hotplug(); return ret; @@ -617,7 +618,7 @@ int __ref add_memory(int nid, u64 start, u64 size) pgdat = hotadd_new_pgdat(nid, start); ret = -ENOMEM; if (!pgdat) - goto out; + goto error; new_pgdat = 1; } @@ -891,7 +892,7 @@ static int __ref offline_pages(unsigned long start_pfn, nr_pages = end_pfn - start_pfn; /* set above range as isolated */ - ret = start_isolate_page_range(start_pfn, end_pfn); + ret = start_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE); if (ret) goto out; @@ -956,7 +957,7 @@ repeat: We cannot do rollback at this point. 
*/ offline_isolated_pages(start_pfn, end_pfn); /* reset pagetype flags and makes migrate type to be MOVABLE */ - undo_isolate_page_range(start_pfn, end_pfn); + undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE); /* removal success */ zone->present_pages -= offlined_pages; zone->zone_pgdat->node_present_pages -= offlined_pages; @@ -977,11 +978,12 @@ repeat: return 0; failed_removal: - printk(KERN_INFO "memory offlining %lx to %lx failed\n", - start_pfn, end_pfn); + printk(KERN_INFO "memory offlining [mem %#010llx-%#010llx] failed\n", + (unsigned long long) start_pfn << PAGE_SHIFT, + ((unsigned long long) end_pfn << PAGE_SHIFT) - 1); memory_notify(MEM_CANCEL_OFFLINE, &arg); /* pushback to free area */ - undo_isolate_page_range(start_pfn, end_pfn); + undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE); out: unlock_memory_hotplug(); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index b19569137529..bd92431d4c49 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -390,7 +390,7 @@ static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask, { if (!pol) return; - if (!mpol_store_user_nodemask(pol) && step == 0 && + if (!mpol_store_user_nodemask(pol) && step == MPOL_REBIND_ONCE && nodes_equal(pol->w.cpuset_mems_allowed, *newmask)) return; @@ -607,27 +607,6 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end, return first; } -/* Apply policy to a single VMA */ -static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new) -{ - int err = 0; - struct mempolicy *old = vma->vm_policy; - - pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n", - vma->vm_start, vma->vm_end, vma->vm_pgoff, - vma->vm_ops, vma->vm_file, - vma->vm_ops ? vma->vm_ops->set_policy : NULL); - - if (vma->vm_ops && vma->vm_ops->set_policy) - err = vma->vm_ops->set_policy(vma, new); - if (!err) { - mpol_get(new); - vma->vm_policy = new; - mpol_put(old); - } - return err; -} - /* Step 2: apply policy to a range and do splits. 
*/ static int mbind_range(struct mm_struct *mm, unsigned long start, unsigned long end, struct mempolicy *new_pol) @@ -676,9 +655,23 @@ static int mbind_range(struct mm_struct *mm, unsigned long start, if (err) goto out; } - err = policy_vma(vma, new_pol); - if (err) - goto out; + + /* + * Apply policy to a single VMA. The reference counting of + * policy for vma_policy linkages has already been handled by + * vma_merge and split_vma as necessary. If this is a shared + * policy then ->set_policy will increment the reference count + * for an sp node. + */ + pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n", + vma->vm_start, vma->vm_end, vma->vm_pgoff, + vma->vm_ops, vma->vm_file, + vma->vm_ops ? vma->vm_ops->set_policy : NULL); + if (vma->vm_ops && vma->vm_ops->set_policy) { + err = vma->vm_ops->set_policy(vma, new_pol); + if (err) + goto out; + } } out: @@ -957,8 +950,8 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest, * * Returns the number of page that could not be moved. */ -int do_migrate_pages(struct mm_struct *mm, - const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags) +int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, + const nodemask_t *to, int flags) { int busy = 0; int err; @@ -970,7 +963,7 @@ int do_migrate_pages(struct mm_struct *mm, down_read(&mm->mmap_sem); - err = migrate_vmas(mm, from_nodes, to_nodes, flags); + err = migrate_vmas(mm, from, to, flags); if (err) goto out; @@ -1005,14 +998,34 @@ int do_migrate_pages(struct mm_struct *mm, * moved to an empty node, then there is nothing left worth migrating. */ - tmp = *from_nodes; + tmp = *from; while (!nodes_empty(tmp)) { int s,d; int source = -1; int dest = 0; for_each_node_mask(s, tmp) { - d = node_remap(s, *from_nodes, *to_nodes); + + /* + * do_migrate_pages() tries to maintain the relative + * node relationship of the pages established between + * threads and memory areas. 
+ * + * However if the number of source nodes is not equal to + * the number of destination nodes we can not preserve + * this node relative relationship. In that case, skip + * copying memory from a node that is in the destination + * mask. + * + * Example: [2,3,4] -> [3,4,5] moves everything. + * [0-7] - > [3,4,5] moves only 0,1,2,6,7. + */ + + if ((nodes_weight(*from) != nodes_weight(*to)) && + (node_isset(s, *to))) + continue; + + d = node_remap(s, *from, *to); if (s == d) continue; @@ -1072,8 +1085,8 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist, { } -int do_migrate_pages(struct mm_struct *mm, - const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags) +int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, + const nodemask_t *to, int flags) { return -ENOSYS; } @@ -1164,7 +1177,7 @@ static long do_mbind(unsigned long start, unsigned long len, if (!list_empty(&pagelist)) { nr_failed = migrate_pages(&pagelist, new_vma_page, (unsigned long)vma, - false, true); + false, MIGRATE_SYNC); if (nr_failed) putback_lru_pages(&pagelist); } @@ -1334,8 +1347,8 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode, * userid as the target process. */ tcred = __task_cred(task); - if (cred->euid != tcred->suid && cred->euid != tcred->uid && - cred->uid != tcred->suid && cred->uid != tcred->uid && + if (!uid_eq(cred->euid, tcred->suid) && !uid_eq(cred->euid, tcred->uid) && + !uid_eq(cred->uid, tcred->suid) && !uid_eq(cred->uid, tcred->uid) && !capable(CAP_SYS_NICE)) { rcu_read_unlock(); err = -EPERM; @@ -1589,8 +1602,14 @@ static unsigned interleave_nodes(struct mempolicy *policy) * task can change it's policy. The system default policy requires no * such protection. 
*/ -unsigned slab_node(struct mempolicy *policy) +unsigned slab_node(void) { + struct mempolicy *policy; + + if (in_interrupt()) + return numa_node_id(); + + policy = current->mempolicy; if (!policy || policy->flags & MPOL_F_LOCAL) return numa_node_id(); diff --git a/mm/migrate.c b/mm/migrate.c index 11072383ae12..be26d5cbe56b 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -436,7 +436,10 @@ void migrate_page_copy(struct page *newpage, struct page *page) * is actually a signal that all of the page has become dirty. * Whereas only part of our page may be dirty. */ - __set_page_dirty_nobuffers(newpage); + if (PageSwapBacked(page)) + SetPageDirty(newpage); + else + __set_page_dirty_nobuffers(newpage); } mlock_migrate_page(newpage, page); @@ -1371,8 +1374,8 @@ SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages, * userid as the target process. */ tcred = __task_cred(task); - if (cred->euid != tcred->suid && cred->euid != tcred->uid && - cred->uid != tcred->suid && cred->uid != tcred->uid && + if (!uid_eq(cred->euid, tcred->suid) && !uid_eq(cred->euid, tcred->uid) && + !uid_eq(cred->uid, tcred->suid) && !uid_eq(cred->uid, tcred->uid) && !capable(CAP_SYS_NICE)) { rcu_read_unlock(); err = -EPERM; diff --git a/mm/mmap.c b/mm/mmap.c index 848ef52d9603..4fe2697339ed 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -30,6 +30,7 @@ #include <linux/perf_event.h> #include <linux/audit.h> #include <linux/khugepaged.h> +#include <linux/uprobes.h> #include <asm/uaccess.h> #include <asm/cacheflush.h> @@ -546,8 +547,15 @@ again: remove_next = 1 + (end > next->vm_end); if (file) { mapping = file->f_mapping; - if (!(vma->vm_flags & VM_NONLINEAR)) + if (!(vma->vm_flags & VM_NONLINEAR)) { root = &mapping->i_mmap; + uprobe_munmap(vma, vma->vm_start, vma->vm_end); + + if (adjust_next) + uprobe_munmap(next, next->vm_start, + next->vm_end); + } + mutex_lock(&mapping->i_mmap_mutex); if (insert) { /* @@ -617,8 +625,16 @@ again: remove_next = 1 + (end > next->vm_end); if (mapping) 
mutex_unlock(&mapping->i_mmap_mutex); + if (root) { + uprobe_mmap(vma); + + if (adjust_next) + uprobe_mmap(next); + } + if (remove_next) { if (file) { + uprobe_munmap(next, next->vm_start, next->vm_end); fput(file); if (next->vm_flags & VM_EXECUTABLE) removed_exe_file_vma(mm); @@ -638,6 +654,8 @@ again: remove_next = 1 + (end > next->vm_end); goto again; } } + if (insert && file) + uprobe_mmap(insert); validate_mm(mm); @@ -953,15 +971,13 @@ static inline unsigned long round_hint_to_min(unsigned long hint) * The caller must hold down_write(&current->mm->mmap_sem). */ -static unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, +unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, unsigned long pgoff) { struct mm_struct * mm = current->mm; struct inode *inode; vm_flags_t vm_flags; - int error; - unsigned long reqprot = prot; /* * Does the application expect PROT_READ to imply PROT_EXEC? @@ -1083,39 +1099,9 @@ static unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, } } - error = security_file_mmap(file, reqprot, prot, flags, addr, 0); - if (error) - return error; - return mmap_region(file, addr, len, flags, vm_flags, pgoff); } -unsigned long do_mmap(struct file *file, unsigned long addr, - unsigned long len, unsigned long prot, - unsigned long flag, unsigned long offset) -{ - if (unlikely(offset + PAGE_ALIGN(len) < offset)) - return -EINVAL; - if (unlikely(offset & ~PAGE_MASK)) - return -EINVAL; - return do_mmap_pgoff(file, addr, len, prot, flag, offset >> PAGE_SHIFT); -} -EXPORT_SYMBOL(do_mmap); - -unsigned long vm_mmap(struct file *file, unsigned long addr, - unsigned long len, unsigned long prot, - unsigned long flag, unsigned long offset) -{ - unsigned long ret; - struct mm_struct *mm = current->mm; - - down_write(&mm->mmap_sem); - ret = do_mmap(file, addr, len, prot, flag, offset); - up_write(&mm->mmap_sem); - return ret; -} -EXPORT_SYMBOL(vm_mmap); -
SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, unsigned long, prot, unsigned long, flags, unsigned long, fd, unsigned long, pgoff) @@ -1147,10 +1133,7 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); - down_write(&current->mm->mmap_sem); - retval = do_mmap_pgoff(file, addr, len, prot, flags, pgoff); - up_write(&current->mm->mmap_sem); - + retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff); if (file) fput(file); out: @@ -1371,6 +1354,11 @@ out: mm->locked_vm += (len >> PAGE_SHIFT); } else if ((flags & MAP_POPULATE) && !(flags & MAP_NONBLOCK)) make_pages_present(addr, addr + len); + + if (file && uprobe_mmap(vma)) + /* matching probes but cannot insert */ + goto unmap_and_free_vma; + return addr; unmap_and_free_vma: @@ -1606,7 +1594,9 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, if (addr & ~PAGE_MASK) return -EINVAL; - return arch_rebalance_pgtables(addr, len); + addr = arch_rebalance_pgtables(addr, len); + error = security_mmap_addr(addr); + return error ? error : addr; } EXPORT_SYMBOL(get_unmapped_area); @@ -1616,33 +1606,34 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) { struct vm_area_struct *vma = NULL; - if (mm) { - /* Check the cache first. */ - /* (Cache hit rate is typically around 35%.) */ - vma = mm->mmap_cache; - if (!(vma && vma->vm_end > addr && vma->vm_start <= addr)) { - struct rb_node * rb_node; - - rb_node = mm->mm_rb.rb_node; - vma = NULL; - - while (rb_node) { - struct vm_area_struct * vma_tmp; - - vma_tmp = rb_entry(rb_node, - struct vm_area_struct, vm_rb); - - if (vma_tmp->vm_end > addr) { - vma = vma_tmp; - if (vma_tmp->vm_start <= addr) - break; - rb_node = rb_node->rb_left; - } else - rb_node = rb_node->rb_right; - } - if (vma) - mm->mmap_cache = vma; + if (WARN_ON_ONCE(!mm)) /* Remove this in linux-3.6 */ + return NULL; + + /* Check the cache first. 
*/ + /* (Cache hit rate is typically around 35%.) */ + vma = mm->mmap_cache; + if (!(vma && vma->vm_end > addr && vma->vm_start <= addr)) { + struct rb_node *rb_node; + + rb_node = mm->mm_rb.rb_node; + vma = NULL; + + while (rb_node) { + struct vm_area_struct *vma_tmp; + + vma_tmp = rb_entry(rb_node, + struct vm_area_struct, vm_rb); + + if (vma_tmp->vm_end > addr) { + vma = vma_tmp; + if (vma_tmp->vm_start <= addr) + break; + rb_node = rb_node->rb_left; + } else + rb_node = rb_node->rb_right; } + if (vma) + mm->mmap_cache = vma; } return vma; } @@ -1795,7 +1786,7 @@ int expand_downwards(struct vm_area_struct *vma, return -ENOMEM; address &= PAGE_MASK; - error = security_file_mmap(NULL, 0, 0, 0, address, 1); + error = security_mmap_addr(address); if (error) return error; @@ -1889,15 +1880,20 @@ find_extend_vma(struct mm_struct * mm, unsigned long addr) */ static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma) { + unsigned long nr_accounted = 0; + /* Update high watermark before we lower total_vm */ update_hiwater_vm(mm); do { long nrpages = vma_pages(vma); + if (vma->vm_flags & VM_ACCOUNT) + nr_accounted += nrpages; mm->total_vm -= nrpages; vm_stat_account(mm, vma->vm_flags, vma->vm_file, -nrpages); vma = remove_vma(vma); } while (vma); + vm_unacct_memory(nr_accounted); validate_mm(mm); } @@ -1912,13 +1908,11 @@ static void unmap_region(struct mm_struct *mm, { struct vm_area_struct *next = prev? prev->vm_next: mm->mmap; struct mmu_gather tlb; - unsigned long nr_accounted = 0; lru_add_drain(); tlb_gather_mmu(&tlb, mm, 0); update_hiwater_rss(mm); - unmap_vmas(&tlb, vma, start, end, &nr_accounted, NULL); - vm_unacct_memory(nr_accounted); + unmap_vmas(&tlb, vma, start, end); free_pgtables(&tlb, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS, next ? 
next->vm_start : 0); tlb_finish_mmu(&tlb, start, end); @@ -2132,7 +2126,6 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) return 0; } -EXPORT_SYMBOL(do_munmap); int vm_munmap(unsigned long start, size_t len) { @@ -2180,10 +2173,6 @@ static unsigned long do_brk(unsigned long addr, unsigned long len) if (!len) return addr; - error = security_file_mmap(NULL, 0, 0, 0, addr, 1); - if (error) - return error; - flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; error = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED); @@ -2305,8 +2294,7 @@ void exit_mmap(struct mm_struct *mm) tlb_gather_mmu(&tlb, mm, 1); /* update_hiwater_rss(mm) here? but nobody should be looking */ /* Use -1 here to ensure all VMAs in the mm are unmapped */ - unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL); - vm_unacct_memory(nr_accounted); + unmap_vmas(&tlb, vma, 0, -1); free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, 0); tlb_finish_mmu(&tlb, 0, -1); @@ -2315,8 +2303,12 @@ void exit_mmap(struct mm_struct *mm) * Walk the list again, actually closing and freeing it, * with preemption enabled, without holding any MM locks. 
*/ - while (vma) + while (vma) { + if (vma->vm_flags & VM_ACCOUNT) + nr_accounted += vma_pages(vma); vma = remove_vma(vma); + } + vm_unacct_memory(nr_accounted); BUG_ON(mm->nr_ptes > (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT); } @@ -2352,6 +2344,7 @@ int insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma) if ((vma->vm_flags & VM_ACCOUNT) && security_vm_enough_memory_mm(mm, vma_pages(vma))) return -ENOMEM; + vma_link(mm, vma, prev, rb_link, rb_parent); return 0; } @@ -2421,6 +2414,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, new_vma->vm_pgoff = pgoff; if (new_vma->vm_file) { get_file(new_vma->vm_file); + if (vma->vm_flags & VM_EXECUTABLE) added_exe_file_vma(mm); } @@ -2525,10 +2519,6 @@ int install_special_mapping(struct mm_struct *mm, vma->vm_ops = &special_mapping_vmops; vma->vm_private_data = pages; - ret = security_file_mmap(NULL, 0, 0, 0, vma->vm_start, 1); - if (ret) - goto out; - ret = insert_vm_struct(mm, vma); if (ret) goto out; diff --git a/mm/mmzone.c b/mm/mmzone.c index 7cf7b7ddc7c5..6830eab5bf09 100644 --- a/mm/mmzone.c +++ b/mm/mmzone.c @@ -86,3 +86,17 @@ int memmap_valid_within(unsigned long pfn, return 1; } #endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */ + +void lruvec_init(struct lruvec *lruvec, struct zone *zone) +{ + enum lru_list lru; + + memset(lruvec, 0, sizeof(struct lruvec)); + + for_each_lru(lru) + INIT_LIST_HEAD(&lruvec->lists[lru]); + +#ifdef CONFIG_CGROUP_MEM_RES_CTLR + lruvec->zone = zone; +#endif +} diff --git a/mm/mremap.c b/mm/mremap.c index db8d983b5a7d..21fed202ddad 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -371,10 +371,6 @@ static unsigned long mremap_to(unsigned long addr, if ((addr <= new_addr) && (addr+old_len) > new_addr) goto out; - ret = security_file_mmap(NULL, 0, 0, 0, new_addr, 1); - if (ret) - goto out; - ret = do_munmap(mm, new_addr, new_len); if (ret) goto out; @@ -432,15 +428,17 @@ static int vma_expandable(struct vm_area_struct *vma, unsigned long delta) * MREMAP_FIXED option 
added 5-Dec-1999 by Benjamin LaHaise * This option implies MREMAP_MAYMOVE. */ -unsigned long do_mremap(unsigned long addr, - unsigned long old_len, unsigned long new_len, - unsigned long flags, unsigned long new_addr) +SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, + unsigned long, new_len, unsigned long, flags, + unsigned long, new_addr) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma; unsigned long ret = -EINVAL; unsigned long charged = 0; + down_write(&current->mm->mmap_sem); + if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE)) goto out; @@ -530,25 +528,11 @@ unsigned long do_mremap(unsigned long addr, goto out; } - ret = security_file_mmap(NULL, 0, 0, 0, new_addr, 1); - if (ret) - goto out; ret = move_vma(vma, addr, old_len, new_len, new_addr); } out: if (ret & ~PAGE_MASK) vm_unacct_memory(charged); - return ret; -} - -SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, - unsigned long, new_len, unsigned long, flags, - unsigned long, new_addr) -{ - unsigned long ret; - - down_write(&current->mm->mmap_sem); - ret = do_mremap(addr, old_len, new_len, flags, new_addr); up_write(&current->mm->mmap_sem); return ret; } diff --git a/mm/nobootmem.c b/mm/nobootmem.c index 1983fb1c7026..405573010f99 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c @@ -105,27 +105,35 @@ static void __init __free_pages_memory(unsigned long start, unsigned long end) __free_pages_bootmem(pfn_to_page(i), 0); } +static unsigned long __init __free_memory_core(phys_addr_t start, + phys_addr_t end) +{ + unsigned long start_pfn = PFN_UP(start); + unsigned long end_pfn = min_t(unsigned long, + PFN_DOWN(end), max_low_pfn); + + if (start_pfn > end_pfn) + return 0; + + __free_pages_memory(start_pfn, end_pfn); + + return end_pfn - start_pfn; +} + unsigned long __init free_low_memory_core_early(int nodeid) { unsigned long count = 0; - phys_addr_t start, end; + phys_addr_t start, end, size; u64 i; - /* free reserved array temporarily so that it's treated as free area */ -
memblock_free_reserved_regions(); - - for_each_free_mem_range(i, MAX_NUMNODES, &start, &end, NULL) { - unsigned long start_pfn = PFN_UP(start); - unsigned long end_pfn = min_t(unsigned long, - PFN_DOWN(end), max_low_pfn); - if (start_pfn < end_pfn) { - __free_pages_memory(start_pfn, end_pfn); - count += end_pfn - start_pfn; - } - } + for_each_free_mem_range(i, MAX_NUMNODES, &start, &end, NULL) + count += __free_memory_core(start, end); + + /* free range that is used for reserved array if we allocate it */ + size = get_allocated_memblock_reserved_regions_info(&start); + if (size) + count += __free_memory_core(start, start + size); - /* put region array back? */ - memblock_reserve_reserved_regions(); return count; } @@ -274,86 +282,85 @@ void * __init __alloc_bootmem(unsigned long size, unsigned long align, return ___alloc_bootmem(size, align, goal, limit); } -/** - * __alloc_bootmem_node - allocate boot memory from a specific node - * @pgdat: node to allocate from - * @size: size of the request in bytes - * @align: alignment of the region - * @goal: preferred starting address of the region - * - * The goal is dropped if it can not be satisfied and the allocation will - * fall back to memory below @goal. - * - * Allocation may fall back to any node in the system if the specified node - * can not hold the requested memory. - * - * The function panics if the request can not be satisfied. 
- */ -void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, - unsigned long align, unsigned long goal) +void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat, + unsigned long size, + unsigned long align, + unsigned long goal, + unsigned long limit) { void *ptr; - if (WARN_ON_ONCE(slab_is_available())) - return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); - again: ptr = __alloc_memory_core_early(pgdat->node_id, size, align, - goal, -1ULL); + goal, limit); if (ptr) return ptr; ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, - goal, -1ULL); - if (!ptr && goal) { + goal, limit); + if (ptr) + return ptr; + + if (goal) { goal = 0; goto again; } - return ptr; + + return NULL; } -void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size, +void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size, unsigned long align, unsigned long goal) { - return __alloc_bootmem_node(pgdat, size, align, goal); + if (WARN_ON_ONCE(slab_is_available())) + return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); + + return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0); } -#ifdef CONFIG_SPARSEMEM -/** - * alloc_bootmem_section - allocate boot memory from a specific section - * @size: size of the request in bytes - * @section_nr: sparse map section to allocate from - * - * Return NULL on failure. 
- */ -void * __init alloc_bootmem_section(unsigned long size, - unsigned long section_nr) +void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, + unsigned long align, unsigned long goal, + unsigned long limit) { - unsigned long pfn, goal, limit; + void *ptr; - pfn = section_nr_to_pfn(section_nr); - goal = pfn << PAGE_SHIFT; - limit = section_nr_to_pfn(section_nr + 1) << PAGE_SHIFT; + ptr = ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, limit); + if (ptr) + return ptr; - return __alloc_memory_core_early(early_pfn_to_nid(pfn), size, - SMP_CACHE_BYTES, goal, limit); + printk(KERN_ALERT "bootmem alloc of %lu bytes failed!\n", size); + panic("Out of memory"); + return NULL; } -#endif -void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size, +/** + * __alloc_bootmem_node - allocate boot memory from a specific node + * @pgdat: node to allocate from + * @size: size of the request in bytes + * @align: alignment of the region + * @goal: preferred starting address of the region + * + * The goal is dropped if it can not be satisfied and the allocation will + * fall back to memory below @goal. + * + * Allocation may fall back to any node in the system if the specified node + * can not hold the requested memory. + * + * The function panics if the request can not be satisfied. 
+ */ +void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, unsigned long align, unsigned long goal) { - void *ptr; - if (WARN_ON_ONCE(slab_is_available())) return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); - ptr = __alloc_memory_core_early(pgdat->node_id, size, align, - goal, -1ULL); - if (ptr) - return ptr; + return ___alloc_bootmem_node(pgdat, size, align, goal, 0); +} - return __alloc_bootmem_nopanic(size, align, goal); +void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size, + unsigned long align, unsigned long goal) +{ + return __alloc_bootmem_node(pgdat, size, align, goal); } #ifndef ARCH_LOW_ADDRESS_LIMIT @@ -397,16 +404,9 @@ void * __init __alloc_bootmem_low(unsigned long size, unsigned long align, void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size, unsigned long align, unsigned long goal) { - void *ptr; - if (WARN_ON_ONCE(slab_is_available())) return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); - ptr = __alloc_memory_core_early(pgdat->node_id, size, align, - goal, ARCH_LOW_ADDRESS_LIMIT); - if (ptr) - return ptr; - - return __alloc_memory_core_early(MAX_NUMNODES, size, align, - goal, ARCH_LOW_ADDRESS_LIMIT); + return ___alloc_bootmem_node(pgdat, size, align, goal, + ARCH_LOW_ADDRESS_LIMIT); } diff --git a/mm/nommu.c b/mm/nommu.c index bb8f4f004a82..d4b0c10872de 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -889,7 +889,6 @@ static int validate_mmap_request(struct file *file, unsigned long *_capabilities) { unsigned long capabilities, rlen; - unsigned long reqprot = prot; int ret; /* do the simple checks first */ @@ -1047,7 +1046,7 @@ static int validate_mmap_request(struct file *file, } /* allow the security API to have its say */ - ret = security_file_mmap(file, reqprot, prot, flags, addr, 0); + ret = security_mmap_addr(addr); if (ret < 0) return ret; @@ -1233,7 +1232,7 @@ enomem: /* * handle mapping creation for uClinux */ -static unsigned long do_mmap_pgoff(struct file *file, 
+unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, @@ -1471,32 +1470,6 @@ error_getting_region: return -ENOMEM; } -unsigned long do_mmap(struct file *file, unsigned long addr, - unsigned long len, unsigned long prot, - unsigned long flag, unsigned long offset) -{ - if (unlikely(offset + PAGE_ALIGN(len) < offset)) - return -EINVAL; - if (unlikely(offset & ~PAGE_MASK)) - return -EINVAL; - return do_mmap_pgoff(file, addr, len, prot, flag, offset >> PAGE_SHIFT); -} -EXPORT_SYMBOL(do_mmap); - -unsigned long vm_mmap(struct file *file, unsigned long addr, - unsigned long len, unsigned long prot, - unsigned long flag, unsigned long offset) -{ - unsigned long ret; - struct mm_struct *mm = current->mm; - - down_write(&mm->mmap_sem); - ret = do_mmap(file, addr, len, prot, flag, offset); - up_write(&mm->mmap_sem); - return ret; -} -EXPORT_SYMBOL(vm_mmap); - SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, unsigned long, prot, unsigned long, flags, unsigned long, fd, unsigned long, pgoff) @@ -1513,9 +1486,7 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); - down_write(&current->mm->mmap_sem); - retval = do_mmap_pgoff(file, addr, len, prot, flags, pgoff); - up_write(&current->mm->mmap_sem); + retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff); if (file) fput(file); diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 46bf2ed5594c..ac300c99baf6 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -180,10 +180,11 @@ static bool oom_unkillable_task(struct task_struct *p, * predictable as possible. The goal is to return the highest value for the * task consuming the most memory to avoid subsequent oom failures. 
*/ -unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *memcg, - const nodemask_t *nodemask, unsigned long totalpages) +unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, + const nodemask_t *nodemask, unsigned long totalpages) { long points; + long adj; if (oom_unkillable_task(p, memcg, nodemask)) return 0; @@ -192,27 +193,18 @@ unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *memcg, if (!p) return 0; - if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) { + adj = p->signal->oom_score_adj; + if (adj == OOM_SCORE_ADJ_MIN) { task_unlock(p); return 0; } /* - * The memory controller may have a limit of 0 bytes, so avoid a divide - * by zero, if necessary. - */ - if (!totalpages) - totalpages = 1; - - /* * The baseline for the badness score is the proportion of RAM that each * task's rss, pagetable and swap space use. */ - points = get_mm_rss(p->mm) + p->mm->nr_ptes; - points += get_mm_counter(p->mm, MM_SWAPENTS); - - points *= 1000; - points /= totalpages; + points = get_mm_rss(p->mm) + p->mm->nr_ptes + + get_mm_counter(p->mm, MM_SWAPENTS); task_unlock(p); /* @@ -220,23 +212,17 @@ unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *memcg, * implementation used by LSMs. */ if (has_capability_noaudit(p, CAP_SYS_ADMIN)) - points -= 30; + adj -= 30; - /* - * /proc/pid/oom_score_adj ranges from -1000 to +1000 such that it may - * either completely disable oom killing or always prefer a certain - * task. - */ - points += p->signal->oom_score_adj; + /* Normalize to oom_score_adj units */ + adj *= totalpages / 1000; + points += adj; /* - * Never return 0 for an eligible task that may be killed since it's - * possible that no single user task uses more than 0.1% of memory and - * no single admin tasks uses more than 3.0%. + * Never return 0 for an eligible task regardless of the root bonus and + * oom_score_adj (oom_score_adj can't be OOM_SCORE_ADJ_MIN here). 
*/ - if (points <= 0) - return 1; - return (points < 1000) ? points : 1000; + return points > 0 ? points : 1; } /* @@ -314,7 +300,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints, { struct task_struct *g, *p; struct task_struct *chosen = NULL; - *ppoints = 0; + unsigned long chosen_points = 0; do_each_thread(g, p) { unsigned int points; @@ -354,7 +340,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints, */ if (p == current) { chosen = p; - *ppoints = 1000; + chosen_points = ULONG_MAX; } else if (!force_kill) { /* * If this task is not being ptraced on exit, @@ -367,18 +353,19 @@ static struct task_struct *select_bad_process(unsigned int *ppoints, } points = oom_badness(p, memcg, nodemask, totalpages); - if (points > *ppoints) { + if (points > chosen_points) { chosen = p; - *ppoints = points; + chosen_points = points; } } while_each_thread(g, p); + *ppoints = chosen_points * 1000 / totalpages; return chosen; } /** * dump_tasks - dump current memory state of all system tasks - * @mem: current's memory controller, if constrained + * @memcg: current's memory controller, if constrained * @nodemask: nodemask passed to page allocator for mempolicy ooms * * Dumps the current memory state of all eligible tasks. 
Tasks not in the same @@ -410,8 +397,8 @@ static void dump_tasks(const struct mem_cgroup *memcg, const nodemask_t *nodemas } pr_info("[%5d] %5d %5d %8lu %8lu %3u %3d %5d %s\n", - task->pid, task_uid(task), task->tgid, - task->mm->total_vm, get_mm_rss(task->mm), + task->pid, from_kuid(&init_user_ns, task_uid(task)), + task->tgid, task->mm->total_vm, get_mm_rss(task->mm), task_cpu(task), task->signal->oom_adj, task->signal->oom_score_adj, task->comm); task_unlock(task); @@ -572,7 +559,7 @@ void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, } check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL); - limit = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT; + limit = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT ? : 1; read_lock(&tasklist_lock); p = select_bad_process(&points, limit, memcg, NULL, false); if (p && PTR_ERR(p) != -1UL) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 26adea8ca2e7..e5363f34e025 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -34,6 +34,7 @@ #include <linux/syscalls.h> #include <linux/buffer_head.h> /* __set_page_dirty_buffers */ #include <linux/pagevec.h> +#include <linux/timer.h> #include <trace/events/writeback.h> /* @@ -135,7 +136,20 @@ unsigned long global_dirty_limit; * measured in page writeback completions. * */ -static struct prop_descriptor vm_completions; +static struct fprop_global writeout_completions; + +static void writeout_period(unsigned long t); +/* Timer for aging of writeout_completions */ +static struct timer_list writeout_period_timer = + TIMER_DEFERRED_INITIALIZER(writeout_period, 0, 0); +static unsigned long writeout_period_time = 0; + +/* + * Length of period for aging writeout fractions of bdis. This is an + * arbitrarily chosen number. The longer the period, the slower fractions will + * reflect changes in current writeout rate. 
+ */ +#define VM_COMPLETIONS_PERIOD_LEN (3*HZ) /* * Work out the current dirty-memory clamping and background writeout @@ -204,7 +218,7 @@ static unsigned long highmem_dirtyable_memory(unsigned long total) * Returns the global number of pages potentially available for dirty * page cache. This is the base value for the global dirty limits. */ -unsigned long global_dirtyable_memory(void) +static unsigned long global_dirtyable_memory(void) { unsigned long x; @@ -322,34 +336,6 @@ bool zone_dirty_ok(struct zone *zone) zone_page_state(zone, NR_WRITEBACK) <= limit; } -/* - * couple the period to the dirty_ratio: - * - * period/2 ~ roundup_pow_of_two(dirty limit) - */ -static int calc_period_shift(void) -{ - unsigned long dirty_total; - - if (vm_dirty_bytes) - dirty_total = vm_dirty_bytes / PAGE_SIZE; - else - dirty_total = (vm_dirty_ratio * global_dirtyable_memory()) / - 100; - return 2 + ilog2(dirty_total - 1); -} - -/* - * update the period when the dirty threshold changes. - */ -static void update_completion_period(void) -{ - int shift = calc_period_shift(); - prop_change_shift(&vm_completions, shift); - - writeback_set_ratelimit(); -} - int dirty_background_ratio_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) @@ -383,7 +369,7 @@ int dirty_ratio_handler(struct ctl_table *table, int write, ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (ret == 0 && write && vm_dirty_ratio != old_ratio) { - update_completion_period(); + writeback_set_ratelimit(); vm_dirty_bytes = 0; } return ret; @@ -398,12 +384,21 @@ int dirty_bytes_handler(struct ctl_table *table, int write, ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos); if (ret == 0 && write && vm_dirty_bytes != old_bytes) { - update_completion_period(); + writeback_set_ratelimit(); vm_dirty_ratio = 0; } return ret; } +static unsigned long wp_next_time(unsigned long cur_time) +{ + cur_time += VM_COMPLETIONS_PERIOD_LEN; + /* 0 has a special meaning... 
*/ + if (!cur_time) + return 1; + return cur_time; +} + /* * Increment the BDI's writeout completion count and the global writeout * completion count. Called from test_clear_page_writeback(). @@ -411,8 +406,19 @@ int dirty_bytes_handler(struct ctl_table *table, int write, static inline void __bdi_writeout_inc(struct backing_dev_info *bdi) { __inc_bdi_stat(bdi, BDI_WRITTEN); - __prop_inc_percpu_max(&vm_completions, &bdi->completions, - bdi->max_prop_frac); + __fprop_inc_percpu_max(&writeout_completions, &bdi->completions, + bdi->max_prop_frac); + /* First event after period switching was turned off? */ + if (!unlikely(writeout_period_time)) { + /* + * We can race with other __bdi_writeout_inc calls here but + * it does not cause any harm since the resulting time when + * timer will fire and what is in writeout_period_time will be + * roughly the same. + */ + writeout_period_time = wp_next_time(jiffies); + mod_timer(&writeout_period_timer, writeout_period_time); + } } void bdi_writeout_inc(struct backing_dev_info *bdi) @@ -431,11 +437,33 @@ EXPORT_SYMBOL_GPL(bdi_writeout_inc); static void bdi_writeout_fraction(struct backing_dev_info *bdi, long *numerator, long *denominator) { - prop_fraction_percpu(&vm_completions, &bdi->completions, + fprop_fraction_percpu(&writeout_completions, &bdi->completions, numerator, denominator); } /* + * On idle system, we can be called long after we scheduled because we use + * deferred timers so count with missed periods. + */ +static void writeout_period(unsigned long t) +{ + int miss_periods = (jiffies - writeout_period_time) / + VM_COMPLETIONS_PERIOD_LEN; + + if (fprop_new_period(&writeout_completions, miss_periods + 1)) { + writeout_period_time = wp_next_time(writeout_period_time + + miss_periods * VM_COMPLETIONS_PERIOD_LEN); + mod_timer(&writeout_period_timer, writeout_period_time); + } else { + /* + * Aging has zeroed all fractions. Stop wasting CPU on period + * updates. 
+ */ + writeout_period_time = 0; + } +} + +/* * bdi_min_ratio keeps the sum of the minimum dirty shares of all * registered backing devices, which, for obvious reasons, can not * exceed 100%. @@ -475,7 +503,7 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio) ret = -EINVAL; } else { bdi->max_ratio = max_ratio; - bdi->max_prop_frac = (PROP_FRAC_BASE * max_ratio) / 100; + bdi->max_prop_frac = (FPROP_FRAC_BASE * max_ratio) / 100; } spin_unlock_bh(&bdi_lock); @@ -918,7 +946,7 @@ static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi, * bdi->dirty_ratelimit = balanced_dirty_ratelimit; * * However to get a more stable dirty_ratelimit, the below elaborated - * code makes use of task_ratelimit to filter out sigular points and + * code makes use of task_ratelimit to filter out singular points and * limit the step size. * * The below code essentially only uses the relative value of @@ -941,7 +969,7 @@ static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi, * feel and care are stable dirty rate and small position error. * * |task_ratelimit - dirty_ratelimit| is used to limit the step size - * and filter out the sigular points of balanced_dirty_ratelimit. Which + * and filter out the singular points of balanced_dirty_ratelimit. Which * keeps jumping around randomly and can even leap far away at times * due to the small 200ms estimation period of dirty_rate (we want to * keep that period small to reduce time lags). 
@@ -1568,6 +1596,7 @@ void writeback_set_ratelimit(void) unsigned long background_thresh; unsigned long dirty_thresh; global_dirty_limits(&background_thresh, &dirty_thresh); + global_dirty_limit = dirty_thresh; ratelimit_pages = dirty_thresh / (num_online_cpus() * 32); if (ratelimit_pages < 16) ratelimit_pages = 16; @@ -1605,13 +1634,10 @@ static struct notifier_block __cpuinitdata ratelimit_nb = { */ void __init page_writeback_init(void) { - int shift; - writeback_set_ratelimit(); register_cpu_notifier(&ratelimit_nb); - shift = calc_period_shift(); - prop_descriptor_init(&vm_completions, shift); + fprop_global_init(&writeout_completions); } /** diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 918330f71dba..4a4f9219683f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -57,6 +57,7 @@ #include <linux/ftrace_event.h> #include <linux/memcontrol.h> #include <linux/prefetch.h> +#include <linux/migrate.h> #include <linux/page-debug-flags.h> #include <asm/tlbflush.h> @@ -513,10 +514,10 @@ static inline int page_is_buddy(struct page *page, struct page *buddy, * free pages of length of (1 << order) and marked with _mapcount -2. Page's * order is recorded in page_private(page) field. * So when we are allocating or freeing one, we can derive the state of the - * other. That is, if we allocate a small block, and both were - * free, the remainder of the region must be split into blocks. + * other. That is, if we allocate a small block, and both were + * free, the remainder of the region must be split into blocks. * If a block is freed, and its buddy is also free, then this - * triggers coalescing into a block of larger size. + * triggers coalescing into a block of larger size. * * -- wli */ @@ -749,6 +750,24 @@ void __meminit __free_pages_bootmem(struct page *page, unsigned int order) __free_pages(page, order); } +#ifdef CONFIG_CMA +/* Free whole pageblock and set it's migration type to MIGRATE_CMA. 
*/ +void __init init_cma_reserved_pageblock(struct page *page) +{ + unsigned i = pageblock_nr_pages; + struct page *p = page; + + do { + __ClearPageReserved(p); + set_page_count(p, 0); + } while (++p, --i); + + set_page_refcounted(page); + set_pageblock_migratetype(page, MIGRATE_CMA); + __free_pages(page, pageblock_order); + totalram_pages += pageblock_nr_pages; +} +#endif /* * The order of subdivision here is critical for the IO subsystem. @@ -874,11 +893,17 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, * This array describes the order lists are fallen back to when * the free lists for the desirable migrate type are depleted */ -static int fallbacks[MIGRATE_TYPES][MIGRATE_TYPES-1] = { - [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE }, - [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE }, - [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE }, - [MIGRATE_RESERVE] = { MIGRATE_RESERVE, MIGRATE_RESERVE, MIGRATE_RESERVE }, /* Never used */ +static int fallbacks[MIGRATE_TYPES][4] = { + [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE }, + [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE }, +#ifdef CONFIG_CMA + [MIGRATE_MOVABLE] = { MIGRATE_CMA, MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE }, + [MIGRATE_CMA] = { MIGRATE_RESERVE }, /* Never used */ +#else + [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE }, +#endif + [MIGRATE_RESERVE] = { MIGRATE_RESERVE }, /* Never used */ + [MIGRATE_ISOLATE] = { MIGRATE_RESERVE }, /* Never used */ }; /* @@ -973,12 +998,12 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype) /* Find the largest possible block of pages in the other list */ for (current_order = MAX_ORDER-1; current_order >= order; --current_order) { - for (i = 0; i < MIGRATE_TYPES - 1; i++) { + for (i = 0;; i++) { migratetype = 
fallbacks[start_migratetype][i]; /* MIGRATE_RESERVE handled later if necessary */ if (migratetype == MIGRATE_RESERVE) - continue; + break; area = &(zone->free_area[current_order]); if (list_empty(&area->free_list[migratetype])) @@ -993,11 +1018,18 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype) * pages to the preferred allocation list. If falling * back for a reclaimable kernel allocation, be more * aggressive about taking ownership of free pages + * + * On the other hand, never change migration + * type of MIGRATE_CMA pageblocks nor move CMA + * pages on different free lists. We don't + * want unmovable pages to be allocated from + * MIGRATE_CMA areas. */ - if (unlikely(current_order >= (pageblock_order >> 1)) || - start_migratetype == MIGRATE_RECLAIMABLE || - page_group_by_mobility_disabled) { - unsigned long pages; + if (!is_migrate_cma(migratetype) && + (unlikely(current_order >= pageblock_order / 2) || + start_migratetype == MIGRATE_RECLAIMABLE || + page_group_by_mobility_disabled)) { + int pages; pages = move_freepages_block(zone, page, start_migratetype); @@ -1015,11 +1047,14 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype) rmv_page_order(page); /* Take ownership for orders >= pageblock_order */ - if (current_order >= pageblock_order) + if (current_order >= pageblock_order && + !is_migrate_cma(migratetype)) change_pageblock_range(page, current_order, start_migratetype); - expand(zone, page, order, current_order, area, migratetype); + expand(zone, page, order, current_order, area, + is_migrate_cma(migratetype) + ? migratetype : start_migratetype); trace_mm_page_alloc_extfrag(page, order, current_order, start_migratetype, migratetype); @@ -1061,17 +1096,17 @@ retry_reserve: return page; } -/* +/* * Obtain a specified number of elements from the buddy allocator, all under * a single hold of the lock, for efficiency. Add them to the supplied list. * Returns the number of new pages which were placed at *list. 
*/ -static int rmqueue_bulk(struct zone *zone, unsigned int order, +static int rmqueue_bulk(struct zone *zone, unsigned int order, unsigned long count, struct list_head *list, int migratetype, int cold) { - int i; - + int mt = migratetype, i; + spin_lock(&zone->lock); for (i = 0; i < count; ++i) { struct page *page = __rmqueue(zone, order, migratetype); @@ -1091,7 +1126,12 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, list_add(&page->lru, list); else list_add_tail(&page->lru, list); - set_page_private(page, migratetype); + if (IS_ENABLED(CONFIG_CMA)) { + mt = get_pageblock_migratetype(page); + if (!is_migrate_cma(mt) && mt != MIGRATE_ISOLATE) + mt = migratetype; + } + set_page_private(page, mt); list = &page->lru; } __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order)); @@ -1371,8 +1411,12 @@ int split_free_page(struct page *page) if (order >= pageblock_order - 1) { struct page *endpage = page + (1 << order) - 1; - for (; page < endpage; page += pageblock_nr_pages) - set_pageblock_migratetype(page, MIGRATE_MOVABLE); + for (; page < endpage; page += pageblock_nr_pages) { + int mt = get_pageblock_migratetype(page); + if (mt != MIGRATE_ISOLATE && !is_migrate_cma(mt)) + set_pageblock_migratetype(page, + MIGRATE_MOVABLE); + } } return 1 << order; @@ -2086,16 +2130,13 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, } #endif /* CONFIG_COMPACTION */ -/* The really slow allocator path where we enter direct reclaim */ -static inline struct page * -__alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, - struct zonelist *zonelist, enum zone_type high_zoneidx, - nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, - int migratetype, unsigned long *did_some_progress) +/* Perform direct synchronous page reclaim */ +static int +__perform_reclaim(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, + nodemask_t *nodemask) { - struct page *page = NULL; struct reclaim_state reclaim_state; - bool drained = 
false; + int progress; cond_resched(); @@ -2106,7 +2147,7 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, reclaim_state.reclaimed_slab = 0; current->reclaim_state = &reclaim_state; - *did_some_progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask); + progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask); current->reclaim_state = NULL; lockdep_clear_current_reclaim_state(); @@ -2114,6 +2155,21 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, cond_resched(); + return progress; +} + +/* The really slow allocator path where we enter direct reclaim */ +static inline struct page * +__alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, + struct zonelist *zonelist, enum zone_type high_zoneidx, + nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, + int migratetype, unsigned long *did_some_progress) +{ + struct page *page = NULL; + bool drained = false; + + *did_some_progress = __perform_reclaim(gfp_mask, order, zonelist, + nodemask); if (unlikely(!(*did_some_progress))) return NULL; @@ -4244,25 +4300,24 @@ static inline void setup_usemap(struct pglist_data *pgdat, #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE -/* Return a sensible default order for the pageblock size. */ -static inline int pageblock_default_order(void) -{ - if (HPAGE_SHIFT > PAGE_SHIFT) - return HUGETLB_PAGE_ORDER; - - return MAX_ORDER-1; -} - /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */ -static inline void __init set_pageblock_order(unsigned int order) +static inline void __init set_pageblock_order(void) { + unsigned int order; + /* Check that pageblock_nr_pages has not already been setup */ if (pageblock_order) return; + if (HPAGE_SHIFT > PAGE_SHIFT) + order = HUGETLB_PAGE_ORDER; + else + order = MAX_ORDER - 1; + /* * Assume the largest contiguous order of interest is a huge page. 
- * This value may be variable depending on boot parameters on IA64 + * This value may be variable depending on boot parameters on IA64 and + * powerpc. */ pageblock_order = order; } @@ -4270,15 +4325,13 @@ static inline void __init set_pageblock_order(unsigned int order) /* * When CONFIG_HUGETLB_PAGE_SIZE_VARIABLE is not set, set_pageblock_order() - * and pageblock_default_order() are unused as pageblock_order is set - * at compile-time. See include/linux/pageblock-flags.h for the values of - * pageblock_order based on the kernel config + * is unused as pageblock_order is set at compile-time. See + * include/linux/pageblock-flags.h for the values of pageblock_order based on + * the kernel config */ -static inline int pageblock_default_order(unsigned int order) +static inline void set_pageblock_order(void) { - return MAX_ORDER-1; } -#define set_pageblock_order(x) do {} while (0) #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ @@ -4301,11 +4354,10 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, init_waitqueue_head(&pgdat->kswapd_wait); pgdat->kswapd_max_order = 0; pgdat_page_cgroup_init(pgdat); - + for (j = 0; j < MAX_NR_ZONES; j++) { struct zone *zone = pgdat->node_zones + j; unsigned long size, realsize, memmap_pages; - enum lru_list lru; size = zone_spanned_pages_in_node(nid, j, zones_size); realsize = size - zone_absent_pages_in_node(nid, j, @@ -4355,18 +4407,13 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, zone->zone_pgdat = pgdat; zone_pcp_init(zone); - for_each_lru(lru) - INIT_LIST_HEAD(&zone->lruvec.lists[lru]); - zone->reclaim_stat.recent_rotated[0] = 0; - zone->reclaim_stat.recent_rotated[1] = 0; - zone->reclaim_stat.recent_scanned[0] = 0; - zone->reclaim_stat.recent_scanned[1] = 0; + lruvec_init(&zone->lruvec, zone); zap_zone_vm_stats(zone); zone->flags = 0; if (!size) continue; - set_pageblock_order(pageblock_default_order()); + set_pageblock_order(); setup_usemap(pgdat, zone, size); ret = 
init_currently_empty_zone(zone, zone_start_pfn, size, MEMMAP_EARLY); @@ -4759,31 +4806,34 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) find_zone_movable_pfns_for_nodes(); /* Print out the zone ranges */ - printk("Zone PFN ranges:\n"); + printk("Zone ranges:\n"); for (i = 0; i < MAX_NR_ZONES; i++) { if (i == ZONE_MOVABLE) continue; - printk(" %-8s ", zone_names[i]); + printk(KERN_CONT " %-8s ", zone_names[i]); if (arch_zone_lowest_possible_pfn[i] == arch_zone_highest_possible_pfn[i]) - printk("empty\n"); + printk(KERN_CONT "empty\n"); else - printk("%0#10lx -> %0#10lx\n", - arch_zone_lowest_possible_pfn[i], - arch_zone_highest_possible_pfn[i]); + printk(KERN_CONT "[mem %0#10lx-%0#10lx]\n", + arch_zone_lowest_possible_pfn[i] << PAGE_SHIFT, + (arch_zone_highest_possible_pfn[i] + << PAGE_SHIFT) - 1); } /* Print out the PFNs ZONE_MOVABLE begins at in each node */ - printk("Movable zone start PFN for each node\n"); + printk("Movable zone start for each node\n"); for (i = 0; i < MAX_NUMNODES; i++) { if (zone_movable_pfn[i]) - printk(" Node %d: %lu\n", i, zone_movable_pfn[i]); + printk(" Node %d: %#010lx\n", i, + zone_movable_pfn[i] << PAGE_SHIFT); } /* Print out the early_node_map[] */ - printk("Early memory PFN ranges\n"); + printk("Early memory node ranges\n"); for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) - printk(" %3d: %0#10lx -> %0#10lx\n", nid, start_pfn, end_pfn); + printk(" node %3d: [mem %#010lx-%#010lx]\n", nid, + start_pfn << PAGE_SHIFT, (end_pfn << PAGE_SHIFT) - 1); /* Initialise every node */ mminit_verify_pageflags_layout(); @@ -4976,14 +5026,7 @@ static void setup_per_zone_lowmem_reserve(void) calculate_totalreserve_pages(); } -/** - * setup_per_zone_wmarks - called when min_free_kbytes changes - * or when memory is hot-{added|removed} - * - * Ensures that the watermark[min,low,high] values for each zone are set - * correctly with respect to min_free_kbytes. 
- */ -void setup_per_zone_wmarks(void) +static void __setup_per_zone_wmarks(void) { unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10); unsigned long lowmem_pages = 0; @@ -5030,6 +5073,11 @@ void setup_per_zone_wmarks(void) zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2); zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1); + + zone->watermark[WMARK_MIN] += cma_wmark_pages(zone); + zone->watermark[WMARK_LOW] += cma_wmark_pages(zone); + zone->watermark[WMARK_HIGH] += cma_wmark_pages(zone); + setup_zone_migrate_reserve(zone); spin_unlock_irqrestore(&zone->lock, flags); } @@ -5038,6 +5086,20 @@ void setup_per_zone_wmarks(void) calculate_totalreserve_pages(); } +/** + * setup_per_zone_wmarks - called when min_free_kbytes changes + * or when memory is hot-{added|removed} + * + * Ensures that the watermark[min,low,high] values for each zone are set + * correctly with respect to min_free_kbytes. + */ +void setup_per_zone_wmarks(void) +{ + mutex_lock(&zonelists_mutex); + __setup_per_zone_wmarks(); + mutex_unlock(&zonelists_mutex); +} + /* * The inactive anon list should be small enough that the VM never has to * do too much work, but large enough that each inactive page has a chance @@ -5242,9 +5304,10 @@ void *__init alloc_large_system_hash(const char *tablename, int flags, unsigned int *_hash_shift, unsigned int *_hash_mask, - unsigned long limit) + unsigned long low_limit, + unsigned long high_limit) { - unsigned long long max = limit; + unsigned long long max = high_limit; unsigned long log2qty, size; void *table = NULL; @@ -5282,6 +5345,8 @@ void *__init alloc_large_system_hash(const char *tablename, } max = min(max, 0x80000000ULL); + if (numentries < low_limit) + numentries = low_limit; if (numentries > max) numentries = max; @@ -5412,14 +5477,16 @@ static int __count_immobile_pages(struct zone *zone, struct page *page, int count) { unsigned long pfn, iter, found; + int mt; + /* * For avoiding noise data, lru_add_drain_all() 
should be called * If ZONE_MOVABLE, the zone never contains immobile pages */ if (zone_idx(zone) == ZONE_MOVABLE) return true; - - if (get_pageblock_migratetype(page) == MIGRATE_MOVABLE) + mt = get_pageblock_migratetype(page); + if (mt == MIGRATE_MOVABLE || is_migrate_cma(mt)) return true; pfn = page_to_pfn(page); @@ -5536,7 +5603,7 @@ out: return ret; } -void unset_migratetype_isolate(struct page *page) +void unset_migratetype_isolate(struct page *page, unsigned migratetype) { struct zone *zone; unsigned long flags; @@ -5544,12 +5611,264 @@ void unset_migratetype_isolate(struct page *page) spin_lock_irqsave(&zone->lock, flags); if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE) goto out; - set_pageblock_migratetype(page, MIGRATE_MOVABLE); - move_freepages_block(zone, page, MIGRATE_MOVABLE); + set_pageblock_migratetype(page, migratetype); + move_freepages_block(zone, page, migratetype); out: spin_unlock_irqrestore(&zone->lock, flags); } +#ifdef CONFIG_CMA + +static unsigned long pfn_max_align_down(unsigned long pfn) +{ + return pfn & ~(max_t(unsigned long, MAX_ORDER_NR_PAGES, + pageblock_nr_pages) - 1); +} + +static unsigned long pfn_max_align_up(unsigned long pfn) +{ + return ALIGN(pfn, max_t(unsigned long, MAX_ORDER_NR_PAGES, + pageblock_nr_pages)); +} + +static struct page * +__alloc_contig_migrate_alloc(struct page *page, unsigned long private, + int **resultp) +{ + gfp_t gfp_mask = GFP_USER | __GFP_MOVABLE; + + if (PageHighMem(page)) + gfp_mask |= __GFP_HIGHMEM; + + return alloc_page(gfp_mask); +} + +/* [start, end) must belong to a single zone. */ +static int __alloc_contig_migrate_range(unsigned long start, unsigned long end) +{ + /* This function is based on compact_zone() from compaction.c. 
*/ + + unsigned long pfn = start; + unsigned int tries = 0; + int ret = 0; + + struct compact_control cc = { + .nr_migratepages = 0, + .order = -1, + .zone = page_zone(pfn_to_page(start)), + .sync = true, + }; + INIT_LIST_HEAD(&cc.migratepages); + + migrate_prep_local(); + + while (pfn < end || !list_empty(&cc.migratepages)) { + if (fatal_signal_pending(current)) { + ret = -EINTR; + break; + } + + if (list_empty(&cc.migratepages)) { + cc.nr_migratepages = 0; + pfn = isolate_migratepages_range(cc.zone, &cc, + pfn, end); + if (!pfn) { + ret = -EINTR; + break; + } + tries = 0; + } else if (++tries == 5) { + ret = ret < 0 ? ret : -EBUSY; + break; + } + + ret = migrate_pages(&cc.migratepages, + __alloc_contig_migrate_alloc, + 0, false, MIGRATE_SYNC); + } + + putback_lru_pages(&cc.migratepages); + return ret > 0 ? 0 : ret; +} + +/* + * Update zone's cma pages counter used for watermark level calculation. + */ +static inline void __update_cma_watermarks(struct zone *zone, int count) +{ + unsigned long flags; + spin_lock_irqsave(&zone->lock, flags); + zone->min_cma_pages += count; + spin_unlock_irqrestore(&zone->lock, flags); + setup_per_zone_wmarks(); +} + +/* + * Trigger memory pressure bump to reclaim some pages in order to be able to + * allocate 'count' pages in single page units. Does similar work as + *__alloc_pages_slowpath() function. + */ +static int __reclaim_pages(struct zone *zone, gfp_t gfp_mask, int count) +{ + enum zone_type high_zoneidx = gfp_zone(gfp_mask); + struct zonelist *zonelist = node_zonelist(0, gfp_mask); + int did_some_progress = 0; + int order = 1; + + /* + * Increase level of watermarks to force kswapd do his job + * to stabilise at new watermark level. 
+ */ + __update_cma_watermarks(zone, count); + + /* Obey watermarks as if the page was being allocated */ + while (!zone_watermark_ok(zone, 0, low_wmark_pages(zone), 0, 0)) { + wake_all_kswapd(order, zonelist, high_zoneidx, zone_idx(zone)); + + did_some_progress = __perform_reclaim(gfp_mask, order, zonelist, + NULL); + if (!did_some_progress) { + /* Exhausted what can be done so it's blamo time */ + out_of_memory(zonelist, gfp_mask, order, NULL, false); + } + } + + /* Restore original watermark levels. */ + __update_cma_watermarks(zone, -count); + + return count; +} + +/** + * alloc_contig_range() -- tries to allocate given range of pages + * @start: start PFN to allocate + * @end: one-past-the-last PFN to allocate + * @migratetype: migratetype of the underlaying pageblocks (either + * #MIGRATE_MOVABLE or #MIGRATE_CMA). All pageblocks + * in range must have the same migratetype and it must + * be either of the two. + * + * The PFN range does not have to be pageblock or MAX_ORDER_NR_PAGES + * aligned, however it's the caller's responsibility to guarantee that + * we are the only thread that changes migrate type of pageblocks the + * pages fall in. + * + * The PFN range must belong to a single zone. + * + * Returns zero on success or negative error code. On success all + * pages which PFN is in [start, end) are allocated for the caller and + * need to be freed with free_contig_range(). + */ +int alloc_contig_range(unsigned long start, unsigned long end, + unsigned migratetype) +{ + struct zone *zone = page_zone(pfn_to_page(start)); + unsigned long outer_start, outer_end; + int ret = 0, order; + + /* + * What we do here is we mark all pageblocks in range as + * MIGRATE_ISOLATE. 
Because pageblock and max order pages may + * have different sizes, and due to the way page allocator + * work, we align the range to biggest of the two pages so + * that page allocator won't try to merge buddies from + * different pageblocks and change MIGRATE_ISOLATE to some + * other migration type. + * + * Once the pageblocks are marked as MIGRATE_ISOLATE, we + * migrate the pages from an unaligned range (ie. pages that + * we are interested in). This will put all the pages in + * range back to page allocator as MIGRATE_ISOLATE. + * + * When this is done, we take the pages in range from page + * allocator removing them from the buddy system. This way + * page allocator will never consider using them. + * + * This lets us mark the pageblocks back as + * MIGRATE_CMA/MIGRATE_MOVABLE so that free pages in the + * aligned range but not in the unaligned, original range are + * put back to page allocator so that buddy can use them. + */ + + ret = start_isolate_page_range(pfn_max_align_down(start), + pfn_max_align_up(end), migratetype); + if (ret) + goto done; + + ret = __alloc_contig_migrate_range(start, end); + if (ret) + goto done; + + /* + * Pages from [start, end) are within a MAX_ORDER_NR_PAGES + * aligned blocks that are marked as MIGRATE_ISOLATE. What's + * more, all pages in [start, end) are free in page allocator. + * What we are going to do is to allocate all pages from + * [start, end) (that is remove them from page allocator). + * + * The only problem is that pages at the beginning and at the + * end of interesting range may be not aligned with pages that + * page allocator holds, ie. they can be part of higher order + * pages. Because of this, we reserve the bigger range and + * once this is done free the pages we are not interested in. + * + * We don't have to hold zone->lock here because the pages are + * isolated thus they won't get removed from buddy. 
+ */ + + lru_add_drain_all(); + drain_all_pages(); + + order = 0; + outer_start = start; + while (!PageBuddy(pfn_to_page(outer_start))) { + if (++order >= MAX_ORDER) { + ret = -EBUSY; + goto done; + } + outer_start &= ~0UL << order; + } + + /* Make sure the range is really isolated. */ + if (test_pages_isolated(outer_start, end)) { + pr_warn("alloc_contig_range test_pages_isolated(%lx, %lx) failed\n", + outer_start, end); + ret = -EBUSY; + goto done; + } + + /* + * Reclaim enough pages to make sure that contiguous allocation + * will not starve the system. + */ + __reclaim_pages(zone, GFP_HIGHUSER_MOVABLE, end-start); + + /* Grab isolated pages from freelists. */ + outer_end = isolate_freepages_range(outer_start, end); + if (!outer_end) { + ret = -EBUSY; + goto done; + } + + /* Free head and tail (if any) */ + if (start != outer_start) + free_contig_range(outer_start, start - outer_start); + if (end != outer_end) + free_contig_range(end, outer_end - end); + +done: + undo_isolate_page_range(pfn_max_align_down(start), + pfn_max_align_up(end), migratetype); + return ret; +} + +void free_contig_range(unsigned long pfn, unsigned nr_pages) +{ + for (; nr_pages--; ++pfn) + __free_page(pfn_to_page(pfn)); +} +#endif + #ifdef CONFIG_MEMORY_HOTREMOVE /* * All pages in the range must be isolated before calling this. 
@@ -5618,7 +5937,7 @@ bool is_free_buddy_page(struct page *page) } #endif -static struct trace_print_flags pageflag_names[] = { +static const struct trace_print_flags pageflag_names[] = { {1UL << PG_locked, "locked" }, {1UL << PG_error, "error" }, {1UL << PG_referenced, "referenced" }, @@ -5653,7 +5972,9 @@ static struct trace_print_flags pageflag_names[] = { #ifdef CONFIG_MEMORY_FAILURE {1UL << PG_hwpoison, "hwpoison" }, #endif - {-1UL, NULL }, +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + {1UL << PG_compound_lock, "compound_lock" }, +#endif }; static void dump_page_flags(unsigned long flags) @@ -5662,12 +5983,14 @@ static void dump_page_flags(unsigned long flags) unsigned long mask; int i; + BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS); + printk(KERN_ALERT "page flags: %#lx(", flags); /* remove zone id */ flags &= (1UL << NR_PAGEFLAGS) - 1; - for (i = 0; pageflag_names[i].name && flags; i++) { + for (i = 0; i < ARRAY_SIZE(pageflag_names) && flags; i++) { mask = pageflag_names[i].mask; if ((flags & mask) != mask) diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c index 1ccbd714059c..eb750f851395 100644 --- a/mm/page_cgroup.c +++ b/mm/page_cgroup.c @@ -392,7 +392,7 @@ static struct swap_cgroup *lookup_swap_cgroup(swp_entry_t ent, /** * swap_cgroup_cmpxchg - cmpxchg mem_cgroup's id for this swp_entry. - * @end: swap entry to be cmpxchged + * @ent: swap entry to be cmpxchged * @old: old id * @new: new id * @@ -422,7 +422,7 @@ unsigned short swap_cgroup_cmpxchg(swp_entry_t ent, /** * swap_cgroup_record - record mem_cgroup for this swp_entry. * @ent: swap entry to be recorded into - * @mem: mem_cgroup to be recorded + * @id: mem_cgroup to be recorded * * Returns old value at success, 0 at failure. * (Of course, old value can be 0.) 
diff --git a/mm/page_io.c b/mm/page_io.c index dc76b4d0611e..34f02923744c 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -18,6 +18,7 @@ #include <linux/bio.h> #include <linux/swapops.h> #include <linux/writeback.h> +#include <linux/frontswap.h> #include <asm/pgtable.h> static struct bio *get_swap_bio(gfp_t gfp_flags, @@ -98,6 +99,12 @@ int swap_writepage(struct page *page, struct writeback_control *wbc) unlock_page(page); goto out; } + if (frontswap_store(page) == 0) { + set_page_writeback(page); + unlock_page(page); + end_page_writeback(page); + goto out; + } bio = get_swap_bio(GFP_NOIO, page, end_swap_bio_write); if (bio == NULL) { set_page_dirty(page); @@ -122,6 +129,11 @@ int swap_readpage(struct page *page) VM_BUG_ON(!PageLocked(page)); VM_BUG_ON(PageUptodate(page)); + if (frontswap_load(page) == 0) { + SetPageUptodate(page); + unlock_page(page); + goto out; + } bio = get_swap_bio(GFP_KERNEL, page, end_swap_bio_read); if (bio == NULL) { unlock_page(page); diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 4ae42bb40892..c9f04774f2b8 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -24,6 +24,7 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages) * to be MIGRATE_ISOLATE. * @start_pfn: The lower PFN of the range to be isolated. * @end_pfn: The upper PFN of the range to be isolated. + * @migratetype: migrate type to set in error recovery. * * Making page-allocation-type to be MIGRATE_ISOLATE means free pages in * the range will never be allocated. Any free pages and pages freed in the @@ -32,8 +33,8 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages) * start_pfn/end_pfn must be aligned to pageblock_order. * Returns 0 on success and -EBUSY if any part of range cannot be isolated. 
*/ -int -start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn) +int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, + unsigned migratetype) { unsigned long pfn; unsigned long undo_pfn; @@ -56,7 +57,7 @@ undo: for (pfn = start_pfn; pfn < undo_pfn; pfn += pageblock_nr_pages) - unset_migratetype_isolate(pfn_to_page(pfn)); + unset_migratetype_isolate(pfn_to_page(pfn), migratetype); return -EBUSY; } @@ -64,8 +65,8 @@ undo: /* * Make isolated pages available again. */ -int -undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn) +int undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, + unsigned migratetype) { unsigned long pfn; struct page *page; @@ -77,7 +78,7 @@ undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn) page = __first_valid_page(pfn, pageblock_nr_pages); if (!page || get_pageblock_migratetype(page) != MIGRATE_ISOLATE) continue; - unset_migratetype_isolate(page); + unset_migratetype_isolate(page, migratetype); } return 0; } @@ -86,7 +87,7 @@ undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn) * all pages in [start_pfn...end_pfn) must be in the same zone. * zone->lock must be held before call this. * - * Returns 1 if all pages in the range is isolated. + * Returns 1 if all pages in the range are isolated. 
*/ static int __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn) diff --git a/mm/pagewalk.c b/mm/pagewalk.c index aa9701e12714..6c118d012bb5 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -162,7 +162,6 @@ static int walk_hugetlb_range(struct vm_area_struct *vma, /** * walk_page_range - walk a memory map's page tables with a callback - * @mm: memory map to walk * @addr: starting address * @end: ending address * @walk: set of callbacks to invoke for each level of the tree diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c index 405d331804c3..3707c71ae4cd 100644 --- a/mm/percpu-vm.c +++ b/mm/percpu-vm.c @@ -360,7 +360,6 @@ err_free: * @chunk: chunk to depopulate * @off: offset to the area to depopulate * @size: size of the area to depopulate in bytes - * @flush: whether to flush cache and tlb or not * * For each cpu, depopulate and unmap pages [@page_start,@page_end) * from @chunk. If @flush is true, vcache is flushed before unmapping diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index 5a74fea182f1..74c0ddaa6fa0 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -109,8 +109,8 @@ pmd_t pmdp_clear_flush(struct vm_area_struct *vma, unsigned long address, #ifndef __HAVE_ARCH_PMDP_SPLITTING_FLUSH #ifdef CONFIG_TRANSPARENT_HUGEPAGE -pmd_t pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address, - pmd_t *pmdp) +void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address, + pmd_t *pmdp) { pmd_t pmd = pmd_mksplitting(*pmdp); VM_BUG_ON(address & ~HPAGE_PMD_MASK); diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c index c20ff48994c2..926b46649749 100644 --- a/mm/process_vm_access.c +++ b/mm/process_vm_access.c @@ -371,15 +371,15 @@ static ssize_t process_vm_rw(pid_t pid, /* Check iovecs */ if (vm_write) rc = rw_copy_check_uvector(WRITE, lvec, liovcnt, UIO_FASTIOV, - iovstack_l, &iov_l, 1); + iovstack_l, &iov_l); else rc = rw_copy_check_uvector(READ, lvec, liovcnt, UIO_FASTIOV, - iovstack_l, 
&iov_l, 1); + iovstack_l, &iov_l); if (rc <= 0) goto free_iovecs; - rc = rw_copy_check_uvector(READ, rvec, riovcnt, UIO_FASTIOV, - iovstack_r, &iov_r, 0); + rc = rw_copy_check_uvector(CHECK_IOVEC_ONLY, rvec, riovcnt, UIO_FASTIOV, + iovstack_r, &iov_r); if (rc <= 0) goto free_iovecs; @@ -438,16 +438,16 @@ compat_process_vm_rw(compat_pid_t pid, if (vm_write) rc = compat_rw_copy_check_uvector(WRITE, lvec, liovcnt, UIO_FASTIOV, iovstack_l, - &iov_l, 1); + &iov_l); else rc = compat_rw_copy_check_uvector(READ, lvec, liovcnt, UIO_FASTIOV, iovstack_l, - &iov_l, 1); + &iov_l); if (rc <= 0) goto free_iovecs; - rc = compat_rw_copy_check_uvector(READ, rvec, riovcnt, + rc = compat_rw_copy_check_uvector(CHECK_IOVEC_ONLY, rvec, riovcnt, UIO_FASTIOV, iovstack_r, - &iov_r, 0); + &iov_r); if (rc <= 0) goto free_iovecs; diff --git a/mm/readahead.c b/mm/readahead.c index cbcbb02f3e28..ea8f8fa21649 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -17,6 +17,8 @@ #include <linux/task_io_accounting_ops.h> #include <linux/pagevec.h> #include <linux/pagemap.h> +#include <linux/syscalls.h> +#include <linux/file.h> /* * Initialise a struct file's readahead state. 
Assumes that the caller has @@ -562,3 +564,41 @@ page_cache_async_readahead(struct address_space *mapping, ondemand_readahead(mapping, ra, filp, true, offset, req_size); } EXPORT_SYMBOL_GPL(page_cache_async_readahead); + +static ssize_t +do_readahead(struct address_space *mapping, struct file *filp, + pgoff_t index, unsigned long nr) +{ + if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage) + return -EINVAL; + + force_page_cache_readahead(mapping, filp, index, nr); + return 0; +} + +SYSCALL_DEFINE(readahead)(int fd, loff_t offset, size_t count) +{ + ssize_t ret; + struct file *file; + + ret = -EBADF; + file = fget(fd); + if (file) { + if (file->f_mode & FMODE_READ) { + struct address_space *mapping = file->f_mapping; + pgoff_t start = offset >> PAGE_CACHE_SHIFT; + pgoff_t end = (offset + count - 1) >> PAGE_CACHE_SHIFT; + unsigned long len = end - start + 1; + ret = do_readahead(mapping, file, start, len); + } + fput(file); + } + return ret; +} +#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS +asmlinkage long SyS_readahead(long fd, loff_t offset, long count) +{ + return SYSC_readahead((int) fd, offset, (size_t) count); +} +SYSCALL_ALIAS(sys_readahead, SyS_readahead); +#endif diff --git a/mm/rmap.c b/mm/rmap.c index 5b5ad584ffb7..0f3b7cda2a24 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -755,12 +755,6 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma, pte_unmap_unlock(pte, ptl); } - /* Pretend the page is referenced if the task has the - swap token and is in the middle of a page fault. 
*/ - if (mm != current->mm && has_swap_token(mm) && - rwsem_is_locked(&mm->mmap_sem)) - referenced++; - (*mapcount)--; if (referenced) diff --git a/mm/shmem.c b/mm/shmem.c index f99ff3e50bd6..c15b998e5a86 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -53,6 +53,7 @@ static struct vfsmount *shm_mnt; #include <linux/blkdev.h> #include <linux/pagevec.h> #include <linux/percpu_counter.h> +#include <linux/falloc.h> #include <linux/splice.h> #include <linux/security.h> #include <linux/swapops.h> @@ -83,12 +84,25 @@ struct shmem_xattr { char value[0]; }; +/* + * shmem_fallocate and shmem_writepage communicate via inode->i_private + * (with i_mutex making sure that it has only one user at a time): + * we would prefer not to enlarge the shmem inode just for that. + */ +struct shmem_falloc { + pgoff_t start; /* start of range currently being fallocated */ + pgoff_t next; /* the next page offset to be fallocated */ + pgoff_t nr_falloced; /* how many new pages have been fallocated */ + pgoff_t nr_unswapped; /* how often writepage refused to swap out */ +}; + /* Flag allocation requirements to shmem_getpage */ enum sgp_type { SGP_READ, /* don't exceed i_size, don't allocate page */ SGP_CACHE, /* don't exceed i_size, may allocate page */ SGP_DIRTY, /* like SGP_CACHE, but set new page dirty */ - SGP_WRITE, /* may exceed i_size, may allocate page */ + SGP_WRITE, /* may exceed i_size, may allocate !Uptodate page */ + SGP_FALLOC, /* like SGP_WRITE, but make existing page Uptodate */ }; #ifdef CONFIG_TMPFS @@ -103,6 +117,9 @@ static unsigned long shmem_default_max_inodes(void) } #endif +static bool shmem_should_replace_page(struct page *page, gfp_t gfp); +static int shmem_replace_page(struct page **pagep, gfp_t gfp, + struct shmem_inode_info *info, pgoff_t index); static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type); @@ -247,46 +264,55 @@ static int shmem_radix_tree_replace(struct address_space *mapping, } 
/* + * Sometimes, before we decide whether to proceed or to fail, we must check + * that an entry was not already brought back from swap by a racing thread. + * + * Checking page is not enough: by the time a SwapCache page is locked, it + * might be reused, and again be SwapCache, using the same swap as before. + */ +static bool shmem_confirm_swap(struct address_space *mapping, + pgoff_t index, swp_entry_t swap) +{ + void *item; + + rcu_read_lock(); + item = radix_tree_lookup(&mapping->page_tree, index); + rcu_read_unlock(); + return item == swp_to_radix_entry(swap); +} + +/* * Like add_to_page_cache_locked, but error if expected item has gone. */ static int shmem_add_to_page_cache(struct page *page, struct address_space *mapping, pgoff_t index, gfp_t gfp, void *expected) { - int error = 0; + int error; VM_BUG_ON(!PageLocked(page)); VM_BUG_ON(!PageSwapBacked(page)); + page_cache_get(page); + page->mapping = mapping; + page->index = index; + + spin_lock_irq(&mapping->tree_lock); if (!expected) - error = radix_tree_preload(gfp & GFP_RECLAIM_MASK); + error = radix_tree_insert(&mapping->page_tree, index, page); + else + error = shmem_radix_tree_replace(mapping, index, expected, + page); if (!error) { - page_cache_get(page); - page->mapping = mapping; - page->index = index; - - spin_lock_irq(&mapping->tree_lock); - if (!expected) - error = radix_tree_insert(&mapping->page_tree, - index, page); - else - error = shmem_radix_tree_replace(mapping, index, - expected, page); - if (!error) { - mapping->nrpages++; - __inc_zone_page_state(page, NR_FILE_PAGES); - __inc_zone_page_state(page, NR_SHMEM); - spin_unlock_irq(&mapping->tree_lock); - } else { - page->mapping = NULL; - spin_unlock_irq(&mapping->tree_lock); - page_cache_release(page); - } - if (!expected) - radix_tree_preload_end(); + mapping->nrpages++; + __inc_zone_page_state(page, NR_FILE_PAGES); + __inc_zone_page_state(page, NR_SHMEM); + spin_unlock_irq(&mapping->tree_lock); + } else { + page->mapping = NULL; + 
spin_unlock_irq(&mapping->tree_lock); + page_cache_release(page); } - if (error) - mem_cgroup_uncharge_cache_page(page); return error; } @@ -423,27 +449,31 @@ void shmem_unlock_mapping(struct address_space *mapping) /* * Remove range of pages and swap entries from radix tree, and free them. + * If !unfalloc, truncate or punch hole; if unfalloc, undo failed fallocate. */ -void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) +static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, + bool unfalloc) { struct address_space *mapping = inode->i_mapping; struct shmem_inode_info *info = SHMEM_I(inode); pgoff_t start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - unsigned partial = lstart & (PAGE_CACHE_SIZE - 1); - pgoff_t end = (lend >> PAGE_CACHE_SHIFT); + pgoff_t end = (lend + 1) >> PAGE_CACHE_SHIFT; + unsigned int partial_start = lstart & (PAGE_CACHE_SIZE - 1); + unsigned int partial_end = (lend + 1) & (PAGE_CACHE_SIZE - 1); struct pagevec pvec; pgoff_t indices[PAGEVEC_SIZE]; long nr_swaps_freed = 0; pgoff_t index; int i; - BUG_ON((lend & (PAGE_CACHE_SIZE - 1)) != (PAGE_CACHE_SIZE - 1)); + if (lend == -1) + end = -1; /* unsigned, so actually very big */ pagevec_init(&pvec, 0); index = start; - while (index <= end) { + while (index < end) { pvec.nr = shmem_find_get_pages_and_swap(mapping, index, - min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1, + min(end - index, (pgoff_t)PAGEVEC_SIZE), pvec.pages, indices); if (!pvec.nr) break; @@ -452,10 +482,12 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) struct page *page = pvec.pages[i]; index = indices[i]; - if (index > end) + if (index >= end) break; if (radix_tree_exceptional_entry(page)) { + if (unfalloc) + continue; nr_swaps_freed += !shmem_free_swap(mapping, index, page); continue; @@ -463,9 +495,11 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) if (!trylock_page(page)) continue; - if (page->mapping == mapping) 
{ - VM_BUG_ON(PageWriteback(page)); - truncate_inode_page(mapping, page); + if (!unfalloc || !PageUptodate(page)) { + if (page->mapping == mapping) { + VM_BUG_ON(PageWriteback(page)); + truncate_inode_page(mapping, page); + } } unlock_page(page); } @@ -476,30 +510,47 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) index++; } - if (partial) { + if (partial_start) { struct page *page = NULL; shmem_getpage(inode, start - 1, &page, SGP_READ, NULL); if (page) { - zero_user_segment(page, partial, PAGE_CACHE_SIZE); + unsigned int top = PAGE_CACHE_SIZE; + if (start > end) { + top = partial_end; + partial_end = 0; + } + zero_user_segment(page, partial_start, top); set_page_dirty(page); unlock_page(page); page_cache_release(page); } } + if (partial_end) { + struct page *page = NULL; + shmem_getpage(inode, end, &page, SGP_READ, NULL); + if (page) { + zero_user_segment(page, 0, partial_end); + set_page_dirty(page); + unlock_page(page); + page_cache_release(page); + } + } + if (start >= end) + return; index = start; for ( ; ; ) { cond_resched(); pvec.nr = shmem_find_get_pages_and_swap(mapping, index, - min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1, + min(end - index, (pgoff_t)PAGEVEC_SIZE), pvec.pages, indices); if (!pvec.nr) { - if (index == start) + if (index == start || unfalloc) break; index = start; continue; } - if (index == start && indices[0] > end) { + if ((index == start || unfalloc) && indices[0] >= end) { shmem_deswap_pagevec(&pvec); pagevec_release(&pvec); break; @@ -509,19 +560,23 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) struct page *page = pvec.pages[i]; index = indices[i]; - if (index > end) + if (index >= end) break; if (radix_tree_exceptional_entry(page)) { + if (unfalloc) + continue; nr_swaps_freed += !shmem_free_swap(mapping, index, page); continue; } lock_page(page); - if (page->mapping == mapping) { - VM_BUG_ON(PageWriteback(page)); - truncate_inode_page(mapping, page); + if (!unfalloc || 
!PageUptodate(page)) { + if (page->mapping == mapping) { + VM_BUG_ON(PageWriteback(page)); + truncate_inode_page(mapping, page); + } } unlock_page(page); } @@ -535,7 +590,11 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) info->swapped -= nr_swaps_freed; shmem_recalc_inode(inode); spin_unlock(&info->lock); +} +void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) +{ + shmem_undo_range(inode, lstart, lend, false); inode->i_ctime = inode->i_mtime = CURRENT_TIME; } EXPORT_SYMBOL_GPL(shmem_truncate_range); @@ -597,19 +656,20 @@ static void shmem_evict_inode(struct inode *inode) } BUG_ON(inode->i_blocks); shmem_free_inode(inode->i_sb); - end_writeback(inode); + clear_inode(inode); } /* * If swap found in inode, free it and move page from swapcache to filecache. */ static int shmem_unuse_inode(struct shmem_inode_info *info, - swp_entry_t swap, struct page *page) + swp_entry_t swap, struct page **pagep) { struct address_space *mapping = info->vfs_inode.i_mapping; void *radswap; pgoff_t index; - int error; + gfp_t gfp; + int error = 0; radswap = swp_to_radix_entry(swap); index = radix_tree_locate_item(&mapping->page_tree, radswap); @@ -625,22 +685,48 @@ static int shmem_unuse_inode(struct shmem_inode_info *info, if (shmem_swaplist.next != &info->swaplist) list_move_tail(&shmem_swaplist, &info->swaplist); + gfp = mapping_gfp_mask(mapping); + if (shmem_should_replace_page(*pagep, gfp)) { + mutex_unlock(&shmem_swaplist_mutex); + error = shmem_replace_page(pagep, gfp, info, index); + mutex_lock(&shmem_swaplist_mutex); + /* + * We needed to drop mutex to make that restrictive page + * allocation, but the inode might have been freed while we + * dropped it: although a racing shmem_evict_inode() cannot + * complete without emptying the radix_tree, our page lock + * on this swapcache page is not enough to prevent that - + * free_swap_and_cache() of our swap entry will only + * trylock_page(), removing swap from radix_tree 
whatever. + * + * We must not proceed to shmem_add_to_page_cache() if the + * inode has been freed, but of course we cannot rely on + * inode or mapping or info to check that. However, we can + * safely check if our swap entry is still in use (and here + * it can't have got reused for another page): if it's still + * in use, then the inode cannot have been freed yet, and we + * can safely proceed (if it's no longer in use, that tells + * nothing about the inode, but we don't need to unuse swap). + */ + if (!page_swapcount(*pagep)) + error = -ENOENT; + } + /* * We rely on shmem_swaplist_mutex, not only to protect the swaplist, * but also to hold up shmem_evict_inode(): so inode cannot be freed * beneath us (pagelock doesn't help until the page is in pagecache). */ - error = shmem_add_to_page_cache(page, mapping, index, + if (!error) + error = shmem_add_to_page_cache(*pagep, mapping, index, GFP_NOWAIT, radswap); - /* which does mem_cgroup_uncharge_cache_page on error */ - if (error != -ENOMEM) { /* * Truncation and eviction use free_swap_and_cache(), which * only does trylock page: if we raced, best clean up here. */ - delete_from_swap_cache(page); - set_page_dirty(page); + delete_from_swap_cache(*pagep); + set_page_dirty(*pagep); if (!error) { spin_lock(&info->lock); info->swapped--; @@ -660,7 +746,14 @@ int shmem_unuse(swp_entry_t swap, struct page *page) struct list_head *this, *next; struct shmem_inode_info *info; int found = 0; - int error; + int error = 0; + + /* + * There's a faint possibility that swap page was replaced before + * caller locked it: caller will come back later with the right page. 
+ */ + if (unlikely(!PageSwapCache(page) || page_private(page) != swap.val)) + goto out; /* * Charge page using GFP_KERNEL while we can wait, before taking @@ -676,7 +769,7 @@ int shmem_unuse(swp_entry_t swap, struct page *page) list_for_each_safe(this, next, &shmem_swaplist) { info = list_entry(this, struct shmem_inode_info, swaplist); if (info->swapped) - found = shmem_unuse_inode(info, swap, page); + found = shmem_unuse_inode(info, swap, &page); else list_del_init(&info->swaplist); cond_resched(); @@ -685,8 +778,6 @@ int shmem_unuse(swp_entry_t swap, struct page *page) } mutex_unlock(&shmem_swaplist_mutex); - if (!found) - mem_cgroup_uncharge_cache_page(page); if (found < 0) error = found; out: @@ -727,6 +818,38 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) WARN_ON_ONCE(1); /* Still happens? Tell us about it! */ goto redirty; } + + /* + * This is somewhat ridiculous, but without plumbing a SWAP_MAP_FALLOC + * value into swapfile.c, the only way we can correctly account for a + * fallocated page arriving here is now to initialize it and write it. + * + * That's okay for a page already fallocated earlier, but if we have + * not yet completed the fallocation, then (a) we want to keep track + * of this page in case we have to undo it, and (b) it may not be a + * good idea to continue anyway, once we're pushing into swap. So + * reactivate the page, and let shmem_fallocate() quit when too many. 
+ */ + if (!PageUptodate(page)) { + if (inode->i_private) { + struct shmem_falloc *shmem_falloc; + spin_lock(&inode->i_lock); + shmem_falloc = inode->i_private; + if (shmem_falloc && + index >= shmem_falloc->start && + index < shmem_falloc->next) + shmem_falloc->nr_unswapped++; + else + shmem_falloc = NULL; + spin_unlock(&inode->i_lock); + if (shmem_falloc) + goto redirty; + } + clear_highpage(page); + flush_dcache_page(page); + SetPageUptodate(page); + } + swap = get_swap_page(); if (!swap.val) goto redirty; @@ -856,6 +979,89 @@ static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo) #endif /* + * When a page is moved from swapcache to shmem filecache (either by the + * usual swapin of shmem_getpage_gfp(), or by the less common swapoff of + * shmem_unuse_inode()), it may have been read in earlier from swap, in + * ignorance of the mapping it belongs to. If that mapping has special + * constraints (like the gma500 GEM driver, which requires RAM below 4GB), + * we may need to copy to a suitable page before moving to filecache. + * + * In a future release, this may well be extended to respect cpuset and + * NUMA mempolicy, and applied also to anonymous pages in do_swap_page(); + * but for now it is a simple matter of zone. + */ +static bool shmem_should_replace_page(struct page *page, gfp_t gfp) +{ + return page_zonenum(page) > gfp_zone(gfp); +} + +static int shmem_replace_page(struct page **pagep, gfp_t gfp, + struct shmem_inode_info *info, pgoff_t index) +{ + struct page *oldpage, *newpage; + struct address_space *swap_mapping; + pgoff_t swap_index; + int error; + + oldpage = *pagep; + swap_index = page_private(oldpage); + swap_mapping = page_mapping(oldpage); + + /* + * We have arrived here because our zones are constrained, so don't + * limit chance of success by further cpuset and node constraints. 
+ */ + gfp &= ~GFP_CONSTRAINT_MASK; + newpage = shmem_alloc_page(gfp, info, index); + if (!newpage) + return -ENOMEM; + + page_cache_get(newpage); + copy_highpage(newpage, oldpage); + flush_dcache_page(newpage); + + __set_page_locked(newpage); + SetPageUptodate(newpage); + SetPageSwapBacked(newpage); + set_page_private(newpage, swap_index); + SetPageSwapCache(newpage); + + /* + * Our caller will very soon move newpage out of swapcache, but it's + * a nice clean interface for us to replace oldpage by newpage there. + */ + spin_lock_irq(&swap_mapping->tree_lock); + error = shmem_radix_tree_replace(swap_mapping, swap_index, oldpage, + newpage); + if (!error) { + __inc_zone_page_state(newpage, NR_FILE_PAGES); + __dec_zone_page_state(oldpage, NR_FILE_PAGES); + } + spin_unlock_irq(&swap_mapping->tree_lock); + + if (unlikely(error)) { + /* + * Is this possible? I think not, now that our callers check + * both PageSwapCache and page_private after getting page lock; + * but be defensive. Reverse old to newpage for clear and free. + */ + oldpage = newpage; + } else { + mem_cgroup_replace_page_cache(oldpage, newpage); + lru_cache_add_anon(newpage); + *pagep = newpage; + } + + ClearPageSwapCache(oldpage); + set_page_private(oldpage, 0); + + unlock_page(oldpage); + page_cache_release(oldpage); + page_cache_release(oldpage); + return error; +} + +/* * shmem_getpage_gfp - find page in cache, or get from swap, or allocate * * If we allocate a new one we do not mark it dirty. That's up to the @@ -872,6 +1078,7 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, swp_entry_t swap; int error; int once = 0; + int alloced = 0; if (index > (MAX_LFS_FILESIZE >> PAGE_CACHE_SHIFT)) return -EFBIG; @@ -883,19 +1090,21 @@ repeat: page = NULL; } - if (sgp != SGP_WRITE && + if (sgp != SGP_WRITE && sgp != SGP_FALLOC && ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) { error = -EINVAL; goto failed; } + /* fallocated page? 
*/ + if (page && !PageUptodate(page)) { + if (sgp != SGP_READ) + goto clear; + unlock_page(page); + page_cache_release(page); + page = NULL; + } if (page || (sgp == SGP_READ && !swap.val)) { - /* - * Once we can get the page lock, it must be uptodate: - * if there were an error in reading back from swap, - * the page would not be inserted into the filecache. - */ - BUG_ON(page && !PageUptodate(page)); *pagep = page; return 0; } @@ -923,26 +1132,31 @@ repeat: /* We have to do this with page locked to prevent races */ lock_page(page); + if (!PageSwapCache(page) || page_private(page) != swap.val || + !shmem_confirm_swap(mapping, index, swap)) { + error = -EEXIST; /* try again */ + goto unlock; + } if (!PageUptodate(page)) { error = -EIO; goto failed; } wait_on_page_writeback(page); - /* Someone may have already done it for us */ - if (page->mapping) { - if (page->mapping == mapping && - page->index == index) - goto done; - error = -EEXIST; - goto failed; + if (shmem_should_replace_page(page, gfp)) { + error = shmem_replace_page(&page, gfp, info, index); + if (error) + goto failed; } error = mem_cgroup_cache_charge(page, current->mm, gfp & GFP_RECLAIM_MASK); - if (!error) + if (!error) { error = shmem_add_to_page_cache(page, mapping, index, gfp, swp_to_radix_entry(swap)); + /* We already confirmed swap, and make no allocation */ + VM_BUG_ON(error); + } if (error) goto failed; @@ -979,11 +1193,18 @@ repeat: __set_page_locked(page); error = mem_cgroup_cache_charge(page, current->mm, gfp & GFP_RECLAIM_MASK); - if (!error) - error = shmem_add_to_page_cache(page, mapping, index, - gfp, NULL); if (error) goto decused; + error = radix_tree_preload(gfp & GFP_RECLAIM_MASK); + if (!error) { + error = shmem_add_to_page_cache(page, mapping, index, + gfp, NULL); + radix_tree_preload_end(); + } + if (error) { + mem_cgroup_uncharge_cache_page(page); + goto decused; + } lru_cache_add_anon(page); spin_lock(&info->lock); @@ -991,19 +1212,36 @@ repeat: inode->i_blocks += BLOCKS_PER_PAGE; 
shmem_recalc_inode(inode); spin_unlock(&info->lock); + alloced = true; - clear_highpage(page); - flush_dcache_page(page); - SetPageUptodate(page); + /* + * Let SGP_FALLOC use the SGP_WRITE optimization on a new page. + */ + if (sgp == SGP_FALLOC) + sgp = SGP_WRITE; +clear: + /* + * Let SGP_WRITE caller clear ends if write does not fill page; + * but SGP_FALLOC on a page fallocated earlier must initialize + * it now, lest undo on failure cancel our earlier guarantee. + */ + if (sgp != SGP_WRITE) { + clear_highpage(page); + flush_dcache_page(page); + SetPageUptodate(page); + } if (sgp == SGP_DIRTY) set_page_dirty(page); } -done: + /* Perhaps the file has been truncated since we checked */ - if (sgp != SGP_WRITE && + if (sgp != SGP_WRITE && sgp != SGP_FALLOC && ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) { error = -EINVAL; - goto trunc; + if (alloced) + goto trunc; + else + goto failed; } *pagep = page; return 0; @@ -1012,6 +1250,7 @@ done: * Error recovery. */ trunc: + info = SHMEM_I(inode); ClearPageDirty(page); delete_from_page_cache(page); spin_lock(&info->lock); @@ -1019,19 +1258,16 @@ trunc: inode->i_blocks -= BLOCKS_PER_PAGE; spin_unlock(&info->lock); decused: + sbinfo = SHMEM_SB(inode->i_sb); if (sbinfo->max_blocks) percpu_counter_add(&sbinfo->used_blocks, -1); unacct: shmem_unacct_blocks(info->flags, 1); failed: - if (swap.val && error != -EINVAL) { - struct page *test = find_get_page(mapping, index); - if (test && !radix_tree_exceptional_entry(test)) - page_cache_release(test); - /* Have another try if the entry has changed */ - if (test != swp_to_radix_entry(swap)) - error = -EEXIST; - } + if (swap.val && error != -EINVAL && + !shmem_confirm_swap(mapping, index, swap)) + error = -EEXIST; +unlock: if (page) { unlock_page(page); page_cache_release(page); @@ -1043,7 +1279,7 @@ failed: spin_unlock(&info->lock); goto repeat; } - if (error == -EEXIST) + if (error == -EEXIST) /* from above or from radix_tree_insert */ goto repeat; return error; } @@ 
-1204,6 +1440,14 @@ shmem_write_end(struct file *file, struct address_space *mapping, if (pos + copied > inode->i_size) i_size_write(inode, pos + copied); + if (!PageUptodate(page)) { + if (copied < PAGE_CACHE_SIZE) { + unsigned from = pos & (PAGE_CACHE_SIZE - 1); + zero_user_segments(page, 0, from, + from + copied, PAGE_CACHE_SIZE); + } + SetPageUptodate(page); + } set_page_dirty(page); unlock_page(page); page_cache_release(page); @@ -1365,6 +1609,7 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, struct splice_pipe_desc spd = { .pages = pages, .partial = partial, + .nr_pages_max = PIPE_DEF_BUFFERS, .flags = flags, .ops = &page_cache_pipe_buf_ops, .spd_release = spd_release_page, @@ -1453,7 +1698,7 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, if (spd.nr_pages) error = splice_to_pipe(pipe, &spd); - splice_shrink_spd(pipe, &spd); + splice_shrink_spd(&spd); if (error > 0) { *ppos += error; @@ -1462,6 +1707,107 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, return error; } +static long shmem_fallocate(struct file *file, int mode, loff_t offset, + loff_t len) +{ + struct inode *inode = file->f_path.dentry->d_inode; + struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); + struct shmem_falloc shmem_falloc; + pgoff_t start, index, end; + int error; + + mutex_lock(&inode->i_mutex); + + if (mode & FALLOC_FL_PUNCH_HOLE) { + struct address_space *mapping = file->f_mapping; + loff_t unmap_start = round_up(offset, PAGE_SIZE); + loff_t unmap_end = round_down(offset + len, PAGE_SIZE) - 1; + + if ((u64)unmap_end > (u64)unmap_start) + unmap_mapping_range(mapping, unmap_start, + 1 + unmap_end - unmap_start, 0); + shmem_truncate_range(inode, offset, offset + len - 1); + /* No need to unmap again: hole-punching leaves COWed pages */ + error = 0; + goto out; + } + + /* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */ + error = inode_newsize_ok(inode, offset + len); + if (error) + goto out; + + start = 
offset >> PAGE_CACHE_SHIFT; + end = (offset + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + /* Try to avoid a swapstorm if len is impossible to satisfy */ + if (sbinfo->max_blocks && end - start > sbinfo->max_blocks) { + error = -ENOSPC; + goto out; + } + + shmem_falloc.start = start; + shmem_falloc.next = start; + shmem_falloc.nr_falloced = 0; + shmem_falloc.nr_unswapped = 0; + spin_lock(&inode->i_lock); + inode->i_private = &shmem_falloc; + spin_unlock(&inode->i_lock); + + for (index = start; index < end; index++) { + struct page *page; + + /* + * Good, the fallocate(2) manpage permits EINTR: we may have + * been interrupted because we are using up too much memory. + */ + if (signal_pending(current)) + error = -EINTR; + else if (shmem_falloc.nr_unswapped > shmem_falloc.nr_falloced) + error = -ENOMEM; + else + error = shmem_getpage(inode, index, &page, SGP_FALLOC, + NULL); + if (error) { + /* Remove the !PageUptodate pages we added */ + shmem_undo_range(inode, + (loff_t)start << PAGE_CACHE_SHIFT, + (loff_t)index << PAGE_CACHE_SHIFT, true); + goto undone; + } + + /* + * Inform shmem_writepage() how far we have reached. + * No need for lock or barrier: we have the page lock. + */ + shmem_falloc.next++; + if (!PageUptodate(page)) + shmem_falloc.nr_falloced++; + + /* + * If !PageUptodate, leave it that way so that freeable pages + * can be recognized if we need to rollback on error later. + * But set_page_dirty so that memory pressure will swap rather + * than free the pages we are allocating (and SGP_CACHE pages + * might still be clean: we now need to mark those dirty too). 
+ */ + set_page_dirty(page); + unlock_page(page); + page_cache_release(page); + cond_resched(); + } + + if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size) + i_size_write(inode, offset + len); + inode->i_ctime = CURRENT_TIME; +undone: + spin_lock(&inode->i_lock); + inode->i_private = NULL; + spin_unlock(&inode->i_lock); +out: + mutex_unlock(&inode->i_mutex); + return error; +} + static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf) { struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb); @@ -1531,7 +1877,7 @@ static int shmem_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) } static int shmem_create(struct inode *dir, struct dentry *dentry, umode_t mode, - struct nameidata *nd) + bool excl) { return shmem_mknod(dir, dentry, mode | S_IFREG, 0); } @@ -1665,6 +2011,7 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s kaddr = kmap_atomic(page); memcpy(kaddr, symname, len); kunmap_atomic(kaddr); + SetPageUptodate(page); set_page_dirty(page); unlock_page(page); page_cache_release(page); @@ -2033,11 +2380,9 @@ static struct dentry *shmem_fh_to_dentry(struct super_block *sb, return dentry; } -static int shmem_encode_fh(struct dentry *dentry, __u32 *fh, int *len, - int connectable) +static int shmem_encode_fh(struct inode *inode, __u32 *fh, int *len, + struct inode *parent) { - struct inode *inode = dentry->d_inode; - if (*len < 3) { *len = 3; return 255; @@ -2075,6 +2420,8 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo, bool remount) { char *this_char, *value, *rest; + uid_t uid; + gid_t gid; while (options != NULL) { this_char = options; @@ -2134,15 +2481,21 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo, } else if (!strcmp(this_char,"uid")) { if (remount) continue; - sbinfo->uid = simple_strtoul(value, &rest, 0); + uid = simple_strtoul(value, &rest, 0); if (*rest) goto bad_val; + sbinfo->uid = make_kuid(current_user_ns(), uid); + if 
(!uid_valid(sbinfo->uid)) + goto bad_val; } else if (!strcmp(this_char,"gid")) { if (remount) continue; - sbinfo->gid = simple_strtoul(value, &rest, 0); + gid = simple_strtoul(value, &rest, 0); if (*rest) goto bad_val; + sbinfo->gid = make_kgid(current_user_ns(), gid); + if (!gid_valid(sbinfo->gid)) + goto bad_val; } else if (!strcmp(this_char,"mpol")) { if (mpol_parse_str(value, &sbinfo->mpol, 1)) goto bad_val; @@ -2210,10 +2563,12 @@ static int shmem_show_options(struct seq_file *seq, struct dentry *root) seq_printf(seq, ",nr_inodes=%lu", sbinfo->max_inodes); if (sbinfo->mode != (S_IRWXUGO | S_ISVTX)) seq_printf(seq, ",mode=%03ho", sbinfo->mode); - if (sbinfo->uid != 0) - seq_printf(seq, ",uid=%u", sbinfo->uid); - if (sbinfo->gid != 0) - seq_printf(seq, ",gid=%u", sbinfo->gid); + if (!uid_eq(sbinfo->uid, GLOBAL_ROOT_UID)) + seq_printf(seq, ",uid=%u", + from_kuid_munged(&init_user_ns, sbinfo->uid)); + if (!gid_eq(sbinfo->gid, GLOBAL_ROOT_GID)) + seq_printf(seq, ",gid=%u", + from_kgid_munged(&init_user_ns, sbinfo->gid)); shmem_show_mpol(seq, sbinfo->mpol); return 0; } @@ -2260,6 +2615,7 @@ int shmem_fill_super(struct super_block *sb, void *data, int silent) } } sb->s_export_op = &shmem_export_ops; + sb->s_flags |= MS_NOSEC; #else sb->s_flags |= MS_NOUSER; #endif @@ -2362,12 +2718,12 @@ static const struct file_operations shmem_file_operations = { .fsync = noop_fsync, .splice_read = shmem_file_splice_read, .splice_write = generic_file_splice_write, + .fallocate = shmem_fallocate, #endif }; static const struct inode_operations shmem_inode_operations = { .setattr = shmem_setattr, - .truncate_range = shmem_truncate_range, #ifdef CONFIG_TMPFS_XATTR .setxattr = shmem_setxattr, .getxattr = shmem_getxattr, diff --git a/mm/slab.c b/mm/slab.c index e901a36e2520..1fcf3ac94b6c 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -68,7 +68,7 @@ * Further notes from the original documentation: * * 11 April '97. 
Started multi-threading - markhe - * The global cache-chain is protected by the mutex 'cache_chain_mutex'. + * The global cache-chain is protected by the mutex 'slab_mutex'. * The sem is only needed when accessing/extending the cache-chain, which * can never happen inside an interrupt (kmem_cache_create(), * kmem_cache_shrink() and kmem_cache_reap()). @@ -87,6 +87,7 @@ */ #include <linux/slab.h> +#include "slab.h" #include <linux/mm.h> #include <linux/poison.h> #include <linux/swap.h> @@ -424,8 +425,8 @@ static void kmem_list3_init(struct kmem_list3 *parent) * cachep->obj_offset - BYTES_PER_WORD .. cachep->obj_offset - 1: * redzone word. * cachep->obj_offset: The real object. - * cachep->buffer_size - 2* BYTES_PER_WORD: redzone word [BYTES_PER_WORD long] - * cachep->buffer_size - 1* BYTES_PER_WORD: last caller address + * cachep->size - 2* BYTES_PER_WORD: redzone word [BYTES_PER_WORD long] + * cachep->size - 1* BYTES_PER_WORD: last caller address * [BYTES_PER_WORD long] */ static int obj_offset(struct kmem_cache *cachep) @@ -433,11 +434,6 @@ static int obj_offset(struct kmem_cache *cachep) return cachep->obj_offset; } -static int obj_size(struct kmem_cache *cachep) -{ - return cachep->obj_size; -} - static unsigned long long *dbg_redzone1(struct kmem_cache *cachep, void *objp) { BUG_ON(!(cachep->flags & SLAB_RED_ZONE)); @@ -449,23 +445,22 @@ static unsigned long long *dbg_redzone2(struct kmem_cache *cachep, void *objp) { BUG_ON(!(cachep->flags & SLAB_RED_ZONE)); if (cachep->flags & SLAB_STORE_USER) - return (unsigned long long *)(objp + cachep->buffer_size - + return (unsigned long long *)(objp + cachep->size - sizeof(unsigned long long) - REDZONE_ALIGN); - return (unsigned long long *) (objp + cachep->buffer_size - + return (unsigned long long *) (objp + cachep->size - sizeof(unsigned long long)); } static void **dbg_userword(struct kmem_cache *cachep, void *objp) { BUG_ON(!(cachep->flags & SLAB_STORE_USER)); - return (void **)(objp + cachep->buffer_size - 
BYTES_PER_WORD); + return (void **)(objp + cachep->size - BYTES_PER_WORD); } #else #define obj_offset(x) 0 -#define obj_size(cachep) (cachep->buffer_size) #define dbg_redzone1(cachep, objp) ({BUG(); (unsigned long long *)NULL;}) #define dbg_redzone2(cachep, objp) ({BUG(); (unsigned long long *)NULL;}) #define dbg_userword(cachep, objp) ({BUG(); (void **)NULL;}) @@ -475,7 +470,7 @@ static void **dbg_userword(struct kmem_cache *cachep, void *objp) #ifdef CONFIG_TRACING size_t slab_buffer_size(struct kmem_cache *cachep) { - return cachep->buffer_size; + return cachep->size; } EXPORT_SYMBOL(slab_buffer_size); #endif @@ -489,56 +484,37 @@ EXPORT_SYMBOL(slab_buffer_size); static int slab_max_order = SLAB_MAX_ORDER_LO; static bool slab_max_order_set __initdata; -/* - * Functions for storing/retrieving the cachep and or slab from the page - * allocator. These are used to find the slab an obj belongs to. With kfree(), - * these are used to find the cache which an obj belongs to. - */ -static inline void page_set_cache(struct page *page, struct kmem_cache *cache) -{ - page->lru.next = (struct list_head *)cache; -} - static inline struct kmem_cache *page_get_cache(struct page *page) { page = compound_head(page); BUG_ON(!PageSlab(page)); - return (struct kmem_cache *)page->lru.next; -} - -static inline void page_set_slab(struct page *page, struct slab *slab) -{ - page->lru.prev = (struct list_head *)slab; -} - -static inline struct slab *page_get_slab(struct page *page) -{ - BUG_ON(!PageSlab(page)); - return (struct slab *)page->lru.prev; + return page->slab_cache; } static inline struct kmem_cache *virt_to_cache(const void *obj) { struct page *page = virt_to_head_page(obj); - return page_get_cache(page); + return page->slab_cache; } static inline struct slab *virt_to_slab(const void *obj) { struct page *page = virt_to_head_page(obj); - return page_get_slab(page); + + VM_BUG_ON(!PageSlab(page)); + return page->slab_page; } static inline void *index_to_obj(struct kmem_cache 
*cache, struct slab *slab, unsigned int idx) { - return slab->s_mem + cache->buffer_size * idx; + return slab->s_mem + cache->size * idx; } /* - * We want to avoid an expensive divide : (offset / cache->buffer_size) - * Using the fact that buffer_size is a constant for a particular cache, - * we can replace (offset / cache->buffer_size) by + * We want to avoid an expensive divide : (offset / cache->size) + * Using the fact that size is a constant for a particular cache, + * we can replace (offset / cache->size) by * reciprocal_divide(offset, cache->reciprocal_buffer_size) */ static inline unsigned int obj_to_index(const struct kmem_cache *cache, @@ -584,33 +560,12 @@ static struct kmem_cache cache_cache = { .batchcount = 1, .limit = BOOT_CPUCACHE_ENTRIES, .shared = 1, - .buffer_size = sizeof(struct kmem_cache), + .size = sizeof(struct kmem_cache), .name = "kmem_cache", }; #define BAD_ALIEN_MAGIC 0x01020304ul -/* - * chicken and egg problem: delay the per-cpu array allocation - * until the general caches are up. - */ -static enum { - NONE, - PARTIAL_AC, - PARTIAL_L3, - EARLY, - LATE, - FULL -} g_cpucache_up; - -/* - * used by boot code to determine if it can use slab based allocator - */ -int slab_is_available(void) -{ - return g_cpucache_up >= EARLY; -} - #ifdef CONFIG_LOCKDEP /* @@ -676,7 +631,7 @@ static void init_node_lock_keys(int q) { struct cache_sizes *s = malloc_sizes; - if (g_cpucache_up < LATE) + if (slab_state < UP) return; for (s = malloc_sizes; s->cs_size != ULONG_MAX; s++) { @@ -716,12 +671,6 @@ static void slab_set_debugobj_lock_classes(struct kmem_cache *cachep) } #endif -/* - * Guard access to the cache-chain. 
- */ -static DEFINE_MUTEX(cache_chain_mutex); -static struct list_head cache_chain; - static DEFINE_PER_CPU(struct delayed_work, slab_reap_work); static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep) @@ -1145,7 +1094,7 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) * When hotplugging memory or a cpu, existing nodelists are not replaced if * already in use. * - * Must hold cache_chain_mutex. + * Must hold slab_mutex. */ static int init_cache_nodelists_node(int node) { @@ -1153,7 +1102,7 @@ static int init_cache_nodelists_node(int node) struct kmem_list3 *l3; const int memsize = sizeof(struct kmem_list3); - list_for_each_entry(cachep, &cache_chain, next) { + list_for_each_entry(cachep, &slab_caches, list) { /* * Set up the size64 kmemlist for cpu before we can * begin anything. Make sure some other cpu on this @@ -1169,7 +1118,7 @@ static int init_cache_nodelists_node(int node) /* * The l3s don't come and go as CPUs come and - * go. cache_chain_mutex is sufficient + * go. slab_mutex is sufficient * protection here. */ cachep->nodelists[node] = l3; @@ -1191,7 +1140,7 @@ static void __cpuinit cpuup_canceled(long cpu) int node = cpu_to_mem(cpu); const struct cpumask *mask = cpumask_of_node(node); - list_for_each_entry(cachep, &cache_chain, next) { + list_for_each_entry(cachep, &slab_caches, list) { struct array_cache *nc; struct array_cache *shared; struct array_cache **alien; @@ -1241,7 +1190,7 @@ free_array_cache: * the respective cache's slabs, now we can go ahead and * shrink each nodelist to its limit. 
*/ - list_for_each_entry(cachep, &cache_chain, next) { + list_for_each_entry(cachep, &slab_caches, list) { l3 = cachep->nodelists[node]; if (!l3) continue; @@ -1270,7 +1219,7 @@ static int __cpuinit cpuup_prepare(long cpu) * Now we can go ahead with allocating the shared arrays and * array caches */ - list_for_each_entry(cachep, &cache_chain, next) { + list_for_each_entry(cachep, &slab_caches, list) { struct array_cache *nc; struct array_cache *shared = NULL; struct array_cache **alien = NULL; @@ -1338,9 +1287,9 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb, switch (action) { case CPU_UP_PREPARE: case CPU_UP_PREPARE_FROZEN: - mutex_lock(&cache_chain_mutex); + mutex_lock(&slab_mutex); err = cpuup_prepare(cpu); - mutex_unlock(&cache_chain_mutex); + mutex_unlock(&slab_mutex); break; case CPU_ONLINE: case CPU_ONLINE_FROZEN: @@ -1350,7 +1299,7 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb, case CPU_DOWN_PREPARE: case CPU_DOWN_PREPARE_FROZEN: /* - * Shutdown cache reaper. Note that the cache_chain_mutex is + * Shutdown cache reaper. Note that the slab_mutex is * held so that if cache_reap() is invoked it cannot do * anything expensive but will only modify reap_work * and reschedule the timer. @@ -1377,9 +1326,9 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb, #endif case CPU_UP_CANCELED: case CPU_UP_CANCELED_FROZEN: - mutex_lock(&cache_chain_mutex); + mutex_lock(&slab_mutex); cpuup_canceled(cpu); - mutex_unlock(&cache_chain_mutex); + mutex_unlock(&slab_mutex); break; } return notifier_from_errno(err); @@ -1395,14 +1344,14 @@ static struct notifier_block __cpuinitdata cpucache_notifier = { * Returns -EBUSY if all objects cannot be drained so that the node is not * removed. * - * Must hold cache_chain_mutex. + * Must hold slab_mutex. 
*/ static int __meminit drain_cache_nodelists_node(int node) { struct kmem_cache *cachep; int ret = 0; - list_for_each_entry(cachep, &cache_chain, next) { + list_for_each_entry(cachep, &slab_caches, list) { struct kmem_list3 *l3; l3 = cachep->nodelists[node]; @@ -1433,14 +1382,14 @@ static int __meminit slab_memory_callback(struct notifier_block *self, switch (action) { case MEM_GOING_ONLINE: - mutex_lock(&cache_chain_mutex); + mutex_lock(&slab_mutex); ret = init_cache_nodelists_node(nid); - mutex_unlock(&cache_chain_mutex); + mutex_unlock(&slab_mutex); break; case MEM_GOING_OFFLINE: - mutex_lock(&cache_chain_mutex); + mutex_lock(&slab_mutex); ret = drain_cache_nodelists_node(nid); - mutex_unlock(&cache_chain_mutex); + mutex_unlock(&slab_mutex); break; case MEM_ONLINE: case MEM_OFFLINE: @@ -1544,8 +1493,8 @@ void __init kmem_cache_init(void) node = numa_mem_id(); /* 1) create the cache_cache */ - INIT_LIST_HEAD(&cache_chain); - list_add(&cache_cache.next, &cache_chain); + INIT_LIST_HEAD(&slab_caches); + list_add(&cache_cache.list, &slab_caches); cache_cache.colour_off = cache_line_size(); cache_cache.array[smp_processor_id()] = &initarray_cache.cache; cache_cache.nodelists[node] = &initkmem_list3[CACHE_CACHE + node]; @@ -1553,18 +1502,16 @@ void __init kmem_cache_init(void) /* * struct kmem_cache size depends on nr_node_ids & nr_cpu_ids */ - cache_cache.buffer_size = offsetof(struct kmem_cache, array[nr_cpu_ids]) + + cache_cache.size = offsetof(struct kmem_cache, array[nr_cpu_ids]) + nr_node_ids * sizeof(struct kmem_list3 *); -#if DEBUG - cache_cache.obj_size = cache_cache.buffer_size; -#endif - cache_cache.buffer_size = ALIGN(cache_cache.buffer_size, + cache_cache.object_size = cache_cache.size; + cache_cache.size = ALIGN(cache_cache.size, cache_line_size()); cache_cache.reciprocal_buffer_size = - reciprocal_value(cache_cache.buffer_size); + reciprocal_value(cache_cache.size); for (order = 0; order < MAX_ORDER; order++) { - cache_estimate(order, 
cache_cache.buffer_size, + cache_estimate(order, cache_cache.size, cache_line_size(), 0, &left_over, &cache_cache.num); if (cache_cache.num) break; @@ -1585,7 +1532,7 @@ void __init kmem_cache_init(void) * bug. */ - sizes[INDEX_AC].cs_cachep = kmem_cache_create(names[INDEX_AC].name, + sizes[INDEX_AC].cs_cachep = __kmem_cache_create(names[INDEX_AC].name, sizes[INDEX_AC].cs_size, ARCH_KMALLOC_MINALIGN, ARCH_KMALLOC_FLAGS|SLAB_PANIC, @@ -1593,7 +1540,7 @@ void __init kmem_cache_init(void) if (INDEX_AC != INDEX_L3) { sizes[INDEX_L3].cs_cachep = - kmem_cache_create(names[INDEX_L3].name, + __kmem_cache_create(names[INDEX_L3].name, sizes[INDEX_L3].cs_size, ARCH_KMALLOC_MINALIGN, ARCH_KMALLOC_FLAGS|SLAB_PANIC, @@ -1611,14 +1558,14 @@ void __init kmem_cache_init(void) * allow tighter packing of the smaller caches. */ if (!sizes->cs_cachep) { - sizes->cs_cachep = kmem_cache_create(names->name, + sizes->cs_cachep = __kmem_cache_create(names->name, sizes->cs_size, ARCH_KMALLOC_MINALIGN, ARCH_KMALLOC_FLAGS|SLAB_PANIC, NULL); } #ifdef CONFIG_ZONE_DMA - sizes->cs_dmacachep = kmem_cache_create( + sizes->cs_dmacachep = __kmem_cache_create( names->name_dma, sizes->cs_size, ARCH_KMALLOC_MINALIGN, @@ -1676,27 +1623,27 @@ void __init kmem_cache_init(void) } } - g_cpucache_up = EARLY; + slab_state = UP; } void __init kmem_cache_init_late(void) { struct kmem_cache *cachep; - g_cpucache_up = LATE; + slab_state = UP; /* Annotate slab for lockdep -- annotate the malloc caches */ init_lock_keys(); /* 6) resize the head arrays to their final sizes */ - mutex_lock(&cache_chain_mutex); - list_for_each_entry(cachep, &cache_chain, next) + mutex_lock(&slab_mutex); + list_for_each_entry(cachep, &slab_caches, list) if (enable_cpucache(cachep, GFP_NOWAIT)) BUG(); - mutex_unlock(&cache_chain_mutex); + mutex_unlock(&slab_mutex); /* Done! 
*/ - g_cpucache_up = FULL; + slab_state = FULL; /* * Register a cpu startup notifier callback that initializes @@ -1727,6 +1674,9 @@ static int __init cpucache_init(void) */ for_each_online_cpu(cpu) start_cpu_timer(cpu); + + /* Done! */ + slab_state = FULL; return 0; } __initcall(cpucache_init); @@ -1743,7 +1693,7 @@ slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid) "SLAB: Unable to allocate memory on node %d (gfp=0x%x)\n", nodeid, gfpflags); printk(KERN_WARNING " cache: %s, object size: %d, order: %d\n", - cachep->name, cachep->buffer_size, cachep->gfporder); + cachep->name, cachep->size, cachep->gfporder); for_each_online_node(node) { unsigned long active_objs = 0, num_objs = 0, free_objects = 0; @@ -1798,7 +1748,7 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid) flags |= __GFP_COMP; #endif - flags |= cachep->gfpflags; + flags |= cachep->allocflags; if (cachep->flags & SLAB_RECLAIM_ACCOUNT) flags |= __GFP_RECLAIMABLE; @@ -1874,7 +1824,7 @@ static void kmem_rcu_free(struct rcu_head *head) static void store_stackinfo(struct kmem_cache *cachep, unsigned long *addr, unsigned long caller) { - int size = obj_size(cachep); + int size = cachep->object_size; addr = (unsigned long *)&((char *)addr)[obj_offset(cachep)]; @@ -1906,7 +1856,7 @@ static void store_stackinfo(struct kmem_cache *cachep, unsigned long *addr, static void poison_obj(struct kmem_cache *cachep, void *addr, unsigned char val) { - int size = obj_size(cachep); + int size = cachep->object_size; addr = &((char *)addr)[obj_offset(cachep)]; memset(addr, val, size); @@ -1966,7 +1916,7 @@ static void print_objinfo(struct kmem_cache *cachep, void *objp, int lines) printk("\n"); } realobj = (char *)objp + obj_offset(cachep); - size = obj_size(cachep); + size = cachep->object_size; for (i = 0; i < size && lines; i += 16, lines--) { int limit; limit = 16; @@ -1983,7 +1933,7 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp) int lines = 0; 
realobj = (char *)objp + obj_offset(cachep); - size = obj_size(cachep); + size = cachep->object_size; for (i = 0; i < size; i++) { char exp = POISON_FREE; @@ -2047,10 +1997,10 @@ static void slab_destroy_debugcheck(struct kmem_cache *cachep, struct slab *slab if (cachep->flags & SLAB_POISON) { #ifdef CONFIG_DEBUG_PAGEALLOC - if (cachep->buffer_size % PAGE_SIZE == 0 && + if (cachep->size % PAGE_SIZE == 0 && OFF_SLAB(cachep)) kernel_map_pages(virt_to_page(objp), - cachep->buffer_size / PAGE_SIZE, 1); + cachep->size / PAGE_SIZE, 1); else check_poison_obj(cachep, objp); #else @@ -2194,10 +2144,10 @@ static size_t calculate_slab_order(struct kmem_cache *cachep, static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp) { - if (g_cpucache_up == FULL) + if (slab_state >= FULL) return enable_cpucache(cachep, gfp); - if (g_cpucache_up == NONE) { + if (slab_state == DOWN) { /* * Note: the first kmem_cache_create must create the cache * that's used by kmalloc(24), otherwise the creation of @@ -2212,16 +2162,16 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp) */ set_up_list3s(cachep, SIZE_AC); if (INDEX_AC == INDEX_L3) - g_cpucache_up = PARTIAL_L3; + slab_state = PARTIAL_L3; else - g_cpucache_up = PARTIAL_AC; + slab_state = PARTIAL_ARRAYCACHE; } else { cachep->array[smp_processor_id()] = kmalloc(sizeof(struct arraycache_init), gfp); - if (g_cpucache_up == PARTIAL_AC) { + if (slab_state == PARTIAL_ARRAYCACHE) { set_up_list3s(cachep, SIZE_L3); - g_cpucache_up = PARTIAL_L3; + slab_state = PARTIAL_L3; } else { int node; for_each_online_node(node) { @@ -2247,7 +2197,7 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp) } /** - * kmem_cache_create - Create a cache. + * __kmem_cache_create - Create a cache. * @name: A string which is used in /proc/slabinfo to identify this cache. * @size: The size of objects to be created in this cache. * @align: The required alignment for the objects. 
@@ -2274,59 +2224,14 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp) * as davem. */ struct kmem_cache * -kmem_cache_create (const char *name, size_t size, size_t align, +__kmem_cache_create (const char *name, size_t size, size_t align, unsigned long flags, void (*ctor)(void *)) { size_t left_over, slab_size, ralign; - struct kmem_cache *cachep = NULL, *pc; + struct kmem_cache *cachep = NULL; gfp_t gfp; - /* - * Sanity checks... these are all serious usage bugs. - */ - if (!name || in_interrupt() || (size < BYTES_PER_WORD) || - size > KMALLOC_MAX_SIZE) { - printk(KERN_ERR "%s: Early error in slab %s\n", __func__, - name); - BUG(); - } - - /* - * We use cache_chain_mutex to ensure a consistent view of - * cpu_online_mask as well. Please see cpuup_callback - */ - if (slab_is_available()) { - get_online_cpus(); - mutex_lock(&cache_chain_mutex); - } - - list_for_each_entry(pc, &cache_chain, next) { - char tmp; - int res; - - /* - * This happens when the module gets unloaded and doesn't - * destroy its slab cache and no-one else reuses the vmalloc - * area of the module. Print a warning. - */ - res = probe_kernel_address(pc->name, tmp); - if (res) { - printk(KERN_ERR - "SLAB: cache with size %d has lost its name\n", - pc->buffer_size); - continue; - } - - if (!strcmp(pc->name, name)) { - printk(KERN_ERR - "kmem_cache_create: duplicate cache %s\n", name); - dump_stack(); - goto oops; - } - } - #if DEBUG - WARN_ON(strchr(name, ' ')); /* It confuses parsers */ #if FORCED_DEBUG /* * Enable redzoning and last user accounting, except for caches with @@ -2415,11 +2320,12 @@ kmem_cache_create (const char *name, size_t size, size_t align, /* Get cache's description obj. 
*/ cachep = kmem_cache_zalloc(&cache_cache, gfp); if (!cachep) - goto oops; + return NULL; cachep->nodelists = (struct kmem_list3 **)&cachep->array[nr_cpu_ids]; + cachep->object_size = size; + cachep->align = align; #if DEBUG - cachep->obj_size = size; /* * Both debugging options require word-alignment which is calculated @@ -2442,7 +2348,7 @@ kmem_cache_create (const char *name, size_t size, size_t align, } #if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC) if (size >= malloc_sizes[INDEX_L3 + 1].cs_size - && cachep->obj_size > cache_line_size() && ALIGN(size, align) < PAGE_SIZE) { + && cachep->object_size > cache_line_size() && ALIGN(size, align) < PAGE_SIZE) { cachep->obj_offset += PAGE_SIZE - ALIGN(size, align); size = PAGE_SIZE; } @@ -2471,8 +2377,7 @@ kmem_cache_create (const char *name, size_t size, size_t align, printk(KERN_ERR "kmem_cache_create: couldn't create cache %s.\n", name); kmem_cache_free(&cache_cache, cachep); - cachep = NULL; - goto oops; + return NULL; } slab_size = ALIGN(cachep->num * sizeof(kmem_bufctl_t) + sizeof(struct slab), align); @@ -2508,10 +2413,10 @@ kmem_cache_create (const char *name, size_t size, size_t align, cachep->colour = left_over / cachep->colour_off; cachep->slab_size = slab_size; cachep->flags = flags; - cachep->gfpflags = 0; + cachep->allocflags = 0; if (CONFIG_ZONE_DMA_FLAG && (flags & SLAB_CACHE_DMA)) - cachep->gfpflags |= GFP_DMA; - cachep->buffer_size = size; + cachep->allocflags |= GFP_DMA; + cachep->size = size; cachep->reciprocal_buffer_size = reciprocal_value(size); if (flags & CFLGS_OFF_SLAB) { @@ -2530,8 +2435,7 @@ kmem_cache_create (const char *name, size_t size, size_t align, if (setup_cpu_cache(cachep, gfp)) { __kmem_cache_destroy(cachep); - cachep = NULL; - goto oops; + return NULL; } if (flags & SLAB_DEBUG_OBJECTS) { @@ -2545,18 +2449,9 @@ kmem_cache_create (const char *name, size_t size, size_t align, } /* cache setup completed, link it into the list */ - list_add(&cachep->next, &cache_chain); -oops: - 
if (!cachep && (flags & SLAB_PANIC)) - panic("kmem_cache_create(): failed to create slab `%s'\n", - name); - if (slab_is_available()) { - mutex_unlock(&cache_chain_mutex); - put_online_cpus(); - } + list_add(&cachep->list, &slab_caches); return cachep; } -EXPORT_SYMBOL(kmem_cache_create); #if DEBUG static void check_irq_off(void) @@ -2671,7 +2566,7 @@ out: return nr_freed; } -/* Called with cache_chain_mutex held to protect against cpu hotplug */ +/* Called with slab_mutex held to protect against cpu hotplug */ static int __cache_shrink(struct kmem_cache *cachep) { int ret = 0, i = 0; @@ -2706,9 +2601,9 @@ int kmem_cache_shrink(struct kmem_cache *cachep) BUG_ON(!cachep || in_interrupt()); get_online_cpus(); - mutex_lock(&cache_chain_mutex); + mutex_lock(&slab_mutex); ret = __cache_shrink(cachep); - mutex_unlock(&cache_chain_mutex); + mutex_unlock(&slab_mutex); put_online_cpus(); return ret; } @@ -2736,15 +2631,15 @@ void kmem_cache_destroy(struct kmem_cache *cachep) /* Find the cache in the chain of caches. 
*/ get_online_cpus(); - mutex_lock(&cache_chain_mutex); + mutex_lock(&slab_mutex); /* * the chain is never empty, cache_cache is never destroyed */ - list_del(&cachep->next); + list_del(&cachep->list); if (__cache_shrink(cachep)) { slab_error(cachep, "Can't free all objects"); - list_add(&cachep->next, &cache_chain); - mutex_unlock(&cache_chain_mutex); + list_add(&cachep->list, &slab_caches); + mutex_unlock(&slab_mutex); put_online_cpus(); return; } @@ -2753,7 +2648,7 @@ void kmem_cache_destroy(struct kmem_cache *cachep) rcu_barrier(); __kmem_cache_destroy(cachep); - mutex_unlock(&cache_chain_mutex); + mutex_unlock(&slab_mutex); put_online_cpus(); } EXPORT_SYMBOL(kmem_cache_destroy); @@ -2840,10 +2735,10 @@ static void cache_init_objs(struct kmem_cache *cachep, slab_error(cachep, "constructor overwrote the" " start of an object"); } - if ((cachep->buffer_size % PAGE_SIZE) == 0 && + if ((cachep->size % PAGE_SIZE) == 0 && OFF_SLAB(cachep) && cachep->flags & SLAB_POISON) kernel_map_pages(virt_to_page(objp), - cachep->buffer_size / PAGE_SIZE, 0); + cachep->size / PAGE_SIZE, 0); #else if (cachep->ctor) cachep->ctor(objp); @@ -2857,9 +2752,9 @@ static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags) { if (CONFIG_ZONE_DMA_FLAG) { if (flags & GFP_DMA) - BUG_ON(!(cachep->gfpflags & GFP_DMA)); + BUG_ON(!(cachep->allocflags & GFP_DMA)); else - BUG_ON(cachep->gfpflags & GFP_DMA); + BUG_ON(cachep->allocflags & GFP_DMA); } } @@ -2918,8 +2813,8 @@ static void slab_map_pages(struct kmem_cache *cache, struct slab *slab, nr_pages <<= cache->gfporder; do { - page_set_cache(page, cache); - page_set_slab(page, slab); + page->slab_cache = cache; + page->slab_page = slab; page++; } while (--nr_pages); } @@ -3057,7 +2952,7 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, kfree_debugcheck(objp); page = virt_to_head_page(objp); - slabp = page_get_slab(page); + slabp = page->slab_page; if (cachep->flags & SLAB_RED_ZONE) { verify_redzone_free(cachep, 
objp); @@ -3077,10 +2972,10 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, #endif if (cachep->flags & SLAB_POISON) { #ifdef CONFIG_DEBUG_PAGEALLOC - if ((cachep->buffer_size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) { + if ((cachep->size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) { store_stackinfo(cachep, objp, (unsigned long)caller); kernel_map_pages(virt_to_page(objp), - cachep->buffer_size / PAGE_SIZE, 0); + cachep->size / PAGE_SIZE, 0); } else { poison_obj(cachep, objp, POISON_FREE); } @@ -3230,9 +3125,9 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, return objp; if (cachep->flags & SLAB_POISON) { #ifdef CONFIG_DEBUG_PAGEALLOC - if ((cachep->buffer_size % PAGE_SIZE) == 0 && OFF_SLAB(cachep)) + if ((cachep->size % PAGE_SIZE) == 0 && OFF_SLAB(cachep)) kernel_map_pages(virt_to_page(objp), - cachep->buffer_size / PAGE_SIZE, 1); + cachep->size / PAGE_SIZE, 1); else check_poison_obj(cachep, objp); #else @@ -3261,8 +3156,8 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, struct slab *slabp; unsigned objnr; - slabp = page_get_slab(virt_to_head_page(objp)); - objnr = (unsigned)(objp - slabp->s_mem) / cachep->buffer_size; + slabp = virt_to_head_page(objp)->slab_page; + objnr = (unsigned)(objp - slabp->s_mem) / cachep->size; slab_bufctl(slabp)[objnr] = BUFCTL_ACTIVE; } #endif @@ -3285,7 +3180,7 @@ static bool slab_should_failslab(struct kmem_cache *cachep, gfp_t flags) if (cachep == &cache_cache) return false; - return should_failslab(obj_size(cachep), flags, cachep->flags); + return should_failslab(cachep->object_size, flags, cachep->flags); } static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags) @@ -3336,7 +3231,7 @@ static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags) if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD)) nid_alloc = cpuset_slab_spread_node(); else if (current->mempolicy) - nid_alloc = slab_node(current->mempolicy); + nid_alloc = 
slab_node(); if (nid_alloc != nid_here) return ____cache_alloc_node(cachep, flags, nid_alloc); return NULL; @@ -3368,7 +3263,7 @@ static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags) retry_cpuset: cpuset_mems_cookie = get_mems_allowed(); - zonelist = node_zonelist(slab_node(current->mempolicy), flags); + zonelist = node_zonelist(slab_node(), flags); retry: /* @@ -3545,14 +3440,14 @@ __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, out: local_irq_restore(save_flags); ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller); - kmemleak_alloc_recursive(ptr, obj_size(cachep), 1, cachep->flags, + kmemleak_alloc_recursive(ptr, cachep->object_size, 1, cachep->flags, flags); if (likely(ptr)) - kmemcheck_slab_alloc(cachep, flags, ptr, obj_size(cachep)); + kmemcheck_slab_alloc(cachep, flags, ptr, cachep->object_size); if (unlikely((flags & __GFP_ZERO) && ptr)) - memset(ptr, 0, obj_size(cachep)); + memset(ptr, 0, cachep->object_size); return ptr; } @@ -3607,15 +3502,15 @@ __cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller) objp = __do_cache_alloc(cachep, flags); local_irq_restore(save_flags); objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller); - kmemleak_alloc_recursive(objp, obj_size(cachep), 1, cachep->flags, + kmemleak_alloc_recursive(objp, cachep->object_size, 1, cachep->flags, flags); prefetchw(objp); if (likely(objp)) - kmemcheck_slab_alloc(cachep, flags, objp, obj_size(cachep)); + kmemcheck_slab_alloc(cachep, flags, objp, cachep->object_size); if (unlikely((flags & __GFP_ZERO) && objp)) - memset(objp, 0, obj_size(cachep)); + memset(objp, 0, cachep->object_size); return objp; } @@ -3731,7 +3626,7 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp, kmemleak_free_recursive(objp, cachep->flags); objp = cache_free_debugcheck(cachep, objp, caller); - kmemcheck_slab_free(cachep, objp, obj_size(cachep)); + kmemcheck_slab_free(cachep, objp, cachep->object_size); /* * Skip calling 
cache_free_alien() when the platform is not numa. @@ -3766,7 +3661,7 @@ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) void *ret = __cache_alloc(cachep, flags, __builtin_return_address(0)); trace_kmem_cache_alloc(_RET_IP_, ret, - obj_size(cachep), cachep->buffer_size, flags); + cachep->object_size, cachep->size, flags); return ret; } @@ -3794,7 +3689,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) __builtin_return_address(0)); trace_kmem_cache_alloc_node(_RET_IP_, ret, - obj_size(cachep), cachep->buffer_size, + cachep->object_size, cachep->size, flags, nodeid); return ret; @@ -3876,7 +3771,7 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags, ret = __cache_alloc(cachep, flags, caller); trace_kmalloc((unsigned long) caller, ret, - size, cachep->buffer_size, flags); + size, cachep->size, flags); return ret; } @@ -3916,9 +3811,9 @@ void kmem_cache_free(struct kmem_cache *cachep, void *objp) unsigned long flags; local_irq_save(flags); - debug_check_no_locks_freed(objp, obj_size(cachep)); + debug_check_no_locks_freed(objp, cachep->object_size); if (!(cachep->flags & SLAB_DEBUG_OBJECTS)) - debug_check_no_obj_freed(objp, obj_size(cachep)); + debug_check_no_obj_freed(objp, cachep->object_size); __cache_free(cachep, objp, __builtin_return_address(0)); local_irq_restore(flags); @@ -3947,8 +3842,9 @@ void kfree(const void *objp) local_irq_save(flags); kfree_debugcheck(objp); c = virt_to_cache(objp); - debug_check_no_locks_freed(objp, obj_size(c)); - debug_check_no_obj_freed(objp, obj_size(c)); + debug_check_no_locks_freed(objp, c->object_size); + + debug_check_no_obj_freed(objp, c->object_size); __cache_free(c, (void *)objp, __builtin_return_address(0)); local_irq_restore(flags); } @@ -3956,7 +3852,7 @@ EXPORT_SYMBOL(kfree); unsigned int kmem_cache_size(struct kmem_cache *cachep) { - return obj_size(cachep); + return cachep->object_size; } EXPORT_SYMBOL(kmem_cache_size); @@ -4030,7 +3926,7 @@ static int 
alloc_kmemlist(struct kmem_cache *cachep, gfp_t gfp) return 0; fail: - if (!cachep->next.next) { + if (!cachep->list.next) { /* Cache is not active yet. Roll back what we did */ node--; while (node >= 0) { @@ -4065,7 +3961,7 @@ static void do_ccupdate_local(void *info) new->new[smp_processor_id()] = old; } -/* Always called with the cache_chain_mutex held */ +/* Always called with the slab_mutex held */ static int do_tune_cpucache(struct kmem_cache *cachep, int limit, int batchcount, int shared, gfp_t gfp) { @@ -4109,7 +4005,7 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit, return alloc_kmemlist(cachep, gfp); } -/* Called with cache_chain_mutex held always */ +/* Called with slab_mutex held always */ static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp) { int err; @@ -4124,13 +4020,13 @@ static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp) * The numbers are guessed, we should auto-tune as described by * Bonwick. */ - if (cachep->buffer_size > 131072) + if (cachep->size > 131072) limit = 1; - else if (cachep->buffer_size > PAGE_SIZE) + else if (cachep->size > PAGE_SIZE) limit = 8; - else if (cachep->buffer_size > 1024) + else if (cachep->size > 1024) limit = 24; - else if (cachep->buffer_size > 256) + else if (cachep->size > 256) limit = 54; else limit = 120; @@ -4145,7 +4041,7 @@ static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp) * to a larger limit. Thus disabled by default. */ shared = 0; - if (cachep->buffer_size <= PAGE_SIZE && num_possible_cpus() > 1) + if (cachep->size <= PAGE_SIZE && num_possible_cpus() > 1) shared = 8; #if DEBUG @@ -4211,11 +4107,11 @@ static void cache_reap(struct work_struct *w) int node = numa_mem_id(); struct delayed_work *work = to_delayed_work(w); - if (!mutex_trylock(&cache_chain_mutex)) + if (!mutex_trylock(&slab_mutex)) /* Give up. Setup the next iteration. 
*/ goto out; - list_for_each_entry(searchp, &cache_chain, next) { + list_for_each_entry(searchp, &slab_caches, list) { check_irq_on(); /* @@ -4253,7 +4149,7 @@ next: cond_resched(); } check_irq_on(); - mutex_unlock(&cache_chain_mutex); + mutex_unlock(&slab_mutex); next_reap_node(); out: /* Set up the next iteration */ @@ -4289,26 +4185,26 @@ static void *s_start(struct seq_file *m, loff_t *pos) { loff_t n = *pos; - mutex_lock(&cache_chain_mutex); + mutex_lock(&slab_mutex); if (!n) print_slabinfo_header(m); - return seq_list_start(&cache_chain, *pos); + return seq_list_start(&slab_caches, *pos); } static void *s_next(struct seq_file *m, void *p, loff_t *pos) { - return seq_list_next(p, &cache_chain, pos); + return seq_list_next(p, &slab_caches, pos); } static void s_stop(struct seq_file *m, void *p) { - mutex_unlock(&cache_chain_mutex); + mutex_unlock(&slab_mutex); } static int s_show(struct seq_file *m, void *p) { - struct kmem_cache *cachep = list_entry(p, struct kmem_cache, next); + struct kmem_cache *cachep = list_entry(p, struct kmem_cache, list); struct slab *slabp; unsigned long active_objs; unsigned long num_objs; @@ -4364,7 +4260,7 @@ static int s_show(struct seq_file *m, void *p) printk(KERN_ERR "slab: cache %s error: %s\n", name, error); seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d", - name, active_objs, num_objs, cachep->buffer_size, + name, active_objs, num_objs, cachep->size, cachep->num, (1 << cachep->gfporder)); seq_printf(m, " : tunables %4u %4u %4u", cachep->limit, cachep->batchcount, cachep->shared); @@ -4454,9 +4350,9 @@ static ssize_t slabinfo_write(struct file *file, const char __user *buffer, return -EINVAL; /* Find the cache in the chain of caches. 
*/ - mutex_lock(&cache_chain_mutex); + mutex_lock(&slab_mutex); res = -EINVAL; - list_for_each_entry(cachep, &cache_chain, next) { + list_for_each_entry(cachep, &slab_caches, list) { if (!strcmp(cachep->name, kbuf)) { if (limit < 1 || batchcount < 1 || batchcount > limit || shared < 0) { @@ -4469,7 +4365,7 @@ static ssize_t slabinfo_write(struct file *file, const char __user *buffer, break; } } - mutex_unlock(&cache_chain_mutex); + mutex_unlock(&slab_mutex); if (res >= 0) res = count; return res; @@ -4492,8 +4388,8 @@ static const struct file_operations proc_slabinfo_operations = { static void *leaks_start(struct seq_file *m, loff_t *pos) { - mutex_lock(&cache_chain_mutex); - return seq_list_start(&cache_chain, *pos); + mutex_lock(&slab_mutex); + return seq_list_start(&slab_caches, *pos); } static inline int add_caller(unsigned long *n, unsigned long v) @@ -4532,7 +4428,7 @@ static void handle_slab(unsigned long *n, struct kmem_cache *c, struct slab *s) int i; if (n[0] == n[1]) return; - for (i = 0, p = s->s_mem; i < c->num; i++, p += c->buffer_size) { + for (i = 0, p = s->s_mem; i < c->num; i++, p += c->size) { if (slab_bufctl(s)[i] != BUFCTL_ACTIVE) continue; if (!add_caller(n, (unsigned long)*dbg_userword(c, p))) @@ -4558,7 +4454,7 @@ static void show_symbol(struct seq_file *m, unsigned long address) static int leaks_show(struct seq_file *m, void *p) { - struct kmem_cache *cachep = list_entry(p, struct kmem_cache, next); + struct kmem_cache *cachep = list_entry(p, struct kmem_cache, list); struct slab *slabp; struct kmem_list3 *l3; const char *name; @@ -4592,17 +4488,17 @@ static int leaks_show(struct seq_file *m, void *p) name = cachep->name; if (n[0] == n[1]) { /* Increase the buffer size */ - mutex_unlock(&cache_chain_mutex); + mutex_unlock(&slab_mutex); m->private = kzalloc(n[0] * 4 * sizeof(unsigned long), GFP_KERNEL); if (!m->private) { /* Too bad, we are really out */ m->private = n; - mutex_lock(&cache_chain_mutex); + mutex_lock(&slab_mutex); return 
-ENOMEM; } *(unsigned long *)m->private = n[0] * 2; kfree(n); - mutex_lock(&cache_chain_mutex); + mutex_lock(&slab_mutex); /* Now make sure this entry will be retried */ m->count = m->size; return 0; @@ -4677,6 +4573,6 @@ size_t ksize(const void *objp) if (unlikely(objp == ZERO_SIZE_PTR)) return 0; - return obj_size(virt_to_cache(objp)); + return virt_to_cache(objp)->object_size; } EXPORT_SYMBOL(ksize); diff --git a/mm/slab.h b/mm/slab.h new file mode 100644 index 000000000000..db7848caaa25 --- /dev/null +++ b/mm/slab.h @@ -0,0 +1,33 @@ +#ifndef MM_SLAB_H +#define MM_SLAB_H +/* + * Internal slab definitions + */ + +/* + * State of the slab allocator. + * + * This is used to describe the states of the allocator during bootup. + * Allocators use this to gradually bootstrap themselves. Most allocators + * have the problem that the structures used for managing slab caches are + * allocated from slab caches themselves. + */ +enum slab_state { + DOWN, /* No slab functionality yet */ + PARTIAL, /* SLUB: kmem_cache_node available */ + PARTIAL_ARRAYCACHE, /* SLAB: kmalloc size for arraycache available */ + PARTIAL_L3, /* SLAB: kmalloc size for l3 struct available */ + UP, /* Slab caches usable but not all extras yet */ + FULL /* Everything is working */ +}; + +extern enum slab_state slab_state; + +/* The slab cache mutex protects the management structures during changes */ +extern struct mutex slab_mutex; +extern struct list_head slab_caches; + +struct kmem_cache *__kmem_cache_create(const char *name, size_t size, + size_t align, unsigned long flags, void (*ctor)(void *)); + +#endif diff --git a/mm/slab_common.c b/mm/slab_common.c new file mode 100644 index 000000000000..aa3ca5bb01b5 --- /dev/null +++ b/mm/slab_common.c @@ -0,0 +1,120 @@ +/* + * Slab allocator functions that are independent of the allocator strategy + * + * (C) 2012 Christoph Lameter <cl@linux.com> + */ +#include <linux/slab.h> + +#include <linux/mm.h> +#include <linux/poison.h> +#include 
<linux/interrupt.h> +#include <linux/memory.h> +#include <linux/compiler.h> +#include <linux/module.h> +#include <linux/cpu.h> +#include <linux/uaccess.h> +#include <asm/cacheflush.h> +#include <asm/tlbflush.h> +#include <asm/page.h> + +#include "slab.h" + +enum slab_state slab_state; +LIST_HEAD(slab_caches); +DEFINE_MUTEX(slab_mutex); + +/* + * kmem_cache_create - Create a cache. + * @name: A string which is used in /proc/slabinfo to identify this cache. + * @size: The size of objects to be created in this cache. + * @align: The required alignment for the objects. + * @flags: SLAB flags + * @ctor: A constructor for the objects. + * + * Returns a ptr to the cache on success, NULL on failure. + * Cannot be called within a interrupt, but can be interrupted. + * The @ctor is run when new pages are allocated by the cache. + * + * The flags are + * + * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5) + * to catch references to uninitialised memory. + * + * %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check + * for buffer overruns. + * + * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware + * cacheline. This can be beneficial if you're counting cycles as closely + * as davem. + */ + +struct kmem_cache *kmem_cache_create(const char *name, size_t size, size_t align, + unsigned long flags, void (*ctor)(void *)) +{ + struct kmem_cache *s = NULL; + +#ifdef CONFIG_DEBUG_VM + if (!name || in_interrupt() || size < sizeof(void *) || + size > KMALLOC_MAX_SIZE) { + printk(KERN_ERR "kmem_cache_create(%s) integrity check" + " failed\n", name); + goto out; + } +#endif + + get_online_cpus(); + mutex_lock(&slab_mutex); + +#ifdef CONFIG_DEBUG_VM + list_for_each_entry(s, &slab_caches, list) { + char tmp; + int res; + + /* + * This happens when the module gets unloaded and doesn't + * destroy its slab cache and no-one else reuses the vmalloc + * area of the module. Print a warning. 
+ */ + res = probe_kernel_address(s->name, tmp); + if (res) { + printk(KERN_ERR + "Slab cache with size %d has lost its name\n", + s->object_size); + continue; + } + + if (!strcmp(s->name, name)) { + printk(KERN_ERR "kmem_cache_create(%s): Cache name" + " already exists.\n", + name); + dump_stack(); + s = NULL; + goto oops; + } + } + + WARN_ON(strchr(name, ' ')); /* It confuses parsers */ +#endif + + s = __kmem_cache_create(name, size, align, flags, ctor); + +#ifdef CONFIG_DEBUG_VM +oops: +#endif + mutex_unlock(&slab_mutex); + put_online_cpus(); + +#ifdef CONFIG_DEBUG_VM +out: +#endif + if (!s && (flags & SLAB_PANIC)) + panic("kmem_cache_create: Failed to create slab '%s'\n", name); + + return s; +} +EXPORT_SYMBOL(kmem_cache_create); + +int slab_is_available(void) +{ + return slab_state >= UP; +} diff --git a/mm/slob.c b/mm/slob.c index 8105be42cad1..45d4ca79933a 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -59,6 +59,8 @@ #include <linux/kernel.h> #include <linux/slab.h> +#include "slab.h" + #include <linux/mm.h> #include <linux/swap.h> /* struct reclaim_state */ #include <linux/cache.h> @@ -92,36 +94,6 @@ struct slob_block { typedef struct slob_block slob_t; /* - * We use struct page fields to manage some slob allocation aspects, - * however to avoid the horrible mess in include/linux/mm_types.h, we'll - * just define our own struct page type variant here. - */ -struct slob_page { - union { - struct { - unsigned long flags; /* mandatory */ - atomic_t _count; /* mandatory */ - slobidx_t units; /* free units left in page */ - unsigned long pad[2]; - slob_t *free; /* first free slob_t in page */ - struct list_head list; /* linked list of free pages */ - }; - struct page page; - }; -}; -static inline void struct_slob_page_wrong_size(void) -{ BUILD_BUG_ON(sizeof(struct slob_page) != sizeof(struct page)); } - -/* - * free_slob_page: call before a slob_page is returned to the page allocator. 
- */ -static inline void free_slob_page(struct slob_page *sp) -{ - reset_page_mapcount(&sp->page); - sp->page.mapping = NULL; -} - -/* * All partially free slob pages go on these lists. */ #define SLOB_BREAK1 256 @@ -131,46 +103,23 @@ static LIST_HEAD(free_slob_medium); static LIST_HEAD(free_slob_large); /* - * is_slob_page: True for all slob pages (false for bigblock pages) - */ -static inline int is_slob_page(struct slob_page *sp) -{ - return PageSlab((struct page *)sp); -} - -static inline void set_slob_page(struct slob_page *sp) -{ - __SetPageSlab((struct page *)sp); -} - -static inline void clear_slob_page(struct slob_page *sp) -{ - __ClearPageSlab((struct page *)sp); -} - -static inline struct slob_page *slob_page(const void *addr) -{ - return (struct slob_page *)virt_to_page(addr); -} - -/* * slob_page_free: true for pages on free_slob_pages list. */ -static inline int slob_page_free(struct slob_page *sp) +static inline int slob_page_free(struct page *sp) { - return PageSlobFree((struct page *)sp); + return PageSlobFree(sp); } -static void set_slob_page_free(struct slob_page *sp, struct list_head *list) +static void set_slob_page_free(struct page *sp, struct list_head *list) { list_add(&sp->list, list); - __SetPageSlobFree((struct page *)sp); + __SetPageSlobFree(sp); } -static inline void clear_slob_page_free(struct slob_page *sp) +static inline void clear_slob_page_free(struct page *sp) { list_del(&sp->list); - __ClearPageSlobFree((struct page *)sp); + __ClearPageSlobFree(sp); } #define SLOB_UNIT sizeof(slob_t) @@ -267,12 +216,12 @@ static void slob_free_pages(void *b, int order) /* * Allocate a slob block within a given slob_page sp. 
*/ -static void *slob_page_alloc(struct slob_page *sp, size_t size, int align) +static void *slob_page_alloc(struct page *sp, size_t size, int align) { slob_t *prev, *cur, *aligned = NULL; int delta = 0, units = SLOB_UNITS(size); - for (prev = NULL, cur = sp->free; ; prev = cur, cur = slob_next(cur)) { + for (prev = NULL, cur = sp->freelist; ; prev = cur, cur = slob_next(cur)) { slobidx_t avail = slob_units(cur); if (align) { @@ -296,12 +245,12 @@ static void *slob_page_alloc(struct slob_page *sp, size_t size, int align) if (prev) set_slob(prev, slob_units(prev), next); else - sp->free = next; + sp->freelist = next; } else { /* fragment */ if (prev) set_slob(prev, slob_units(prev), cur + units); else - sp->free = cur + units; + sp->freelist = cur + units; set_slob(cur + units, avail - units, next); } @@ -320,7 +269,7 @@ static void *slob_page_alloc(struct slob_page *sp, size_t size, int align) */ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node) { - struct slob_page *sp; + struct page *sp; struct list_head *prev; struct list_head *slob_list; slob_t *b = NULL; @@ -341,7 +290,7 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node) * If there's a node specification, search for a partial * page with a matching node id in the freelist. */ - if (node != -1 && page_to_nid(&sp->page) != node) + if (node != -1 && page_to_nid(sp) != node) continue; #endif /* Enough room on this page? 
*/ @@ -369,12 +318,12 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node) b = slob_new_pages(gfp & ~__GFP_ZERO, 0, node); if (!b) return NULL; - sp = slob_page(b); - set_slob_page(sp); + sp = virt_to_page(b); + __SetPageSlab(sp); spin_lock_irqsave(&slob_lock, flags); sp->units = SLOB_UNITS(PAGE_SIZE); - sp->free = b; + sp->freelist = b; INIT_LIST_HEAD(&sp->list); set_slob(b, SLOB_UNITS(PAGE_SIZE), b + SLOB_UNITS(PAGE_SIZE)); set_slob_page_free(sp, slob_list); @@ -392,7 +341,7 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node) */ static void slob_free(void *block, int size) { - struct slob_page *sp; + struct page *sp; slob_t *prev, *next, *b = (slob_t *)block; slobidx_t units; unsigned long flags; @@ -402,7 +351,7 @@ static void slob_free(void *block, int size) return; BUG_ON(!size); - sp = slob_page(block); + sp = virt_to_page(block); units = SLOB_UNITS(size); spin_lock_irqsave(&slob_lock, flags); @@ -412,8 +361,8 @@ static void slob_free(void *block, int size) if (slob_page_free(sp)) clear_slob_page_free(sp); spin_unlock_irqrestore(&slob_lock, flags); - clear_slob_page(sp); - free_slob_page(sp); + __ClearPageSlab(sp); + reset_page_mapcount(sp); slob_free_pages(b, 0); return; } @@ -421,7 +370,7 @@ static void slob_free(void *block, int size) if (!slob_page_free(sp)) { /* This slob page is about to become partially free. Easy! 
*/ sp->units = units; - sp->free = b; + sp->freelist = b; set_slob(b, units, (void *)((unsigned long)(b + SLOB_UNITS(PAGE_SIZE)) & PAGE_MASK)); @@ -441,15 +390,15 @@ static void slob_free(void *block, int size) */ sp->units += units; - if (b < sp->free) { - if (b + units == sp->free) { - units += slob_units(sp->free); - sp->free = slob_next(sp->free); + if (b < (slob_t *)sp->freelist) { + if (b + units == sp->freelist) { + units += slob_units(sp->freelist); + sp->freelist = slob_next(sp->freelist); } - set_slob(b, units, sp->free); - sp->free = b; + set_slob(b, units, sp->freelist); + sp->freelist = b; } else { - prev = sp->free; + prev = sp->freelist; next = slob_next(prev); while (b > next) { prev = next; @@ -522,7 +471,7 @@ EXPORT_SYMBOL(__kmalloc_node); void kfree(const void *block) { - struct slob_page *sp; + struct page *sp; trace_kfree(_RET_IP_, block); @@ -530,43 +479,36 @@ void kfree(const void *block) return; kmemleak_free(block); - sp = slob_page(block); - if (is_slob_page(sp)) { + sp = virt_to_page(block); + if (PageSlab(sp)) { int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); unsigned int *m = (unsigned int *)(block - align); slob_free(m, *m + align); } else - put_page(&sp->page); + put_page(sp); } EXPORT_SYMBOL(kfree); /* can't use ksize for kmem_cache_alloc memory, only kmalloc */ size_t ksize(const void *block) { - struct slob_page *sp; + struct page *sp; BUG_ON(!block); if (unlikely(block == ZERO_SIZE_PTR)) return 0; - sp = slob_page(block); - if (is_slob_page(sp)) { + sp = virt_to_page(block); + if (PageSlab(sp)) { int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); unsigned int *m = (unsigned int *)(block - align); return SLOB_UNITS(*m) * SLOB_UNIT; } else - return sp->page.private; + return sp->private; } EXPORT_SYMBOL(ksize); -struct kmem_cache { - unsigned int size, align; - unsigned long flags; - const char *name; - void (*ctor)(void *); -}; - -struct kmem_cache *kmem_cache_create(const char *name, size_t size, +struct 
kmem_cache *__kmem_cache_create(const char *name, size_t size, size_t align, unsigned long flags, void (*ctor)(void *)) { struct kmem_cache *c; @@ -589,13 +531,12 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, c->align = ARCH_SLAB_MINALIGN; if (c->align < align) c->align = align; - } else if (flags & SLAB_PANIC) - panic("Cannot create slab cache %s\n", name); - kmemleak_alloc(c, sizeof(struct kmem_cache), 1, GFP_KERNEL); + kmemleak_alloc(c, sizeof(struct kmem_cache), 1, GFP_KERNEL); + c->refcount = 1; + } return c; } -EXPORT_SYMBOL(kmem_cache_create); void kmem_cache_destroy(struct kmem_cache *c) { @@ -678,19 +619,12 @@ int kmem_cache_shrink(struct kmem_cache *d) } EXPORT_SYMBOL(kmem_cache_shrink); -static unsigned int slob_ready __read_mostly; - -int slab_is_available(void) -{ - return slob_ready; -} - void __init kmem_cache_init(void) { - slob_ready = 1; + slab_state = UP; } void __init kmem_cache_init_late(void) { - /* Nothing to do */ + slab_state = FULL; } diff --git a/mm/slub.c b/mm/slub.c index ffe13fdf8144..e517d435e5dc 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -16,6 +16,7 @@ #include <linux/interrupt.h> #include <linux/bitops.h> #include <linux/slab.h> +#include "slab.h" #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/kmemcheck.h> @@ -35,13 +36,13 @@ /* * Lock order: - * 1. slub_lock (Global Semaphore) + * 1. slab_mutex (Global Mutex) * 2. node->list_lock * 3. slab_lock(page) (Only on some arches and for debugging) * - * slub_lock + * slab_mutex * - * The role of the slub_lock is to protect the list of all the slabs + * The role of the slab_mutex is to protect the list of all the slabs * and to synchronize major metadata changes to slab cache structures. 
* * The slab_lock is only used for debugging and on arches that do not @@ -182,17 +183,6 @@ static int kmem_size = sizeof(struct kmem_cache); static struct notifier_block slab_notifier; #endif -static enum { - DOWN, /* No slab functionality available */ - PARTIAL, /* Kmem_cache_node works */ - UP, /* Everything works but does not show up in sysfs */ - SYSFS /* Sysfs up */ -} slab_state = DOWN; - -/* A list of all slab caches on the system */ -static DECLARE_RWSEM(slub_lock); -static LIST_HEAD(slab_caches); - /* * Tracking user of a slab. */ @@ -237,11 +227,6 @@ static inline void stat(const struct kmem_cache *s, enum stat_item si) * Core slab cache functions *******************************************************************/ -int slab_is_available(void) -{ - return slab_state >= UP; -} - static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) { return s->node[node]; @@ -311,7 +296,7 @@ static inline size_t slab_ksize(const struct kmem_cache *s) * and whatever may come after it. 
*/ if (s->flags & (SLAB_RED_ZONE | SLAB_POISON)) - return s->objsize; + return s->object_size; #endif /* @@ -609,11 +594,11 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) if (p > addr + 16) print_section("Bytes b4 ", p - 16, 16); - print_section("Object ", p, min_t(unsigned long, s->objsize, + print_section("Object ", p, min_t(unsigned long, s->object_size, PAGE_SIZE)); if (s->flags & SLAB_RED_ZONE) - print_section("Redzone ", p + s->objsize, - s->inuse - s->objsize); + print_section("Redzone ", p + s->object_size, + s->inuse - s->object_size); if (s->offset) off = s->offset + sizeof(void *); @@ -655,12 +640,12 @@ static void init_object(struct kmem_cache *s, void *object, u8 val) u8 *p = object; if (s->flags & __OBJECT_POISON) { - memset(p, POISON_FREE, s->objsize - 1); - p[s->objsize - 1] = POISON_END; + memset(p, POISON_FREE, s->object_size - 1); + p[s->object_size - 1] = POISON_END; } if (s->flags & SLAB_RED_ZONE) - memset(p + s->objsize, val, s->inuse - s->objsize); + memset(p + s->object_size, val, s->inuse - s->object_size); } static void restore_bytes(struct kmem_cache *s, char *message, u8 data, @@ -705,10 +690,10 @@ static int check_bytes_and_report(struct kmem_cache *s, struct page *page, * Poisoning uses 0x6b (POISON_FREE) and the last byte is * 0xa5 (POISON_END) * - * object + s->objsize + * object + s->object_size * Padding to reach word boundary. This is also used for Redzoning. * Padding is extended by another word if Redzoning is enabled and - * objsize == inuse. + * object_size == inuse. * * We fill with 0xbb (RED_INACTIVE) for inactive objects and with * 0xcc (RED_ACTIVE) for objects in use. @@ -727,7 +712,7 @@ static int check_bytes_and_report(struct kmem_cache *s, struct page *page, * object + s->size * Nothing is used beyond s->size. * - * If slabcaches are merged then the objsize and inuse boundaries are mostly + * If slabcaches are merged then the object_size and inuse boundaries are mostly * ignored. 
And therefore no slab options that rely on these boundaries * may be used with merged slabcaches. */ @@ -787,25 +772,25 @@ static int check_object(struct kmem_cache *s, struct page *page, void *object, u8 val) { u8 *p = object; - u8 *endobject = object + s->objsize; + u8 *endobject = object + s->object_size; if (s->flags & SLAB_RED_ZONE) { if (!check_bytes_and_report(s, page, object, "Redzone", - endobject, val, s->inuse - s->objsize)) + endobject, val, s->inuse - s->object_size)) return 0; } else { - if ((s->flags & SLAB_POISON) && s->objsize < s->inuse) { + if ((s->flags & SLAB_POISON) && s->object_size < s->inuse) { check_bytes_and_report(s, page, p, "Alignment padding", - endobject, POISON_INUSE, s->inuse - s->objsize); + endobject, POISON_INUSE, s->inuse - s->object_size); } } if (s->flags & SLAB_POISON) { if (val != SLUB_RED_ACTIVE && (s->flags & __OBJECT_POISON) && (!check_bytes_and_report(s, page, p, "Poison", p, - POISON_FREE, s->objsize - 1) || + POISON_FREE, s->object_size - 1) || !check_bytes_and_report(s, page, p, "Poison", - p + s->objsize - 1, POISON_END, 1))) + p + s->object_size - 1, POISON_END, 1))) return 0; /* * check_pad_bytes cleans up on its own. 
@@ -926,7 +911,7 @@ static void trace(struct kmem_cache *s, struct page *page, void *object, page->freelist); if (!alloc) - print_section("Object ", (void *)object, s->objsize); + print_section("Object ", (void *)object, s->object_size); dump_stack(); } @@ -942,14 +927,14 @@ static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags) lockdep_trace_alloc(flags); might_sleep_if(flags & __GFP_WAIT); - return should_failslab(s->objsize, flags, s->flags); + return should_failslab(s->object_size, flags, s->flags); } static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, void *object) { flags &= gfp_allowed_mask; kmemcheck_slab_alloc(s, flags, object, slab_ksize(s)); - kmemleak_alloc_recursive(object, s->objsize, 1, s->flags, flags); + kmemleak_alloc_recursive(object, s->object_size, 1, s->flags, flags); } static inline void slab_free_hook(struct kmem_cache *s, void *x) @@ -966,13 +951,13 @@ static inline void slab_free_hook(struct kmem_cache *s, void *x) unsigned long flags; local_irq_save(flags); - kmemcheck_slab_free(s, x, s->objsize); - debug_check_no_locks_freed(x, s->objsize); + kmemcheck_slab_free(s, x, s->object_size); + debug_check_no_locks_freed(x, s->object_size); local_irq_restore(flags); } #endif if (!(s->flags & SLAB_DEBUG_OBJECTS)) - debug_check_no_obj_freed(x, s->objsize); + debug_check_no_obj_freed(x, s->object_size); } /* @@ -1207,7 +1192,7 @@ out: __setup("slub_debug", setup_slub_debug); -static unsigned long kmem_cache_flags(unsigned long objsize, +static unsigned long kmem_cache_flags(unsigned long object_size, unsigned long flags, const char *name, void (*ctor)(void *)) { @@ -1237,7 +1222,7 @@ static inline int check_object(struct kmem_cache *s, struct page *page, static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n, struct page *page) {} static inline void remove_full(struct kmem_cache *s, struct page *page) {} -static inline unsigned long kmem_cache_flags(unsigned long objsize, +static inline 
unsigned long kmem_cache_flags(unsigned long object_size, unsigned long flags, const char *name, void (*ctor)(void *)) { @@ -1314,13 +1299,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) stat(s, ORDER_FALLBACK); } - if (flags & __GFP_WAIT) - local_irq_disable(); - - if (!page) - return NULL; - - if (kmemcheck_enabled + if (kmemcheck_enabled && page && !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) { int pages = 1 << oo_order(oo); @@ -1336,6 +1315,11 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) kmemcheck_mark_unallocated_pages(page, pages); } + if (flags & __GFP_WAIT) + local_irq_disable(); + if (!page) + return NULL; + page->objects = oo_objects(oo); mod_zone_page_state(page_zone(page), (s->flags & SLAB_RECLAIM_ACCOUNT) ? @@ -1369,7 +1353,7 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) inc_slabs_node(s, page_to_nid(page), page->objects); page->slab = s; - page->flags |= 1 << PG_slab; + __SetPageSlab(page); start = page_address(page); @@ -1490,12 +1474,12 @@ static inline void remove_partial(struct kmem_cache_node *n, } /* - * Lock slab, remove from the partial list and put the object into the - * per cpu freelist. + * Remove slab from the partial list, freeze it and + * return the pointer to the freelist. * * Returns a list of objects or NULL if it fails. * - * Must hold list_lock. + * Must hold list_lock since we modify the partial list. */ static inline void *acquire_slab(struct kmem_cache *s, struct kmem_cache_node *n, struct page *page, @@ -1510,22 +1494,27 @@ static inline void *acquire_slab(struct kmem_cache *s, * The old freelist is the list of objects for the * per cpu allocation list. 
*/ - do { - freelist = page->freelist; - counters = page->counters; - new.counters = counters; - if (mode) - new.inuse = page->objects; + freelist = page->freelist; + counters = page->counters; + new.counters = counters; + if (mode) { + new.inuse = page->objects; + new.freelist = NULL; + } else { + new.freelist = freelist; + } - VM_BUG_ON(new.frozen); - new.frozen = 1; + VM_BUG_ON(new.frozen); + new.frozen = 1; - } while (!__cmpxchg_double_slab(s, page, + if (!__cmpxchg_double_slab(s, page, freelist, counters, - NULL, new.counters, - "lock and freeze")); + new.freelist, new.counters, + "acquire_slab")) + return NULL; remove_partial(n, page); + WARN_ON(!freelist); return freelist; } @@ -1559,12 +1548,10 @@ static void *get_partial_node(struct kmem_cache *s, if (!object) { c->page = page; - c->node = page_to_nid(page); stat(s, ALLOC_FROM_PARTIAL); object = t; available = page->objects - page->inuse; } else { - page->freelist = t; available = put_cpu_partial(s, page, 0); stat(s, CPU_PARTIAL_NODE); } @@ -1579,7 +1566,7 @@ static void *get_partial_node(struct kmem_cache *s, /* * Get a page from somewhere. Search in increasing NUMA distances. 
*/ -static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags, +static void *get_any_partial(struct kmem_cache *s, gfp_t flags, struct kmem_cache_cpu *c) { #ifdef CONFIG_NUMA @@ -1614,7 +1601,7 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags, do { cpuset_mems_cookie = get_mems_allowed(); - zonelist = node_zonelist(slab_node(current->mempolicy), flags); + zonelist = node_zonelist(slab_node(), flags); for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { struct kmem_cache_node *n; @@ -1728,14 +1715,12 @@ void init_kmem_cache_cpus(struct kmem_cache *s) /* * Remove the cpu slab */ -static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) +static void deactivate_slab(struct kmem_cache *s, struct page *page, void *freelist) { enum slab_modes { M_NONE, M_PARTIAL, M_FULL, M_FREE }; - struct page *page = c->page; struct kmem_cache_node *n = get_node(s, page_to_nid(page)); int lock = 0; enum slab_modes l = M_NONE, m = M_NONE; - void *freelist; void *nextfree; int tail = DEACTIVATE_TO_HEAD; struct page new; @@ -1746,11 +1731,6 @@ static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) tail = DEACTIVATE_TO_TAIL; } - c->tid = next_tid(c->tid); - c->page = NULL; - freelist = c->freelist; - c->freelist = NULL; - /* * Stage one: Free all available per cpu objects back * to the page freelist while it is still frozen. Leave the @@ -1876,21 +1856,31 @@ redo: } } -/* Unfreeze all the cpu partial slabs */ +/* + * Unfreeze all the cpu partial slabs. + * + * This function must be called with interrupt disabled. 
+ */ static void unfreeze_partials(struct kmem_cache *s) { - struct kmem_cache_node *n = NULL; + struct kmem_cache_node *n = NULL, *n2 = NULL; struct kmem_cache_cpu *c = this_cpu_ptr(s->cpu_slab); struct page *page, *discard_page = NULL; while ((page = c->partial)) { - enum slab_modes { M_PARTIAL, M_FREE }; - enum slab_modes l, m; struct page new; struct page old; c->partial = page->next; - l = M_FREE; + + n2 = get_node(s, page_to_nid(page)); + if (n != n2) { + if (n) + spin_unlock(&n->list_lock); + + n = n2; + spin_lock(&n->list_lock); + } do { @@ -1903,43 +1893,17 @@ static void unfreeze_partials(struct kmem_cache *s) new.frozen = 0; - if (!new.inuse && (!n || n->nr_partial > s->min_partial)) - m = M_FREE; - else { - struct kmem_cache_node *n2 = get_node(s, - page_to_nid(page)); - - m = M_PARTIAL; - if (n != n2) { - if (n) - spin_unlock(&n->list_lock); - - n = n2; - spin_lock(&n->list_lock); - } - } - - if (l != m) { - if (l == M_PARTIAL) { - remove_partial(n, page); - stat(s, FREE_REMOVE_PARTIAL); - } else { - add_partial(n, page, - DEACTIVATE_TO_TAIL); - stat(s, FREE_ADD_PARTIAL); - } - - l = m; - } - - } while (!cmpxchg_double_slab(s, page, + } while (!__cmpxchg_double_slab(s, page, old.freelist, old.counters, new.freelist, new.counters, "unfreezing slab")); - if (m == M_FREE) { + if (unlikely(!new.inuse && n->nr_partial > s->min_partial)) { page->next = discard_page; discard_page = page; + } else { + add_partial(n, page, DEACTIVATE_TO_TAIL); + stat(s, FREE_ADD_PARTIAL); } } @@ -2008,7 +1972,11 @@ int put_cpu_partial(struct kmem_cache *s, struct page *page, int drain) static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) { stat(s, CPUSLAB_FLUSH); - deactivate_slab(s, c); + deactivate_slab(s, c->page, c->freelist); + + c->tid = next_tid(c->tid); + c->page = NULL; + c->freelist = NULL; } /* @@ -2040,7 +2008,7 @@ static bool has_cpu_slab(int cpu, void *info) struct kmem_cache *s = info; struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, 
cpu); - return !!(c->page); + return c->page || c->partial; } static void flush_all(struct kmem_cache *s) @@ -2052,10 +2020,10 @@ static void flush_all(struct kmem_cache *s) * Check if the objects in a per cpu structure fit numa * locality expectations. */ -static inline int node_match(struct kmem_cache_cpu *c, int node) +static inline int node_match(struct page *page, int node) { #ifdef CONFIG_NUMA - if (node != NUMA_NO_NODE && c->node != node) + if (node != NUMA_NO_NODE && page_to_nid(page) != node) return 0; #endif return 1; @@ -2098,10 +2066,10 @@ slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid) "SLUB: Unable to allocate memory on node %d (gfp=0x%x)\n", nid, gfpflags); printk(KERN_WARNING " cache: %s, object size: %d, buffer size: %d, " - "default order: %d, min order: %d\n", s->name, s->objsize, + "default order: %d, min order: %d\n", s->name, s->object_size, s->size, oo_order(s->oo), oo_order(s->min)); - if (oo_order(s->min) > get_order(s->objsize)) + if (oo_order(s->min) > get_order(s->object_size)) printk(KERN_WARNING " %s debugging increased min order, use " "slub_debug=O to disable.\n", s->name); @@ -2127,10 +2095,16 @@ slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid) static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags, int node, struct kmem_cache_cpu **pc) { - void *object; - struct kmem_cache_cpu *c; - struct page *page = new_slab(s, flags, node); + void *freelist; + struct kmem_cache_cpu *c = *pc; + struct page *page; + freelist = get_partial(s, flags, node, c); + + if (freelist) + return freelist; + + page = new_slab(s, flags, node); if (page) { c = __this_cpu_ptr(s->cpu_slab); if (c->page) @@ -2140,17 +2114,16 @@ static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags, * No other reference to the page yet so we can * muck around with it freely without cmpxchg */ - object = page->freelist; + freelist = page->freelist; page->freelist = NULL; stat(s, ALLOC_SLAB); - c->node = 
page_to_nid(page); c->page = page; *pc = c; } else - object = NULL; + freelist = NULL; - return object; + return freelist; } /* @@ -2160,6 +2133,8 @@ static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags, * The page is still frozen if the return value is not NULL. * * If this function returns NULL then the page has been unfrozen. + * + * This function must be called with interrupt disabled. */ static inline void *get_freelist(struct kmem_cache *s, struct page *page) { @@ -2170,13 +2145,14 @@ static inline void *get_freelist(struct kmem_cache *s, struct page *page) do { freelist = page->freelist; counters = page->counters; + new.counters = counters; VM_BUG_ON(!new.frozen); new.inuse = page->objects; new.frozen = freelist != NULL; - } while (!cmpxchg_double_slab(s, page, + } while (!__cmpxchg_double_slab(s, page, freelist, counters, NULL, new.counters, "get_freelist")); @@ -2203,7 +2179,8 @@ static inline void *get_freelist(struct kmem_cache *s, struct page *page) static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, unsigned long addr, struct kmem_cache_cpu *c) { - void **object; + void *freelist; + struct page *page; unsigned long flags; local_irq_save(flags); @@ -2216,25 +2193,29 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, c = this_cpu_ptr(s->cpu_slab); #endif - if (!c->page) + page = c->page; + if (!page) goto new_slab; redo: - if (unlikely(!node_match(c, node))) { + + if (unlikely(!node_match(page, node))) { stat(s, ALLOC_NODE_MISMATCH); - deactivate_slab(s, c); + deactivate_slab(s, page, c->freelist); + c->page = NULL; + c->freelist = NULL; goto new_slab; } /* must check again c->freelist in case of cpu migration or IRQ */ - object = c->freelist; - if (object) + freelist = c->freelist; + if (freelist) goto load_freelist; stat(s, ALLOC_SLOWPATH); - object = get_freelist(s, c->page); + freelist = get_freelist(s, page); - if (!object) { + if (!freelist) { c->page = NULL; stat(s, 
DEACTIVATE_BYPASS); goto new_slab; @@ -2243,50 +2224,50 @@ redo: stat(s, ALLOC_REFILL); load_freelist: - c->freelist = get_freepointer(s, object); + /* + * freelist is pointing to the list of objects to be used. + * page is pointing to the page from which the objects are obtained. + * That page must be frozen for per cpu allocations to work. + */ + VM_BUG_ON(!c->page->frozen); + c->freelist = get_freepointer(s, freelist); c->tid = next_tid(c->tid); local_irq_restore(flags); - return object; + return freelist; new_slab: if (c->partial) { - c->page = c->partial; - c->partial = c->page->next; - c->node = page_to_nid(c->page); + page = c->page = c->partial; + c->partial = page->next; stat(s, CPU_PARTIAL_ALLOC); c->freelist = NULL; goto redo; } - /* Then do expensive stuff like retrieving pages from the partial lists */ - object = get_partial(s, gfpflags, node, c); - - if (unlikely(!object)) { + freelist = new_slab_objects(s, gfpflags, node, &c); - object = new_slab_objects(s, gfpflags, node, &c); + if (unlikely(!freelist)) { + if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit()) + slab_out_of_memory(s, gfpflags, node); - if (unlikely(!object)) { - if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit()) - slab_out_of_memory(s, gfpflags, node); - - local_irq_restore(flags); - return NULL; - } + local_irq_restore(flags); + return NULL; } + page = c->page; if (likely(!kmem_cache_debug(s))) goto load_freelist; /* Only entered in the debug case */ - if (!alloc_debug_processing(s, c->page, object, addr)) + if (!alloc_debug_processing(s, page, freelist, addr)) goto new_slab; /* Slab failed checks. 
Next slab needed */ - c->freelist = get_freepointer(s, object); - deactivate_slab(s, c); - c->node = NUMA_NO_NODE; + deactivate_slab(s, page, get_freepointer(s, freelist)); + c->page = NULL; + c->freelist = NULL; local_irq_restore(flags); - return object; + return freelist; } /* @@ -2304,6 +2285,7 @@ static __always_inline void *slab_alloc(struct kmem_cache *s, { void **object; struct kmem_cache_cpu *c; + struct page *page; unsigned long tid; if (slab_pre_alloc_hook(s, gfpflags)) @@ -2329,7 +2311,8 @@ redo: barrier(); object = c->freelist; - if (unlikely(!object || !node_match(c, node))) + page = c->page; + if (unlikely(!object || !node_match(page, node))) object = __slab_alloc(s, gfpflags, node, addr, c); @@ -2361,7 +2344,7 @@ redo: } if (unlikely(gfpflags & __GFP_ZERO) && object) - memset(object, 0, s->objsize); + memset(object, 0, s->object_size); slab_post_alloc_hook(s, gfpflags, object); @@ -2372,7 +2355,7 @@ void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags) { void *ret = slab_alloc(s, gfpflags, NUMA_NO_NODE, _RET_IP_); - trace_kmem_cache_alloc(_RET_IP_, ret, s->objsize, s->size, gfpflags); + trace_kmem_cache_alloc(_RET_IP_, ret, s->object_size, s->size, gfpflags); return ret; } @@ -2402,7 +2385,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node) void *ret = slab_alloc(s, gfpflags, node, _RET_IP_); trace_kmem_cache_alloc_node(_RET_IP_, ret, - s->objsize, s->size, gfpflags, node); + s->object_size, s->size, gfpflags, node); return ret; } @@ -2766,7 +2749,7 @@ static unsigned long calculate_alignment(unsigned long flags, } static void -init_kmem_cache_node(struct kmem_cache_node *n, struct kmem_cache *s) +init_kmem_cache_node(struct kmem_cache_node *n) { n->nr_partial = 0; spin_lock_init(&n->list_lock); @@ -2836,7 +2819,7 @@ static void early_kmem_cache_node_alloc(int node) init_object(kmem_cache_node, n, SLUB_RED_ACTIVE); init_tracking(kmem_cache_node, n); #endif - init_kmem_cache_node(n, kmem_cache_node); + 
init_kmem_cache_node(n); inc_slabs_node(kmem_cache_node, node, page->objects); add_partial(n, page, DEACTIVATE_TO_HEAD); @@ -2876,7 +2859,7 @@ static int init_kmem_cache_nodes(struct kmem_cache *s) } s->node[node] = n; - init_kmem_cache_node(n, s); + init_kmem_cache_node(n); } return 1; } @@ -2897,7 +2880,7 @@ static void set_min_partial(struct kmem_cache *s, unsigned long min) static int calculate_sizes(struct kmem_cache *s, int forced_order) { unsigned long flags = s->flags; - unsigned long size = s->objsize; + unsigned long size = s->object_size; unsigned long align = s->align; int order; @@ -2926,7 +2909,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) * end of the object and the free pointer. If not then add an * additional word to have some bytes to store Redzone information. */ - if ((flags & SLAB_RED_ZONE) && size == s->objsize) + if ((flags & SLAB_RED_ZONE) && size == s->object_size) size += sizeof(void *); #endif @@ -2974,7 +2957,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) * user specified and the dynamic determination of cache line size * on bootup. */ - align = calculate_alignment(flags, align, s->objsize); + align = calculate_alignment(flags, align, s->object_size); s->align = align; /* @@ -3022,7 +3005,7 @@ static int kmem_cache_open(struct kmem_cache *s, memset(s, 0, kmem_size); s->name = name; s->ctor = ctor; - s->objsize = size; + s->object_size = size; s->align = align; s->flags = kmem_cache_flags(size, flags, name, ctor); s->reserved = 0; @@ -3037,7 +3020,7 @@ static int kmem_cache_open(struct kmem_cache *s, * Disable debugging flags that store metadata if the min slab * order increased. 
*/ - if (get_order(s->size) > get_order(s->objsize)) { + if (get_order(s->size) > get_order(s->object_size)) { s->flags &= ~DEBUG_METADATA_FLAGS; s->offset = 0; if (!calculate_sizes(s, -1)) @@ -3111,7 +3094,7 @@ error: */ unsigned int kmem_cache_size(struct kmem_cache *s) { - return s->objsize; + return s->object_size; } EXPORT_SYMBOL(kmem_cache_size); @@ -3189,11 +3172,11 @@ static inline int kmem_cache_close(struct kmem_cache *s) */ void kmem_cache_destroy(struct kmem_cache *s) { - down_write(&slub_lock); + mutex_lock(&slab_mutex); s->refcount--; if (!s->refcount) { list_del(&s->list); - up_write(&slub_lock); + mutex_unlock(&slab_mutex); if (kmem_cache_close(s)) { printk(KERN_ERR "SLUB %s: %s called for cache that " "still has objects.\n", s->name, __func__); @@ -3203,7 +3186,7 @@ void kmem_cache_destroy(struct kmem_cache *s) rcu_barrier(); sysfs_slab_remove(s); } else - up_write(&slub_lock); + mutex_unlock(&slab_mutex); } EXPORT_SYMBOL(kmem_cache_destroy); @@ -3265,7 +3248,7 @@ static struct kmem_cache *__init create_kmalloc_cache(const char *name, /* * This function is called with IRQs disabled during early-boot on - * single CPU so there's no need to take slub_lock here. + * single CPU so there's no need to take slab_mutex here. 
*/ if (!kmem_cache_open(s, name, size, ARCH_KMALLOC_MINALIGN, flags, NULL)) @@ -3550,10 +3533,10 @@ static int slab_mem_going_offline_callback(void *arg) { struct kmem_cache *s; - down_read(&slub_lock); + mutex_lock(&slab_mutex); list_for_each_entry(s, &slab_caches, list) kmem_cache_shrink(s); - up_read(&slub_lock); + mutex_unlock(&slab_mutex); return 0; } @@ -3574,7 +3557,7 @@ static void slab_mem_offline_callback(void *arg) if (offline_node < 0) return; - down_read(&slub_lock); + mutex_lock(&slab_mutex); list_for_each_entry(s, &slab_caches, list) { n = get_node(s, offline_node); if (n) { @@ -3590,7 +3573,7 @@ static void slab_mem_offline_callback(void *arg) kmem_cache_free(kmem_cache_node, n); } } - up_read(&slub_lock); + mutex_unlock(&slab_mutex); } static int slab_mem_going_online_callback(void *arg) @@ -3613,7 +3596,7 @@ static int slab_mem_going_online_callback(void *arg) * allocate a kmem_cache_node structure in order to bring the node * online. */ - down_read(&slub_lock); + mutex_lock(&slab_mutex); list_for_each_entry(s, &slab_caches, list) { /* * XXX: kmem_cache_alloc_node will fallback to other nodes @@ -3625,11 +3608,11 @@ static int slab_mem_going_online_callback(void *arg) ret = -ENOMEM; goto out; } - init_kmem_cache_node(n, s); + init_kmem_cache_node(n); s->node[nid] = n; } out: - up_read(&slub_lock); + mutex_unlock(&slab_mutex); return ret; } @@ -3840,11 +3823,11 @@ void __init kmem_cache_init(void) if (s && s->size) { char *name = kasprintf(GFP_NOWAIT, - "dma-kmalloc-%d", s->objsize); + "dma-kmalloc-%d", s->object_size); BUG_ON(!name); kmalloc_dma_caches[i] = create_kmalloc_cache(name, - s->objsize, SLAB_CACHE_DMA); + s->object_size, SLAB_CACHE_DMA); } } #endif @@ -3921,16 +3904,12 @@ static struct kmem_cache *find_mergeable(size_t size, return NULL; } -struct kmem_cache *kmem_cache_create(const char *name, size_t size, +struct kmem_cache *__kmem_cache_create(const char *name, size_t size, size_t align, unsigned long flags, void (*ctor)(void *)) { 
struct kmem_cache *s; char *n; - if (WARN_ON(!name)) - return NULL; - - down_write(&slub_lock); s = find_mergeable(size, align, flags, name, ctor); if (s) { s->refcount++; @@ -3938,49 +3917,42 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, * Adjust the object sizes so that we clear * the complete object on kzalloc. */ - s->objsize = max(s->objsize, (int)size); + s->object_size = max(s->object_size, (int)size); s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *))); if (sysfs_slab_alias(s, name)) { s->refcount--; - goto err; + return NULL; } - up_write(&slub_lock); return s; } n = kstrdup(name, GFP_KERNEL); if (!n) - goto err; + return NULL; s = kmalloc(kmem_size, GFP_KERNEL); if (s) { if (kmem_cache_open(s, n, size, align, flags, ctor)) { + int r; + list_add(&s->list, &slab_caches); - up_write(&slub_lock); - if (sysfs_slab_add(s)) { - down_write(&slub_lock); - list_del(&s->list); - kfree(n); - kfree(s); - goto err; - } - return s; + mutex_unlock(&slab_mutex); + r = sysfs_slab_add(s); + mutex_lock(&slab_mutex); + + if (!r) + return s; + + list_del(&s->list); + kmem_cache_close(s); } - kfree(n); kfree(s); } -err: - up_write(&slub_lock); - - if (flags & SLAB_PANIC) - panic("Cannot create slabcache %s\n", name); - else - s = NULL; - return s; + kfree(n); + return NULL; } -EXPORT_SYMBOL(kmem_cache_create); #ifdef CONFIG_SMP /* @@ -3999,13 +3971,13 @@ static int __cpuinit slab_cpuup_callback(struct notifier_block *nfb, case CPU_UP_CANCELED_FROZEN: case CPU_DEAD: case CPU_DEAD_FROZEN: - down_read(&slub_lock); + mutex_lock(&slab_mutex); list_for_each_entry(s, &slab_caches, list) { local_irq_save(flags); __flush_cpu_slab(s, cpu); local_irq_restore(flags); } - up_read(&slub_lock); + mutex_unlock(&slab_mutex); break; default: break; @@ -4497,30 +4469,31 @@ static ssize_t show_slab_objects(struct kmem_cache *s, for_each_possible_cpu(cpu) { struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); - int node = ACCESS_ONCE(c->node); + int node; 
struct page *page; - if (node < 0) - continue; page = ACCESS_ONCE(c->page); - if (page) { - if (flags & SO_TOTAL) - x = page->objects; - else if (flags & SO_OBJECTS) - x = page->inuse; - else - x = 1; + if (!page) + continue; - total += x; - nodes[node] += x; - } - page = c->partial; + node = page_to_nid(page); + if (flags & SO_TOTAL) + x = page->objects; + else if (flags & SO_OBJECTS) + x = page->inuse; + else + x = 1; + total += x; + nodes[node] += x; + + page = ACCESS_ONCE(c->partial); if (page) { x = page->pobjects; total += x; nodes[node] += x; } + per_cpu[node]++; } } @@ -4620,7 +4593,7 @@ SLAB_ATTR_RO(align); static ssize_t object_size_show(struct kmem_cache *s, char *buf) { - return sprintf(buf, "%d\n", s->objsize); + return sprintf(buf, "%d\n", s->object_size); } SLAB_ATTR_RO(object_size); @@ -5283,7 +5256,7 @@ static int sysfs_slab_add(struct kmem_cache *s) const char *name; int unmergeable; - if (slab_state < SYSFS) + if (slab_state < FULL) /* Defer until later */ return 0; @@ -5328,7 +5301,7 @@ static int sysfs_slab_add(struct kmem_cache *s) static void sysfs_slab_remove(struct kmem_cache *s) { - if (slab_state < SYSFS) + if (slab_state < FULL) /* * Sysfs has not been setup yet so no need to remove the * cache from sysfs. @@ -5356,7 +5329,7 @@ static int sysfs_slab_alias(struct kmem_cache *s, const char *name) { struct saved_alias *al; - if (slab_state == SYSFS) { + if (slab_state == FULL) { /* * If we have a leftover link then remove it. 
*/ @@ -5380,16 +5353,16 @@ static int __init slab_sysfs_init(void) struct kmem_cache *s; int err; - down_write(&slub_lock); + mutex_lock(&slab_mutex); slab_kset = kset_create_and_add("slab", &slab_uevent_ops, kernel_kobj); if (!slab_kset) { - up_write(&slub_lock); + mutex_unlock(&slab_mutex); printk(KERN_ERR "Cannot register slab subsystem.\n"); return -ENOSYS; } - slab_state = SYSFS; + slab_state = FULL; list_for_each_entry(s, &slab_caches, list) { err = sysfs_slab_add(s); @@ -5405,11 +5378,11 @@ static int __init slab_sysfs_init(void) err = sysfs_slab_alias(al->s, al->name); if (err) printk(KERN_ERR "SLUB: Unable to add boot slab alias" - " %s to sysfs\n", s->name); + " %s to sysfs\n", al->name); kfree(al); } - up_write(&slub_lock); + mutex_unlock(&slab_mutex); resiliency_test(); return 0; } @@ -5424,7 +5397,7 @@ __initcall(slab_sysfs_init); static void print_slabinfo_header(struct seq_file *m) { seq_puts(m, "slabinfo - version: 2.1\n"); - seq_puts(m, "# name <active_objs> <num_objs> <objsize> " + seq_puts(m, "# name <active_objs> <num_objs> <object_size> " "<objperslab> <pagesperslab>"); seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>"); seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>"); @@ -5435,7 +5408,7 @@ static void *s_start(struct seq_file *m, loff_t *pos) { loff_t n = *pos; - down_read(&slub_lock); + mutex_lock(&slab_mutex); if (!n) print_slabinfo_header(m); @@ -5449,7 +5422,7 @@ static void *s_next(struct seq_file *m, void *p, loff_t *pos) static void s_stop(struct seq_file *m, void *p) { - up_read(&slub_lock); + mutex_unlock(&slab_mutex); } static int s_show(struct seq_file *m, void *p) diff --git a/mm/sparse.c b/mm/sparse.c index a8bc7d364deb..c7bb952400c8 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -273,10 +273,11 @@ static unsigned long *__kmalloc_section_usemap(void) #ifdef CONFIG_MEMORY_HOTREMOVE static unsigned long * __init sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat, - unsigned long count) 
+ unsigned long size) { - unsigned long section_nr; - + unsigned long goal, limit; + unsigned long *p; + int nid; /* * A page may contain usemaps for other sections preventing the * page being freed and making a section unremovable while @@ -287,8 +288,17 @@ sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat, * from the same section as the pgdat where possible to avoid * this problem. */ - section_nr = pfn_to_section_nr(__pa(pgdat) >> PAGE_SHIFT); - return alloc_bootmem_section(usemap_size() * count, section_nr); + goal = __pa(pgdat) & (PAGE_SECTION_MASK << PAGE_SHIFT); + limit = goal + (1UL << PA_SECTION_SHIFT); + nid = early_pfn_to_nid(goal >> PAGE_SHIFT); +again: + p = ___alloc_bootmem_node_nopanic(NODE_DATA(nid), size, + SMP_CACHE_BYTES, goal, limit); + if (!p && limit) { + limit = 0; + goto again; + } + return p; } static void __init check_usemap_section_nr(int nid, unsigned long *usemap) @@ -332,9 +342,9 @@ static void __init check_usemap_section_nr(int nid, unsigned long *usemap) #else static unsigned long * __init sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat, - unsigned long count) + unsigned long size) { - return NULL; + return alloc_bootmem_node_nopanic(pgdat, size); } static void __init check_usemap_section_nr(int nid, unsigned long *usemap) @@ -352,13 +362,10 @@ static void __init sparse_early_usemaps_alloc_node(unsigned long**usemap_map, int size = usemap_size(); usemap = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nodeid), - usemap_count); + size * usemap_count); if (!usemap) { - usemap = alloc_bootmem_node(NODE_DATA(nodeid), size * usemap_count); - if (!usemap) { - printk(KERN_WARNING "%s: allocation failed\n", __func__); - return; - } + printk(KERN_WARNING "%s: allocation failed\n", __func__); + return; } for (pnum = pnum_begin; pnum < pnum_end; pnum++) { diff --git a/mm/swap.c b/mm/swap.c index 5c13f1338972..4e7e2ec67078 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -47,13 +47,15 @@ static 
DEFINE_PER_CPU(struct pagevec, lru_deactivate_pvecs); static void __page_cache_release(struct page *page) { if (PageLRU(page)) { - unsigned long flags; struct zone *zone = page_zone(page); + struct lruvec *lruvec; + unsigned long flags; spin_lock_irqsave(&zone->lru_lock, flags); + lruvec = mem_cgroup_page_lruvec(page, zone); VM_BUG_ON(!PageLRU(page)); __ClearPageLRU(page); - del_page_from_lru_list(zone, page, page_off_lru(page)); + del_page_from_lru_list(page, lruvec, page_off_lru(page)); spin_unlock_irqrestore(&zone->lru_lock, flags); } } @@ -82,6 +84,25 @@ static void put_compound_page(struct page *page) if (likely(page != page_head && get_page_unless_zero(page_head))) { unsigned long flags; + + /* + * THP can not break up slab pages so avoid taking + * compound_lock(). Slab performs non-atomic bit ops + * on page->flags for better performance. In particular + * slab_unlock() in slub used to be a hot path. It is + * still hot on arches that do not support + * this_cpu_cmpxchg_double(). 
+ */ + if (PageSlab(page_head)) { + if (PageTail(page)) { + if (put_page_testzero(page_head)) + VM_BUG_ON(1); + + atomic_dec(&page->_mapcount); + goto skip_lock_tail; + } else + goto skip_lock; + } /* * page_head wasn't a dangling pointer but it * may not be a head page anymore by the time @@ -92,10 +113,10 @@ static void put_compound_page(struct page *page) if (unlikely(!PageTail(page))) { /* __split_huge_page_refcount run before us */ compound_unlock_irqrestore(page_head, flags); - VM_BUG_ON(PageHead(page_head)); +skip_lock: if (put_page_testzero(page_head)) __put_single_page(page_head); - out_put_single: +out_put_single: if (put_page_testzero(page)) __put_single_page(page); return; @@ -115,6 +136,8 @@ static void put_compound_page(struct page *page) VM_BUG_ON(atomic_read(&page_head->_count) <= 0); VM_BUG_ON(atomic_read(&page->_count) != 0); compound_unlock_irqrestore(page_head, flags); + +skip_lock_tail: if (put_page_testzero(page_head)) { if (PageHead(page_head)) __put_compound_page(page_head); @@ -162,6 +185,18 @@ bool __get_page_tail(struct page *page) struct page *page_head = compound_trans_head(page); if (likely(page != page_head && get_page_unless_zero(page_head))) { + + /* Ref to put_compound_page() comment. 
*/ + if (PageSlab(page_head)) { + if (likely(PageTail(page))) { + __get_page_tail_foll(page, false); + return true; + } else { + put_page(page_head); + return false; + } + } + /* * page_head wasn't a dangling pointer but it * may not be a head page anymore by the time @@ -202,11 +237,12 @@ void put_pages_list(struct list_head *pages) EXPORT_SYMBOL(put_pages_list); static void pagevec_lru_move_fn(struct pagevec *pvec, - void (*move_fn)(struct page *page, void *arg), - void *arg) + void (*move_fn)(struct page *page, struct lruvec *lruvec, void *arg), + void *arg) { int i; struct zone *zone = NULL; + struct lruvec *lruvec; unsigned long flags = 0; for (i = 0; i < pagevec_count(pvec); i++) { @@ -220,7 +256,8 @@ static void pagevec_lru_move_fn(struct pagevec *pvec, spin_lock_irqsave(&zone->lru_lock, flags); } - (*move_fn)(page, arg); + lruvec = mem_cgroup_page_lruvec(page, zone); + (*move_fn)(page, lruvec, arg); } if (zone) spin_unlock_irqrestore(&zone->lru_lock, flags); @@ -228,16 +265,13 @@ static void pagevec_lru_move_fn(struct pagevec *pvec, pagevec_reinit(pvec); } -static void pagevec_move_tail_fn(struct page *page, void *arg) +static void pagevec_move_tail_fn(struct page *page, struct lruvec *lruvec, + void *arg) { int *pgmoved = arg; if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { enum lru_list lru = page_lru_base_type(page); - struct lruvec *lruvec; - - lruvec = mem_cgroup_lru_move_lists(page_zone(page), - page, lru, lru); list_move_tail(&page->lru, &lruvec->lists[lru]); (*pgmoved)++; } @@ -276,41 +310,30 @@ void rotate_reclaimable_page(struct page *page) } } -static void update_page_reclaim_stat(struct zone *zone, struct page *page, +static void update_page_reclaim_stat(struct lruvec *lruvec, int file, int rotated) { - struct zone_reclaim_stat *reclaim_stat = &zone->reclaim_stat; - struct zone_reclaim_stat *memcg_reclaim_stat; - - memcg_reclaim_stat = mem_cgroup_get_reclaim_stat_from_page(page); + struct zone_reclaim_stat *reclaim_stat = 
&lruvec->reclaim_stat; reclaim_stat->recent_scanned[file]++; if (rotated) reclaim_stat->recent_rotated[file]++; - - if (!memcg_reclaim_stat) - return; - - memcg_reclaim_stat->recent_scanned[file]++; - if (rotated) - memcg_reclaim_stat->recent_rotated[file]++; } -static void __activate_page(struct page *page, void *arg) +static void __activate_page(struct page *page, struct lruvec *lruvec, + void *arg) { - struct zone *zone = page_zone(page); - if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { int file = page_is_file_cache(page); int lru = page_lru_base_type(page); - del_page_from_lru_list(zone, page, lru); + del_page_from_lru_list(page, lruvec, lru); SetPageActive(page); lru += LRU_ACTIVE; - add_page_to_lru_list(zone, page, lru); - __count_vm_event(PGACTIVATE); + add_page_to_lru_list(page, lruvec, lru); - update_page_reclaim_stat(zone, page, file, 1); + __count_vm_event(PGACTIVATE); + update_page_reclaim_stat(lruvec, file, 1); } } @@ -347,7 +370,7 @@ void activate_page(struct page *page) struct zone *zone = page_zone(page); spin_lock_irq(&zone->lru_lock); - __activate_page(page, NULL); + __activate_page(page, mem_cgroup_page_lruvec(page, zone), NULL); spin_unlock_irq(&zone->lru_lock); } #endif @@ -414,11 +437,13 @@ void lru_cache_add_lru(struct page *page, enum lru_list lru) void add_page_to_unevictable_list(struct page *page) { struct zone *zone = page_zone(page); + struct lruvec *lruvec; spin_lock_irq(&zone->lru_lock); + lruvec = mem_cgroup_page_lruvec(page, zone); SetPageUnevictable(page); SetPageLRU(page); - add_page_to_lru_list(zone, page, LRU_UNEVICTABLE); + add_page_to_lru_list(page, lruvec, LRU_UNEVICTABLE); spin_unlock_irq(&zone->lru_lock); } @@ -443,11 +468,11 @@ void add_page_to_unevictable_list(struct page *page) * be write it out by flusher threads as this is much more effective * than the single-page writeout from reclaim. 
*/ -static void lru_deactivate_fn(struct page *page, void *arg) +static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec, + void *arg) { int lru, file; bool active; - struct zone *zone = page_zone(page); if (!PageLRU(page)) return; @@ -460,13 +485,13 @@ static void lru_deactivate_fn(struct page *page, void *arg) return; active = PageActive(page); - file = page_is_file_cache(page); lru = page_lru_base_type(page); - del_page_from_lru_list(zone, page, lru + active); + + del_page_from_lru_list(page, lruvec, lru + active); ClearPageActive(page); ClearPageReferenced(page); - add_page_to_lru_list(zone, page, lru); + add_page_to_lru_list(page, lruvec, lru); if (PageWriteback(page) || PageDirty(page)) { /* @@ -476,19 +501,17 @@ static void lru_deactivate_fn(struct page *page, void *arg) */ SetPageReclaim(page); } else { - struct lruvec *lruvec; /* * The page's writeback ends up during pagevec * We moves tha page into tail of inactive. */ - lruvec = mem_cgroup_lru_move_lists(zone, page, lru, lru); list_move_tail(&page->lru, &lruvec->lists[lru]); __count_vm_event(PGROTATED); } if (active) __count_vm_event(PGDEACTIVATE); - update_page_reclaim_stat(zone, page, file, 0); + update_page_reclaim_stat(lruvec, file, 0); } /* @@ -588,6 +611,7 @@ void release_pages(struct page **pages, int nr, int cold) int i; LIST_HEAD(pages_to_free); struct zone *zone = NULL; + struct lruvec *lruvec; unsigned long uninitialized_var(flags); for (i = 0; i < nr; i++) { @@ -615,9 +639,11 @@ void release_pages(struct page **pages, int nr, int cold) zone = pagezone; spin_lock_irqsave(&zone->lru_lock, flags); } + + lruvec = mem_cgroup_page_lruvec(page, zone); VM_BUG_ON(!PageLRU(page)); __ClearPageLRU(page); - del_page_from_lru_list(zone, page, page_off_lru(page)); + del_page_from_lru_list(page, lruvec, page_off_lru(page)); } list_add(&page->lru, &pages_to_free); @@ -649,8 +675,8 @@ EXPORT_SYMBOL(__pagevec_release); #ifdef CONFIG_TRANSPARENT_HUGEPAGE /* used by __split_huge_page_refcount() */ 
-void lru_add_page_tail(struct zone* zone, - struct page *page, struct page *page_tail) +void lru_add_page_tail(struct page *page, struct page *page_tail, + struct lruvec *lruvec) { int uninitialized_var(active); enum lru_list lru; @@ -659,7 +685,8 @@ void lru_add_page_tail(struct zone* zone, VM_BUG_ON(!PageHead(page)); VM_BUG_ON(PageCompound(page_tail)); VM_BUG_ON(PageLRU(page_tail)); - VM_BUG_ON(NR_CPUS != 1 && !spin_is_locked(&zone->lru_lock)); + VM_BUG_ON(NR_CPUS != 1 && + !spin_is_locked(&lruvec_zone(lruvec)->lru_lock)); SetPageLRU(page_tail); @@ -688,20 +715,20 @@ void lru_add_page_tail(struct zone* zone, * Use the standard add function to put page_tail on the list, * but then correct its position so they all end up in order. */ - add_page_to_lru_list(zone, page_tail, lru); + add_page_to_lru_list(page_tail, lruvec, lru); list_head = page_tail->lru.prev; list_move_tail(&page_tail->lru, list_head); } if (!PageUnevictable(page)) - update_page_reclaim_stat(zone, page_tail, file, active); + update_page_reclaim_stat(lruvec, file, active); } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ -static void __pagevec_lru_add_fn(struct page *page, void *arg) +static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec, + void *arg) { enum lru_list lru = (enum lru_list)arg; - struct zone *zone = page_zone(page); int file = is_file_lru(lru); int active = is_active_lru(lru); @@ -712,8 +739,8 @@ static void __pagevec_lru_add_fn(struct page *page, void *arg) SetPageLRU(page); if (active) SetPageActive(page); - add_page_to_lru_list(zone, page, lru); - update_page_reclaim_stat(zone, page, file, active); + add_page_to_lru_list(page, lruvec, lru); + update_page_reclaim_stat(lruvec, file, active); } /* diff --git a/mm/swapfile.c b/mm/swapfile.c index fafc26d1b1dc..71373d03fcee 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -31,6 +31,8 @@ #include <linux/memcontrol.h> #include <linux/poll.h> #include <linux/oom.h> +#include <linux/frontswap.h> +#include <linux/swapfile.h> 
#include <asm/pgtable.h> #include <asm/tlbflush.h> @@ -42,7 +44,7 @@ static bool swap_count_continued(struct swap_info_struct *, pgoff_t, static void free_swap_count_continuations(struct swap_info_struct *); static sector_t map_swap_entry(swp_entry_t, struct block_device**); -static DEFINE_SPINLOCK(swap_lock); +DEFINE_SPINLOCK(swap_lock); static unsigned int nr_swapfiles; long nr_swap_pages; long total_swap_pages; @@ -53,9 +55,9 @@ static const char Unused_file[] = "Unused swap file entry "; static const char Bad_offset[] = "Bad swap offset entry "; static const char Unused_offset[] = "Unused swap offset entry "; -static struct swap_list_t swap_list = {-1, -1}; +struct swap_list_t swap_list = {-1, -1}; -static struct swap_info_struct *swap_info[MAX_SWAPFILES]; +struct swap_info_struct *swap_info[MAX_SWAPFILES]; static DEFINE_MUTEX(swapon_mutex); @@ -556,6 +558,7 @@ static unsigned char swap_entry_free(struct swap_info_struct *p, swap_list.next = p->type; nr_swap_pages++; p->inuse_pages--; + frontswap_invalidate_page(p->type, offset); if ((p->flags & SWP_BLKDEV) && disk->fops->swap_slot_free_notify) disk->fops->swap_slot_free_notify(p->bdev, offset); @@ -601,7 +604,7 @@ void swapcache_free(swp_entry_t entry, struct page *page) * This does not give an exact answer when swap count is continued, * but does include the high COUNT_CONTINUED flag to allow for that. */ -static inline int page_swapcount(struct page *page) +int page_swapcount(struct page *page) { int count = 0; struct swap_info_struct *p; @@ -717,37 +720,6 @@ int free_swap_and_cache(swp_entry_t entry) return p != NULL; } -#ifdef CONFIG_CGROUP_MEM_RES_CTLR -/** - * mem_cgroup_count_swap_user - count the user of a swap entry - * @ent: the swap entry to be checked - * @pagep: the pointer for the swap cache page of the entry to be stored - * - * Returns the number of the user of the swap entry. The number is valid only - * for swaps of anonymous pages. 
- * If the entry is found on swap cache, the page is stored to pagep with - * refcount of it being incremented. - */ -int mem_cgroup_count_swap_user(swp_entry_t ent, struct page **pagep) -{ - struct page *page; - struct swap_info_struct *p; - int count = 0; - - page = find_get_page(&swapper_space, ent.val); - if (page) - count += page_mapcount(page); - p = swap_info_get(ent); - if (p) { - count += swap_count(p->swap_map[swp_offset(ent)]); - spin_unlock(&swap_lock); - } - - *pagep = page; - return count; -} -#endif - #ifdef CONFIG_HIBERNATION /* * Find the swap type that corresponds to given device (if any). @@ -1016,11 +988,12 @@ static int unuse_mm(struct mm_struct *mm, } /* - * Scan swap_map from current position to next entry still in use. + * Scan swap_map (or frontswap_map if frontswap parameter is true) + * from current position to next entry still in use. * Recycle to start on reaching the end, returning 0 when empty. */ static unsigned int find_next_to_unuse(struct swap_info_struct *si, - unsigned int prev) + unsigned int prev, bool frontswap) { unsigned int max = si->max; unsigned int i = prev; @@ -1046,6 +1019,12 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si, prev = 0; i = 1; } + if (frontswap) { + if (frontswap_test(si, i)) + break; + else + continue; + } count = si->swap_map[i]; if (count && swap_count(count) != SWAP_MAP_BAD) break; @@ -1057,8 +1036,12 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si, * We completely avoid races by reading each swap page in advance, * and then search for the process using it. All the necessary * page table adjustments can then be made atomically. 
+ * + * if the boolean frontswap is true, only unuse pages_to_unuse pages; + * pages_to_unuse==0 means all pages; ignored if frontswap is false */ -static int try_to_unuse(unsigned int type) +int try_to_unuse(unsigned int type, bool frontswap, + unsigned long pages_to_unuse) { struct swap_info_struct *si = swap_info[type]; struct mm_struct *start_mm; @@ -1091,7 +1074,7 @@ static int try_to_unuse(unsigned int type) * one pass through swap_map is enough, but not necessarily: * there are races when an instance of an entry might be missed. */ - while ((i = find_next_to_unuse(si, i)) != 0) { + while ((i = find_next_to_unuse(si, i, frontswap)) != 0) { if (signal_pending(current)) { retval = -EINTR; break; @@ -1258,6 +1241,10 @@ static int try_to_unuse(unsigned int type) * interactive performance. */ cond_resched(); + if (frontswap && pages_to_unuse > 0) { + if (!--pages_to_unuse) + break; + } } mmput(start_mm); @@ -1517,7 +1504,8 @@ bad_bmap: } static void enable_swap_info(struct swap_info_struct *p, int prio, - unsigned char *swap_map) + unsigned char *swap_map, + unsigned long *frontswap_map) { int i, prev; @@ -1527,6 +1515,7 @@ static void enable_swap_info(struct swap_info_struct *p, int prio, else p->prio = --least_priority; p->swap_map = swap_map; + frontswap_map_set(p, frontswap_map); p->flags |= SWP_WRITEOK; nr_swap_pages += p->pages; total_swap_pages += p->pages; @@ -1543,6 +1532,7 @@ static void enable_swap_info(struct swap_info_struct *p, int prio, swap_list.head = swap_list.next = p->type; else swap_info[prev]->next = p->type; + frontswap_init(p->type); spin_unlock(&swap_lock); } @@ -1616,7 +1606,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) spin_unlock(&swap_lock); oom_score_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX); - err = try_to_unuse(type); + err = try_to_unuse(type, false, 0); /* force all pages to be unused */ compare_swap_oom_score_adj(OOM_SCORE_ADJ_MAX, oom_score_adj); if (err) { @@ -1627,7 +1617,7 @@ SYSCALL_DEFINE1(swapoff, 
const char __user *, specialfile) * sys_swapoff for this swap_info_struct at this point. */ /* re-insert swap space back into swap_list */ - enable_swap_info(p, p->prio, p->swap_map); + enable_swap_info(p, p->prio, p->swap_map, frontswap_map_get(p)); goto out_dput; } @@ -1653,9 +1643,11 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) swap_map = p->swap_map; p->swap_map = NULL; p->flags = 0; + frontswap_invalidate_area(type); spin_unlock(&swap_lock); mutex_unlock(&swapon_mutex); vfree(swap_map); + vfree(frontswap_map_get(p)); /* Destroy swap account informatin */ swap_cgroup_swapoff(type); @@ -1924,24 +1916,20 @@ static unsigned long read_swap_header(struct swap_info_struct *p, /* * Find out how many pages are allowed for a single swap - * device. There are three limiting factors: 1) the number + * device. There are two limiting factors: 1) the number * of bits for the swap offset in the swp_entry_t type, and * 2) the number of bits in the swap pte as defined by the - * the different architectures, and 3) the number of free bits - * in an exceptional radix_tree entry. In order to find the + * different architectures. In order to find the * largest possible bit mask, a swap entry with swap type 0 * and swap offset ~0UL is created, encoded to a swap pte, * decoded to a swp_entry_t again, and finally the swap * offset is extracted. This will mask all the bits from * the initial ~0UL mask that can't be encoded in either * the swp_entry_t or the architecture definition of a - * swap pte. Then the same is done for a radix_tree entry. + * swap pte. 
*/ maxpages = swp_offset(pte_to_swp_entry( - swp_entry_to_pte(swp_entry(0, ~0UL)))); - maxpages = swp_offset(radix_to_swp_entry( - swp_to_radix_entry(swp_entry(0, maxpages)))) + 1; - + swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1; if (maxpages > swap_header->info.last_page) { maxpages = swap_header->info.last_page + 1; /* p->max is an unsigned int: don't overflow it */ @@ -2019,6 +2007,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) sector_t span; unsigned long maxpages; unsigned char *swap_map = NULL; + unsigned long *frontswap_map = NULL; struct page *page = NULL; struct inode *inode = NULL; @@ -2102,6 +2091,9 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) error = nr_extents; goto bad_swap; } + /* frontswap enabled? set up bit-per-page map for frontswap */ + if (frontswap_enabled) + frontswap_map = vzalloc(maxpages / sizeof(long)); if (p->bdev) { if (blk_queue_nonrot(bdev_get_queue(p->bdev))) { @@ -2117,14 +2109,15 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) if (swap_flags & SWAP_FLAG_PREFER) prio = (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT; - enable_swap_info(p, prio, swap_map); + enable_swap_info(p, prio, swap_map, frontswap_map); printk(KERN_INFO "Adding %uk swap on %s. " - "Priority:%d extents:%d across:%lluk %s%s\n", + "Priority:%d extents:%d across:%lluk %s%s%s\n", p->pages<<(PAGE_SHIFT-10), name, p->prio, nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10), (p->flags & SWP_SOLIDSTATE) ? "SS" : "", - (p->flags & SWP_DISCARDABLE) ? "D" : ""); + (p->flags & SWP_DISCARDABLE) ? "D" : "", + (frontswap_map) ? "FS" : ""); mutex_unlock(&swapon_mutex); atomic_inc(&proc_poll_event); diff --git a/mm/thrash.c b/mm/thrash.c deleted file mode 100644 index 57ad495dbd54..000000000000 --- a/mm/thrash.c +++ /dev/null @@ -1,155 +0,0 @@ -/* - * mm/thrash.c - * - * Copyright (C) 2004, Red Hat, Inc. 
- * Copyright (C) 2004, Rik van Riel <riel@redhat.com> - * Released under the GPL, see the file COPYING for details. - * - * Simple token based thrashing protection, using the algorithm - * described in: http://www.cse.ohio-state.edu/hpcs/WWW/HTML/publications/abs05-1.html - * - * Sep 2006, Ashwin Chaugule <ashwin.chaugule@celunite.com> - * Improved algorithm to pass token: - * Each task has a priority which is incremented if it contended - * for the token in an interval less than its previous attempt. - * If the token is acquired, that task's priority is boosted to prevent - * the token from bouncing around too often and to let the task make - * some progress in its execution. - */ - -#include <linux/jiffies.h> -#include <linux/mm.h> -#include <linux/sched.h> -#include <linux/swap.h> -#include <linux/memcontrol.h> - -#include <trace/events/vmscan.h> - -#define TOKEN_AGING_INTERVAL (0xFF) - -static DEFINE_SPINLOCK(swap_token_lock); -struct mm_struct *swap_token_mm; -static struct mem_cgroup *swap_token_memcg; - -#ifdef CONFIG_CGROUP_MEM_RES_CTLR -static struct mem_cgroup *swap_token_memcg_from_mm(struct mm_struct *mm) -{ - struct mem_cgroup *memcg; - - memcg = try_get_mem_cgroup_from_mm(mm); - if (memcg) - css_put(mem_cgroup_css(memcg)); - - return memcg; -} -#else -static struct mem_cgroup *swap_token_memcg_from_mm(struct mm_struct *mm) -{ - return NULL; -} -#endif - -void grab_swap_token(struct mm_struct *mm) -{ - int current_interval; - unsigned int old_prio = mm->token_priority; - static unsigned int global_faults; - static unsigned int last_aging; - - global_faults++; - - current_interval = global_faults - mm->faultstamp; - - if (!spin_trylock(&swap_token_lock)) - return; - - /* First come first served */ - if (!swap_token_mm) - goto replace_token; - - /* - * Usually, we don't need priority aging because long interval faults - * makes priority decrease quickly. But there is one exception. 
If the - * token owner task is sleeping, it never make long interval faults. - * Thus, we need a priority aging mechanism instead. The requirements - * of priority aging are - * 1) An aging interval is reasonable enough long. Too short aging - * interval makes quick swap token lost and decrease performance. - * 2) The swap token owner task have to get priority aging even if - * it's under sleep. - */ - if ((global_faults - last_aging) > TOKEN_AGING_INTERVAL) { - swap_token_mm->token_priority /= 2; - last_aging = global_faults; - } - - if (mm == swap_token_mm) { - mm->token_priority += 2; - goto update_priority; - } - - if (current_interval < mm->last_interval) - mm->token_priority++; - else { - if (likely(mm->token_priority > 0)) - mm->token_priority--; - } - - /* Check if we deserve the token */ - if (mm->token_priority > swap_token_mm->token_priority) - goto replace_token; - -update_priority: - trace_update_swap_token_priority(mm, old_prio, swap_token_mm); - -out: - mm->faultstamp = global_faults; - mm->last_interval = current_interval; - spin_unlock(&swap_token_lock); - return; - -replace_token: - mm->token_priority += 2; - trace_replace_swap_token(swap_token_mm, mm); - swap_token_mm = mm; - swap_token_memcg = swap_token_memcg_from_mm(mm); - last_aging = global_faults; - goto out; -} - -/* Called on process exit. */ -void __put_swap_token(struct mm_struct *mm) -{ - spin_lock(&swap_token_lock); - if (likely(mm == swap_token_mm)) { - trace_put_swap_token(swap_token_mm); - swap_token_mm = NULL; - swap_token_memcg = NULL; - } - spin_unlock(&swap_token_lock); -} - -static bool match_memcg(struct mem_cgroup *a, struct mem_cgroup *b) -{ - if (!a) - return true; - if (!b) - return true; - if (a == b) - return true; - return false; -} - -void disable_swap_token(struct mem_cgroup *memcg) -{ - /* memcg reclaim don't disable unrelated mm token. 
*/ - if (match_memcg(memcg, swap_token_memcg)) { - spin_lock(&swap_token_lock); - if (match_memcg(memcg, swap_token_memcg)) { - trace_disable_swap_token(swap_token_mm); - swap_token_mm = NULL; - swap_token_memcg = NULL; - } - spin_unlock(&swap_token_lock); - } -} diff --git a/mm/truncate.c b/mm/truncate.c index 61a183b89df6..75801acdaac7 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -602,31 +602,6 @@ int vmtruncate(struct inode *inode, loff_t newsize) } EXPORT_SYMBOL(vmtruncate); -int vmtruncate_range(struct inode *inode, loff_t lstart, loff_t lend) -{ - struct address_space *mapping = inode->i_mapping; - loff_t holebegin = round_up(lstart, PAGE_SIZE); - loff_t holelen = 1 + lend - holebegin; - - /* - * If the underlying filesystem is not going to provide - * a way to truncate a range of blocks (punch a hole) - - * we should return failure right now. - */ - if (!inode->i_op->truncate_range) - return -ENOSYS; - - mutex_lock(&inode->i_mutex); - inode_dio_wait(inode); - unmap_mapping_range(mapping, holebegin, holelen, 1); - inode->i_op->truncate_range(inode, lstart, lend); - /* unmap again to remove racily COWed private pages */ - unmap_mapping_range(mapping, holebegin, holelen, 1); - mutex_unlock(&inode->i_mutex); - - return 0; -} - /** * truncate_pagecache_range - unmap and remove pagecache that is hole-punched * @inode: inode diff --git a/mm/util.c b/mm/util.c index ae962b31de88..8c7265afa29f 100644 --- a/mm/util.c +++ b/mm/util.c @@ -4,6 +4,7 @@ #include <linux/export.h> #include <linux/err.h> #include <linux/sched.h> +#include <linux/security.h> #include <asm/uaccess.h> #include "internal.h" @@ -341,6 +342,35 @@ int __attribute__((weak)) get_user_pages_fast(unsigned long start, } EXPORT_SYMBOL_GPL(get_user_pages_fast); +unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr, + unsigned long len, unsigned long prot, + unsigned long flag, unsigned long pgoff) +{ + unsigned long ret; + struct mm_struct *mm = current->mm; + + ret = 
security_mmap_file(file, prot, flag); + if (!ret) { + down_write(&mm->mmap_sem); + ret = do_mmap_pgoff(file, addr, len, prot, flag, pgoff); + up_write(&mm->mmap_sem); + } + return ret; +} + +unsigned long vm_mmap(struct file *file, unsigned long addr, + unsigned long len, unsigned long prot, + unsigned long flag, unsigned long offset) +{ + if (unlikely(offset + PAGE_ALIGN(len) < offset)) + return -EINVAL; + if (unlikely(offset & ~PAGE_MASK)) + return -EINVAL; + + return vm_mmap_pgoff(file, addr, len, prot, flag, offset >> PAGE_SHIFT); +} +EXPORT_SYMBOL(vm_mmap); + /* Tracepoints definitions. */ EXPORT_TRACEPOINT_SYMBOL(kmalloc); EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc); diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 94dff883b449..e03f4c7307a5 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1185,9 +1185,10 @@ void __init vmalloc_init(void) /* Import existing vmlist entries. */ for (tmp = vmlist; tmp; tmp = tmp->next) { va = kzalloc(sizeof(struct vmap_area), GFP_NOWAIT); - va->flags = tmp->flags | VM_VM_AREA; + va->flags = VM_VM_AREA; va->va_start = (unsigned long)tmp->addr; va->va_end = va->va_start + tmp->size; + va->vm = tmp; __insert_vmap_area(va); } @@ -1279,7 +1280,7 @@ DEFINE_RWLOCK(vmlist_lock); struct vm_struct *vmlist; static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, - unsigned long flags, void *caller) + unsigned long flags, const void *caller) { vm->flags = flags; vm->addr = (void *)va->va_start; @@ -1305,7 +1306,7 @@ static void insert_vmalloc_vmlist(struct vm_struct *vm) } static void insert_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, - unsigned long flags, void *caller) + unsigned long flags, const void *caller) { setup_vmalloc_vm(vm, va, flags, caller); insert_vmalloc_vmlist(vm); @@ -1313,7 +1314,7 @@ static void insert_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, static struct vm_struct *__get_vm_area_node(unsigned long size, unsigned long align, unsigned long flags, unsigned long start, - unsigned long 
end, int node, gfp_t gfp_mask, void *caller) + unsigned long end, int node, gfp_t gfp_mask, const void *caller) { struct vmap_area *va; struct vm_struct *area; @@ -1374,7 +1375,7 @@ EXPORT_SYMBOL_GPL(__get_vm_area); struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags, unsigned long start, unsigned long end, - void *caller) + const void *caller) { return __get_vm_area_node(size, 1, flags, start, end, -1, GFP_KERNEL, caller); @@ -1396,13 +1397,21 @@ struct vm_struct *get_vm_area(unsigned long size, unsigned long flags) } struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags, - void *caller) + const void *caller) { return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END, -1, GFP_KERNEL, caller); } -static struct vm_struct *find_vm_area(const void *addr) +/** + * find_vm_area - find a continuous kernel virtual area + * @addr: base address + * + * Search for the kernel VM area starting at @addr, and return it. + * It is up to the caller to do all required locking to keep the returned + * pointer valid. 
+ */ +struct vm_struct *find_vm_area(const void *addr) { struct vmap_area *va; @@ -1567,9 +1576,9 @@ EXPORT_SYMBOL(vmap); static void *__vmalloc_node(unsigned long size, unsigned long align, gfp_t gfp_mask, pgprot_t prot, - int node, void *caller); + int node, const void *caller); static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, - pgprot_t prot, int node, void *caller) + pgprot_t prot, int node, const void *caller) { const int order = 0; struct page **pages; @@ -1642,7 +1651,7 @@ fail: */ void *__vmalloc_node_range(unsigned long size, unsigned long align, unsigned long start, unsigned long end, gfp_t gfp_mask, - pgprot_t prot, int node, void *caller) + pgprot_t prot, int node, const void *caller) { struct vm_struct *area; void *addr; @@ -1698,7 +1707,7 @@ fail: */ static void *__vmalloc_node(unsigned long size, unsigned long align, gfp_t gfp_mask, pgprot_t prot, - int node, void *caller) + int node, const void *caller) { return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END, gfp_mask, prot, node, caller); @@ -1974,9 +1983,7 @@ static int aligned_vwrite(char *buf, char *addr, unsigned long count) * IOREMAP area is treated as memory hole and no copy is done. * * If [addr...addr+count) doesn't includes any intersects with alive - * vm_struct area, returns 0. - * @buf should be kernel's buffer. Because this function uses KM_USER0, - * the caller should guarantee KM_USER0 is not used. + * vm_struct area, returns 0. @buf should be kernel's buffer. * * Note: In usual ops, vread() is never necessary because the caller * should know vmalloc() area is valid and can use memcpy(). @@ -2050,9 +2057,7 @@ finished: * IOREMAP area is treated as memory hole and no copy is done. * * If [addr...addr+count) doesn't includes any intersects with alive - * vm_struct area, returns 0. - * @buf should be kernel's buffer. Because this function uses KM_USER0, - * the caller should guarantee KM_USER0 is not used. + * vm_struct area, returns 0. 
@buf should be kernel's buffer. * * Note: In usual ops, vwrite() is never necessary because the caller * should know vmalloc() area is valid and can use memcpy(). @@ -2375,8 +2380,8 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, return NULL; } - vms = kzalloc(sizeof(vms[0]) * nr_vms, GFP_KERNEL); - vas = kzalloc(sizeof(vas[0]) * nr_vms, GFP_KERNEL); + vms = kcalloc(nr_vms, sizeof(vms[0]), GFP_KERNEL); + vas = kcalloc(nr_vms, sizeof(vas[0]), GFP_KERNEL); if (!vas || !vms) goto err_free2; diff --git a/mm/vmscan.c b/mm/vmscan.c index 33dc256033b5..347b3ff2a478 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -53,24 +53,6 @@ #define CREATE_TRACE_POINTS #include <trace/events/vmscan.h> -/* - * reclaim_mode determines how the inactive list is shrunk - * RECLAIM_MODE_SINGLE: Reclaim only order-0 pages - * RECLAIM_MODE_ASYNC: Do not block - * RECLAIM_MODE_SYNC: Allow blocking e.g. call wait_on_page_writeback - * RECLAIM_MODE_LUMPYRECLAIM: For high-order allocations, take a reference - * page from the LRU and reclaim all pages within a - * naturally aligned range - * RECLAIM_MODE_COMPACTION: For high-order allocations, reclaim a number of - * order-0 pages and then compact the zone - */ -typedef unsigned __bitwise__ reclaim_mode_t; -#define RECLAIM_MODE_SINGLE ((__force reclaim_mode_t)0x01u) -#define RECLAIM_MODE_ASYNC ((__force reclaim_mode_t)0x02u) -#define RECLAIM_MODE_SYNC ((__force reclaim_mode_t)0x04u) -#define RECLAIM_MODE_LUMPYRECLAIM ((__force reclaim_mode_t)0x08u) -#define RECLAIM_MODE_COMPACTION ((__force reclaim_mode_t)0x10u) - struct scan_control { /* Incremented by the number of inactive pages that were scanned */ unsigned long nr_scanned; @@ -96,11 +78,8 @@ struct scan_control { int order; - /* - * Intend to reclaim enough continuous memory rather than reclaim - * enough amount of memory. i.e, mode for high order allocation. 
- */ - reclaim_mode_t reclaim_mode; + /* Scan (total_size >> priority) pages at once */ + int priority; /* * The memory cgroup that hit its limit and as a result is the @@ -115,11 +94,6 @@ struct scan_control { nodemask_t *nodemask; }; -struct mem_cgroup_zone { - struct mem_cgroup *mem_cgroup; - struct zone *zone; -}; - #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) #ifdef ARCH_HAS_PREFETCH @@ -164,44 +138,21 @@ static bool global_reclaim(struct scan_control *sc) { return !sc->target_mem_cgroup; } - -static bool scanning_global_lru(struct mem_cgroup_zone *mz) -{ - return !mz->mem_cgroup; -} #else static bool global_reclaim(struct scan_control *sc) { return true; } - -static bool scanning_global_lru(struct mem_cgroup_zone *mz) -{ - return true; -} #endif -static struct zone_reclaim_stat *get_reclaim_stat(struct mem_cgroup_zone *mz) +static unsigned long get_lru_size(struct lruvec *lruvec, enum lru_list lru) { - if (!scanning_global_lru(mz)) - return mem_cgroup_get_reclaim_stat(mz->mem_cgroup, mz->zone); + if (!mem_cgroup_disabled()) + return mem_cgroup_get_lru_size(lruvec, lru); - return &mz->zone->reclaim_stat; + return zone_page_state(lruvec_zone(lruvec), NR_LRU_BASE + lru); } -static unsigned long zone_nr_lru_pages(struct mem_cgroup_zone *mz, - enum lru_list lru) -{ - if (!scanning_global_lru(mz)) - return mem_cgroup_zone_nr_lru_pages(mz->mem_cgroup, - zone_to_nid(mz->zone), - zone_idx(mz->zone), - BIT(lru)); - - return zone_page_state(mz->zone, NR_LRU_BASE + lru); -} - - /* * Add a shrinker callback to be called from the vm */ @@ -364,39 +315,6 @@ out: return ret; } -static void set_reclaim_mode(int priority, struct scan_control *sc, - bool sync) -{ - reclaim_mode_t syncmode = sync ? RECLAIM_MODE_SYNC : RECLAIM_MODE_ASYNC; - - /* - * Initially assume we are entering either lumpy reclaim or - * reclaim/compaction.Depending on the order, we will either set the - * sync mode or just reclaim order-0 pages later. 
- */ - if (COMPACTION_BUILD) - sc->reclaim_mode = RECLAIM_MODE_COMPACTION; - else - sc->reclaim_mode = RECLAIM_MODE_LUMPYRECLAIM; - - /* - * Avoid using lumpy reclaim or reclaim/compaction if possible by - * restricting when its set to either costly allocations or when - * under memory pressure - */ - if (sc->order > PAGE_ALLOC_COSTLY_ORDER) - sc->reclaim_mode |= syncmode; - else if (sc->order && priority < DEF_PRIORITY - 2) - sc->reclaim_mode |= syncmode; - else - sc->reclaim_mode = RECLAIM_MODE_SINGLE | RECLAIM_MODE_ASYNC; -} - -static void reset_reclaim_mode(struct scan_control *sc) -{ - sc->reclaim_mode = RECLAIM_MODE_SINGLE | RECLAIM_MODE_ASYNC; -} - static inline int is_page_cache_freeable(struct page *page) { /* @@ -416,10 +334,6 @@ static int may_write_to_queue(struct backing_dev_info *bdi, return 1; if (bdi == current->backing_dev_info) return 1; - - /* lumpy reclaim for hugepage often need a lot of write */ - if (sc->order > PAGE_ALLOC_COSTLY_ORDER) - return 1; return 0; } @@ -523,8 +437,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, /* synchronous write or broken a_ops? */ ClearPageReclaim(page); } - trace_mm_vmscan_writepage(page, - trace_reclaim_flags(page, sc->reclaim_mode)); + trace_mm_vmscan_writepage(page, trace_reclaim_flags(page)); inc_zone_page_state(page, NR_VMSCAN_WRITE); return PAGE_SUCCESS; } @@ -701,19 +614,15 @@ enum page_references { }; static enum page_references page_check_references(struct page *page, - struct mem_cgroup_zone *mz, struct scan_control *sc) { int referenced_ptes, referenced_page; unsigned long vm_flags; - referenced_ptes = page_referenced(page, 1, mz->mem_cgroup, &vm_flags); + referenced_ptes = page_referenced(page, 1, sc->target_mem_cgroup, + &vm_flags); referenced_page = TestClearPageReferenced(page); - /* Lumpy reclaim - ignore references */ - if (sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM) - return PAGEREF_RECLAIM; - /* * Mlock lost the isolation race with us. 
Let try_to_unmap() * move the page to the unevictable list. @@ -722,7 +631,7 @@ static enum page_references page_check_references(struct page *page, return PAGEREF_RECLAIM; if (referenced_ptes) { - if (PageAnon(page)) + if (PageSwapBacked(page)) return PAGEREF_ACTIVATE; /* * All mapped pages start out with page table @@ -763,9 +672,8 @@ static enum page_references page_check_references(struct page *page, * shrink_page_list() returns the number of reclaimed pages */ static unsigned long shrink_page_list(struct list_head *page_list, - struct mem_cgroup_zone *mz, + struct zone *zone, struct scan_control *sc, - int priority, unsigned long *ret_nr_dirty, unsigned long *ret_nr_writeback) { @@ -794,7 +702,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, goto keep; VM_BUG_ON(PageActive(page)); - VM_BUG_ON(page_zone(page) != mz->zone); + VM_BUG_ON(page_zone(page) != zone); sc->nr_scanned++; @@ -813,22 +721,11 @@ static unsigned long shrink_page_list(struct list_head *page_list, if (PageWriteback(page)) { nr_writeback++; - /* - * Synchronous reclaim cannot queue pages for - * writeback due to the possibility of stack overflow - * but if it encounters a page under writeback, wait - * for the IO to complete. - */ - if ((sc->reclaim_mode & RECLAIM_MODE_SYNC) && - may_enter_fs) - wait_on_page_writeback(page); - else { - unlock_page(page); - goto keep_lumpy; - } + unlock_page(page); + goto keep; } - references = page_check_references(page, mz, sc); + references = page_check_references(page, sc); switch (references) { case PAGEREF_ACTIVATE: goto activate_locked; @@ -879,7 +776,8 @@ static unsigned long shrink_page_list(struct list_head *page_list, * unless under significant pressure. */ if (page_is_file_cache(page) && - (!current_is_kswapd() || priority >= DEF_PRIORITY - 2)) { + (!current_is_kswapd() || + sc->priority >= DEF_PRIORITY - 2)) { /* * Immediately reclaim when written back. 
* Similar in principal to deactivate_page() @@ -908,7 +806,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, goto activate_locked; case PAGE_SUCCESS: if (PageWriteback(page)) - goto keep_lumpy; + goto keep; if (PageDirty(page)) goto keep; @@ -994,7 +892,6 @@ cull_mlocked: try_to_free_swap(page); unlock_page(page); putback_lru_page(page); - reset_reclaim_mode(sc); continue; activate_locked: @@ -1007,8 +904,6 @@ activate_locked: keep_locked: unlock_page(page); keep: - reset_reclaim_mode(sc); -keep_lumpy: list_add(&page->lru, &ret_pages); VM_BUG_ON(PageLRU(page) || PageUnevictable(page)); } @@ -1020,7 +915,7 @@ keep_lumpy: * will encounter the same problem */ if (nr_dirty && nr_dirty == nr_congested && global_reclaim(sc)) - zone_set_flag(mz->zone, ZONE_CONGESTED); + zone_set_flag(zone, ZONE_CONGESTED); free_hot_cold_page_list(&free_pages, 1); @@ -1041,34 +936,15 @@ keep_lumpy: * * returns 0 on success, -ve errno on failure. */ -int __isolate_lru_page(struct page *page, isolate_mode_t mode, int file) +int __isolate_lru_page(struct page *page, isolate_mode_t mode) { - bool all_lru_mode; int ret = -EINVAL; /* Only take pages on the LRU. */ if (!PageLRU(page)) return ret; - all_lru_mode = (mode & (ISOLATE_ACTIVE|ISOLATE_INACTIVE)) == - (ISOLATE_ACTIVE|ISOLATE_INACTIVE); - - /* - * When checking the active state, we need to be sure we are - * dealing with comparible boolean values. Take the logical not - * of each. - */ - if (!all_lru_mode && !PageActive(page) != !(mode & ISOLATE_ACTIVE)) - return ret; - - if (!all_lru_mode && !!page_is_file_cache(page) != file) - return ret; - - /* - * When this function is being called for lumpy reclaim, we - * initially look into all LRU pages, active, inactive and - * unevictable; only give shrink_page_list evictable pages. 
- */ + /* Do not give back unevictable pages for compaction */ if (PageUnevictable(page)) return ret; @@ -1135,54 +1011,39 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode, int file) * Appropriate locks must be held before calling this function. * * @nr_to_scan: The number of pages to look through on the list. - * @mz: The mem_cgroup_zone to pull pages from. + * @lruvec: The LRU vector to pull pages from. * @dst: The temp list to put pages on to. * @nr_scanned: The number of pages that were scanned. * @sc: The scan_control struct for this reclaim session * @mode: One of the LRU isolation modes - * @active: True [1] if isolating active pages - * @file: True [1] if isolating file [!anon] pages + * @lru: LRU list id for isolating * * returns how many pages were moved onto *@dst. */ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, - struct mem_cgroup_zone *mz, struct list_head *dst, + struct lruvec *lruvec, struct list_head *dst, unsigned long *nr_scanned, struct scan_control *sc, - isolate_mode_t mode, int active, int file) + isolate_mode_t mode, enum lru_list lru) { - struct lruvec *lruvec; - struct list_head *src; + struct list_head *src = &lruvec->lists[lru]; unsigned long nr_taken = 0; - unsigned long nr_lumpy_taken = 0; - unsigned long nr_lumpy_dirty = 0; - unsigned long nr_lumpy_failed = 0; unsigned long scan; - int lru = LRU_BASE; - - lruvec = mem_cgroup_zone_lruvec(mz->zone, mz->mem_cgroup); - if (active) - lru += LRU_ACTIVE; - if (file) - lru += LRU_FILE; - src = &lruvec->lists[lru]; for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) { struct page *page; - unsigned long pfn; - unsigned long end_pfn; - unsigned long page_pfn; - int zone_id; + int nr_pages; page = lru_to_page(src); prefetchw_prev_lru_page(page, src, flags); VM_BUG_ON(!PageLRU(page)); - switch (__isolate_lru_page(page, mode, file)) { + switch (__isolate_lru_page(page, mode)) { case 0: - mem_cgroup_lru_del(page); + nr_pages = hpage_nr_pages(page); + 
mem_cgroup_update_lru_size(lruvec, lru, -nr_pages); list_move(&page->lru, dst); - nr_taken += hpage_nr_pages(page); + nr_taken += nr_pages; break; case -EBUSY: @@ -1193,93 +1054,11 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, default: BUG(); } - - if (!sc->order || !(sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM)) - continue; - - /* - * Attempt to take all pages in the order aligned region - * surrounding the tag page. Only take those pages of - * the same active state as that tag page. We may safely - * round the target page pfn down to the requested order - * as the mem_map is guaranteed valid out to MAX_ORDER, - * where that page is in a different zone we will detect - * it from its zone id and abort this block scan. - */ - zone_id = page_zone_id(page); - page_pfn = page_to_pfn(page); - pfn = page_pfn & ~((1 << sc->order) - 1); - end_pfn = pfn + (1 << sc->order); - for (; pfn < end_pfn; pfn++) { - struct page *cursor_page; - - /* The target page is in the block, ignore it. */ - if (unlikely(pfn == page_pfn)) - continue; - - /* Avoid holes within the zone. */ - if (unlikely(!pfn_valid_within(pfn))) - break; - - cursor_page = pfn_to_page(pfn); - - /* Check that we have not crossed a zone boundary. */ - if (unlikely(page_zone_id(cursor_page) != zone_id)) - break; - - /* - * If we don't have enough swap space, reclaiming of - * anon page which don't already have a swap slot is - * pointless. - */ - if (nr_swap_pages <= 0 && PageSwapBacked(cursor_page) && - !PageSwapCache(cursor_page)) - break; - - if (__isolate_lru_page(cursor_page, mode, file) == 0) { - unsigned int isolated_pages; - - mem_cgroup_lru_del(cursor_page); - list_move(&cursor_page->lru, dst); - isolated_pages = hpage_nr_pages(cursor_page); - nr_taken += isolated_pages; - nr_lumpy_taken += isolated_pages; - if (PageDirty(cursor_page)) - nr_lumpy_dirty += isolated_pages; - scan++; - pfn += isolated_pages - 1; - } else { - /* - * Check if the page is freed already. 
- * - * We can't use page_count() as that - * requires compound_head and we don't - * have a pin on the page here. If a - * page is tail, we may or may not - * have isolated the head, so assume - * it's not free, it'd be tricky to - * track the head status without a - * page pin. - */ - if (!PageTail(cursor_page) && - !atomic_read(&cursor_page->_count)) - continue; - break; - } - } - - /* If we break out of the loop above, lumpy reclaim failed */ - if (pfn < end_pfn) - nr_lumpy_failed++; } *nr_scanned = scan; - - trace_mm_vmscan_lru_isolate(sc->order, - nr_to_scan, scan, - nr_taken, - nr_lumpy_taken, nr_lumpy_dirty, nr_lumpy_failed, - mode, file); + trace_mm_vmscan_lru_isolate(sc->order, nr_to_scan, scan, + nr_taken, mode, is_file_lru(lru)); return nr_taken; } @@ -1316,15 +1095,16 @@ int isolate_lru_page(struct page *page) if (PageLRU(page)) { struct zone *zone = page_zone(page); + struct lruvec *lruvec; spin_lock_irq(&zone->lru_lock); + lruvec = mem_cgroup_page_lruvec(page, zone); if (PageLRU(page)) { int lru = page_lru(page); - ret = 0; get_page(page); ClearPageLRU(page); - - del_page_from_lru_list(zone, page, lru); + del_page_from_lru_list(page, lruvec, lru); + ret = 0; } spin_unlock_irq(&zone->lru_lock); } @@ -1357,11 +1137,10 @@ static int too_many_isolated(struct zone *zone, int file, } static noinline_for_stack void -putback_inactive_pages(struct mem_cgroup_zone *mz, - struct list_head *page_list) +putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list) { - struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz); - struct zone *zone = mz->zone; + struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; + struct zone *zone = lruvec_zone(lruvec); LIST_HEAD(pages_to_free); /* @@ -1379,9 +1158,13 @@ putback_inactive_pages(struct mem_cgroup_zone *mz, spin_lock_irq(&zone->lru_lock); continue; } + + lruvec = mem_cgroup_page_lruvec(page, zone); + SetPageLRU(page); lru = page_lru(page); - add_page_to_lru_list(zone, page, lru); + 
add_page_to_lru_list(page, lruvec, lru); + if (is_active_lru(lru)) { int file = is_file_lru(lru); int numpages = hpage_nr_pages(page); @@ -1390,7 +1173,7 @@ putback_inactive_pages(struct mem_cgroup_zone *mz, if (put_page_testzero(page)) { __ClearPageLRU(page); __ClearPageActive(page); - del_page_from_lru_list(zone, page, lru); + del_page_from_lru_list(page, lruvec, lru); if (unlikely(PageCompound(page))) { spin_unlock_irq(&zone->lru_lock); @@ -1407,112 +1190,24 @@ putback_inactive_pages(struct mem_cgroup_zone *mz, list_splice(&pages_to_free, page_list); } -static noinline_for_stack void -update_isolated_counts(struct mem_cgroup_zone *mz, - struct list_head *page_list, - unsigned long *nr_anon, - unsigned long *nr_file) -{ - struct zone *zone = mz->zone; - unsigned int count[NR_LRU_LISTS] = { 0, }; - unsigned long nr_active = 0; - struct page *page; - int lru; - - /* - * Count pages and clear active flags - */ - list_for_each_entry(page, page_list, lru) { - int numpages = hpage_nr_pages(page); - lru = page_lru_base_type(page); - if (PageActive(page)) { - lru += LRU_ACTIVE; - ClearPageActive(page); - nr_active += numpages; - } - count[lru] += numpages; - } - - preempt_disable(); - __count_vm_events(PGDEACTIVATE, nr_active); - - __mod_zone_page_state(zone, NR_ACTIVE_FILE, - -count[LRU_ACTIVE_FILE]); - __mod_zone_page_state(zone, NR_INACTIVE_FILE, - -count[LRU_INACTIVE_FILE]); - __mod_zone_page_state(zone, NR_ACTIVE_ANON, - -count[LRU_ACTIVE_ANON]); - __mod_zone_page_state(zone, NR_INACTIVE_ANON, - -count[LRU_INACTIVE_ANON]); - - *nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON]; - *nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE]; - - __mod_zone_page_state(zone, NR_ISOLATED_ANON, *nr_anon); - __mod_zone_page_state(zone, NR_ISOLATED_FILE, *nr_file); - preempt_enable(); -} - -/* - * Returns true if a direct reclaim should wait on pages under writeback. 
- * - * If we are direct reclaiming for contiguous pages and we do not reclaim - * everything in the list, try again and wait for writeback IO to complete. - * This will stall high-order allocations noticeably. Only do that when really - * need to free the pages under high memory pressure. - */ -static inline bool should_reclaim_stall(unsigned long nr_taken, - unsigned long nr_freed, - int priority, - struct scan_control *sc) -{ - int lumpy_stall_priority; - - /* kswapd should not stall on sync IO */ - if (current_is_kswapd()) - return false; - - /* Only stall on lumpy reclaim */ - if (sc->reclaim_mode & RECLAIM_MODE_SINGLE) - return false; - - /* If we have reclaimed everything on the isolated list, no stall */ - if (nr_freed == nr_taken) - return false; - - /* - * For high-order allocations, there are two stall thresholds. - * High-cost allocations stall immediately where as lower - * order allocations such as stacks require the scanning - * priority to be much higher before stalling. - */ - if (sc->order > PAGE_ALLOC_COSTLY_ORDER) - lumpy_stall_priority = DEF_PRIORITY; - else - lumpy_stall_priority = DEF_PRIORITY / 3; - - return priority <= lumpy_stall_priority; -} - /* * shrink_inactive_list() is a helper for shrink_zone(). 
It returns the number * of reclaimed pages */ static noinline_for_stack unsigned long -shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz, - struct scan_control *sc, int priority, int file) +shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, + struct scan_control *sc, enum lru_list lru) { LIST_HEAD(page_list); unsigned long nr_scanned; unsigned long nr_reclaimed = 0; unsigned long nr_taken; - unsigned long nr_anon; - unsigned long nr_file; unsigned long nr_dirty = 0; unsigned long nr_writeback = 0; - isolate_mode_t isolate_mode = ISOLATE_INACTIVE; - struct zone *zone = mz->zone; - struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz); + isolate_mode_t isolate_mode = 0; + int file = is_file_lru(lru); + struct zone *zone = lruvec_zone(lruvec); + struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; while (unlikely(too_many_isolated(zone, file, sc))) { congestion_wait(BLK_RW_ASYNC, HZ/10); @@ -1522,10 +1217,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz, return SWAP_CLUSTER_MAX; } - set_reclaim_mode(priority, sc, false); - if (sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM) - isolate_mode |= ISOLATE_ACTIVE; - lru_add_drain(); if (!sc->may_unmap) @@ -1535,38 +1226,30 @@ shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz, spin_lock_irq(&zone->lru_lock); - nr_taken = isolate_lru_pages(nr_to_scan, mz, &page_list, &nr_scanned, - sc, isolate_mode, 0, file); + nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &page_list, + &nr_scanned, sc, isolate_mode, lru); + + __mod_zone_page_state(zone, NR_LRU_BASE + lru, -nr_taken); + __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken); + if (global_reclaim(sc)) { zone->pages_scanned += nr_scanned; if (current_is_kswapd()) - __count_zone_vm_events(PGSCAN_KSWAPD, zone, - nr_scanned); + __count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scanned); else - __count_zone_vm_events(PGSCAN_DIRECT, zone, - nr_scanned); + 
__count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scanned); } spin_unlock_irq(&zone->lru_lock); if (nr_taken == 0) return 0; - update_isolated_counts(mz, &page_list, &nr_anon, &nr_file); - - nr_reclaimed = shrink_page_list(&page_list, mz, sc, priority, + nr_reclaimed = shrink_page_list(&page_list, zone, sc, &nr_dirty, &nr_writeback); - /* Check if we should syncronously wait for writeback */ - if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) { - set_reclaim_mode(priority, sc, true); - nr_reclaimed += shrink_page_list(&page_list, mz, sc, - priority, &nr_dirty, &nr_writeback); - } - spin_lock_irq(&zone->lru_lock); - reclaim_stat->recent_scanned[0] += nr_anon; - reclaim_stat->recent_scanned[1] += nr_file; + reclaim_stat->recent_scanned[file] += nr_taken; if (global_reclaim(sc)) { if (current_is_kswapd()) @@ -1577,10 +1260,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz, nr_reclaimed); } - putback_inactive_pages(mz, &page_list); + putback_inactive_pages(lruvec, &page_list); - __mod_zone_page_state(zone, NR_ISOLATED_ANON, -nr_anon); - __mod_zone_page_state(zone, NR_ISOLATED_FILE, -nr_file); + __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken); spin_unlock_irq(&zone->lru_lock); @@ -1609,14 +1291,15 @@ shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz, * DEF_PRIORITY-6 For SWAP_CLUSTER_MAX isolated pages, throttle if any * isolated page is PageWriteback */ - if (nr_writeback && nr_writeback >= (nr_taken >> (DEF_PRIORITY-priority))) + if (nr_writeback && nr_writeback >= + (nr_taken >> (DEF_PRIORITY - sc->priority))) wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10); trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id, zone_idx(zone), nr_scanned, nr_reclaimed, - priority, - trace_shrink_flags(file, sc->reclaim_mode)); + sc->priority, + trace_shrink_flags(file)); return nr_reclaimed; } @@ -1638,30 +1321,32 @@ shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz, * But 
we had to alter page->flags anyway. */ -static void move_active_pages_to_lru(struct zone *zone, +static void move_active_pages_to_lru(struct lruvec *lruvec, struct list_head *list, struct list_head *pages_to_free, enum lru_list lru) { + struct zone *zone = lruvec_zone(lruvec); unsigned long pgmoved = 0; struct page *page; + int nr_pages; while (!list_empty(list)) { - struct lruvec *lruvec; - page = lru_to_page(list); + lruvec = mem_cgroup_page_lruvec(page, zone); VM_BUG_ON(PageLRU(page)); SetPageLRU(page); - lruvec = mem_cgroup_lru_add_list(zone, page, lru); + nr_pages = hpage_nr_pages(page); + mem_cgroup_update_lru_size(lruvec, lru, nr_pages); list_move(&page->lru, &lruvec->lists[lru]); - pgmoved += hpage_nr_pages(page); + pgmoved += nr_pages; if (put_page_testzero(page)) { __ClearPageLRU(page); __ClearPageActive(page); - del_page_from_lru_list(zone, page, lru); + del_page_from_lru_list(page, lruvec, lru); if (unlikely(PageCompound(page))) { spin_unlock_irq(&zone->lru_lock); @@ -1677,9 +1362,9 @@ static void move_active_pages_to_lru(struct zone *zone, } static void shrink_active_list(unsigned long nr_to_scan, - struct mem_cgroup_zone *mz, + struct lruvec *lruvec, struct scan_control *sc, - int priority, int file) + enum lru_list lru) { unsigned long nr_taken; unsigned long nr_scanned; @@ -1688,15 +1373,14 @@ static void shrink_active_list(unsigned long nr_to_scan, LIST_HEAD(l_active); LIST_HEAD(l_inactive); struct page *page; - struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz); + struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; unsigned long nr_rotated = 0; - isolate_mode_t isolate_mode = ISOLATE_ACTIVE; - struct zone *zone = mz->zone; + isolate_mode_t isolate_mode = 0; + int file = is_file_lru(lru); + struct zone *zone = lruvec_zone(lruvec); lru_add_drain(); - reset_reclaim_mode(sc); - if (!sc->may_unmap) isolate_mode |= ISOLATE_UNMAPPED; if (!sc->may_writepage) @@ -1704,18 +1388,15 @@ static void shrink_active_list(unsigned long 
nr_to_scan, spin_lock_irq(&zone->lru_lock); - nr_taken = isolate_lru_pages(nr_to_scan, mz, &l_hold, &nr_scanned, sc, - isolate_mode, 1, file); + nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold, + &nr_scanned, sc, isolate_mode, lru); if (global_reclaim(sc)) zone->pages_scanned += nr_scanned; reclaim_stat->recent_scanned[file] += nr_taken; __count_zone_vm_events(PGREFILL, zone, nr_scanned); - if (file) - __mod_zone_page_state(zone, NR_ACTIVE_FILE, -nr_taken); - else - __mod_zone_page_state(zone, NR_ACTIVE_ANON, -nr_taken); + __mod_zone_page_state(zone, NR_LRU_BASE + lru, -nr_taken); __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken); spin_unlock_irq(&zone->lru_lock); @@ -1737,7 +1418,8 @@ static void shrink_active_list(unsigned long nr_to_scan, } } - if (page_referenced(page, 0, mz->mem_cgroup, &vm_flags)) { + if (page_referenced(page, 0, sc->target_mem_cgroup, + &vm_flags)) { nr_rotated += hpage_nr_pages(page); /* * Identify referenced, file-backed active pages and @@ -1770,10 +1452,8 @@ static void shrink_active_list(unsigned long nr_to_scan, */ reclaim_stat->recent_rotated[file] += nr_rotated; - move_active_pages_to_lru(zone, &l_active, &l_hold, - LRU_ACTIVE + file * LRU_FILE); - move_active_pages_to_lru(zone, &l_inactive, &l_hold, - LRU_BASE + file * LRU_FILE); + move_active_pages_to_lru(lruvec, &l_active, &l_hold, lru); + move_active_pages_to_lru(lruvec, &l_inactive, &l_hold, lru - LRU_ACTIVE); __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken); spin_unlock_irq(&zone->lru_lock); @@ -1796,13 +1476,12 @@ static int inactive_anon_is_low_global(struct zone *zone) /** * inactive_anon_is_low - check if anonymous pages need to be deactivated - * @zone: zone to check - * @sc: scan control of this context + * @lruvec: LRU vector to check * * Returns true if the zone does not have enough inactive anon pages, * meaning some active anon pages need to be deactivated. 
*/ -static int inactive_anon_is_low(struct mem_cgroup_zone *mz) +static int inactive_anon_is_low(struct lruvec *lruvec) { /* * If we don't have swap space, anonymous page deactivation @@ -1811,14 +1490,13 @@ static int inactive_anon_is_low(struct mem_cgroup_zone *mz) if (!total_swap_pages) return 0; - if (!scanning_global_lru(mz)) - return mem_cgroup_inactive_anon_is_low(mz->mem_cgroup, - mz->zone); + if (!mem_cgroup_disabled()) + return mem_cgroup_inactive_anon_is_low(lruvec); - return inactive_anon_is_low_global(mz->zone); + return inactive_anon_is_low_global(lruvec_zone(lruvec)); } #else -static inline int inactive_anon_is_low(struct mem_cgroup_zone *mz) +static inline int inactive_anon_is_low(struct lruvec *lruvec) { return 0; } @@ -1836,7 +1514,7 @@ static int inactive_file_is_low_global(struct zone *zone) /** * inactive_file_is_low - check if file pages need to be deactivated - * @mz: memory cgroup and zone to check + * @lruvec: LRU vector to check * * When the system is doing streaming IO, memory pressure here * ensures that active file pages get deactivated, until more @@ -1848,44 +1526,39 @@ static int inactive_file_is_low_global(struct zone *zone) * This uses a different ratio than the anonymous pages, because * the page cache uses a use-once replacement algorithm. 
*/ -static int inactive_file_is_low(struct mem_cgroup_zone *mz) +static int inactive_file_is_low(struct lruvec *lruvec) { - if (!scanning_global_lru(mz)) - return mem_cgroup_inactive_file_is_low(mz->mem_cgroup, - mz->zone); + if (!mem_cgroup_disabled()) + return mem_cgroup_inactive_file_is_low(lruvec); - return inactive_file_is_low_global(mz->zone); + return inactive_file_is_low_global(lruvec_zone(lruvec)); } -static int inactive_list_is_low(struct mem_cgroup_zone *mz, int file) +static int inactive_list_is_low(struct lruvec *lruvec, enum lru_list lru) { - if (file) - return inactive_file_is_low(mz); + if (is_file_lru(lru)) + return inactive_file_is_low(lruvec); else - return inactive_anon_is_low(mz); + return inactive_anon_is_low(lruvec); } static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, - struct mem_cgroup_zone *mz, - struct scan_control *sc, int priority) + struct lruvec *lruvec, struct scan_control *sc) { - int file = is_file_lru(lru); - if (is_active_lru(lru)) { - if (inactive_list_is_low(mz, file)) - shrink_active_list(nr_to_scan, mz, sc, priority, file); + if (inactive_list_is_low(lruvec, lru)) + shrink_active_list(nr_to_scan, lruvec, sc, lru); return 0; } - return shrink_inactive_list(nr_to_scan, mz, sc, priority, file); + return shrink_inactive_list(nr_to_scan, lruvec, sc, lru); } -static int vmscan_swappiness(struct mem_cgroup_zone *mz, - struct scan_control *sc) +static int vmscan_swappiness(struct scan_control *sc) { if (global_reclaim(sc)) return vm_swappiness; - return mem_cgroup_swappiness(mz->mem_cgroup); + return mem_cgroup_swappiness(sc->target_mem_cgroup); } /* @@ -1894,19 +1567,21 @@ static int vmscan_swappiness(struct mem_cgroup_zone *mz, * by looking at the fraction of the pages scanned we did rotate back * onto the active list instead of evict. 
* - * nr[0] = anon pages to scan; nr[1] = file pages to scan + * nr[0] = anon inactive pages to scan; nr[1] = anon active pages to scan + * nr[2] = file inactive pages to scan; nr[3] = file active pages to scan */ -static void get_scan_count(struct mem_cgroup_zone *mz, struct scan_control *sc, - unsigned long *nr, int priority) +static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, + unsigned long *nr) { unsigned long anon, file, free; unsigned long anon_prio, file_prio; unsigned long ap, fp; - struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz); + struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; u64 fraction[2], denominator; enum lru_list lru; int noswap = 0; bool force_scan = false; + struct zone *zone = lruvec_zone(lruvec); /* * If the zone or memcg is small, nr[l] can be 0. This @@ -1918,7 +1593,7 @@ static void get_scan_count(struct mem_cgroup_zone *mz, struct scan_control *sc, * latencies, so it's better to scan a minimum amount there as * well. */ - if (current_is_kswapd() && mz->zone->all_unreclaimable) + if (current_is_kswapd() && zone->all_unreclaimable) force_scan = true; if (!global_reclaim(sc)) force_scan = true; @@ -1932,16 +1607,16 @@ static void get_scan_count(struct mem_cgroup_zone *mz, struct scan_control *sc, goto out; } - anon = zone_nr_lru_pages(mz, LRU_ACTIVE_ANON) + - zone_nr_lru_pages(mz, LRU_INACTIVE_ANON); - file = zone_nr_lru_pages(mz, LRU_ACTIVE_FILE) + - zone_nr_lru_pages(mz, LRU_INACTIVE_FILE); + anon = get_lru_size(lruvec, LRU_ACTIVE_ANON) + + get_lru_size(lruvec, LRU_INACTIVE_ANON); + file = get_lru_size(lruvec, LRU_ACTIVE_FILE) + + get_lru_size(lruvec, LRU_INACTIVE_FILE); if (global_reclaim(sc)) { - free = zone_page_state(mz->zone, NR_FREE_PAGES); + free = zone_page_state(zone, NR_FREE_PAGES); /* If we have very few page cache pages, force-scan anon pages. 
*/ - if (unlikely(file + free <= high_wmark_pages(mz->zone))) { + if (unlikely(file + free <= high_wmark_pages(zone))) { fraction[0] = 1; fraction[1] = 0; denominator = 1; @@ -1953,8 +1628,8 @@ static void get_scan_count(struct mem_cgroup_zone *mz, struct scan_control *sc, * With swappiness at 100, anonymous and file have the same priority. * This scanning priority is essentially the inverse of IO cost. */ - anon_prio = vmscan_swappiness(mz, sc); - file_prio = 200 - vmscan_swappiness(mz, sc); + anon_prio = vmscan_swappiness(sc); + file_prio = 200 - anon_prio; /* * OK, so we have swap space and a fair amount of page cache @@ -1967,7 +1642,7 @@ static void get_scan_count(struct mem_cgroup_zone *mz, struct scan_control *sc, * * anon in [0], file in [1] */ - spin_lock_irq(&mz->zone->lru_lock); + spin_lock_irq(&zone->lru_lock); if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) { reclaim_stat->recent_scanned[0] /= 2; reclaim_stat->recent_rotated[0] /= 2; @@ -1983,12 +1658,12 @@ static void get_scan_count(struct mem_cgroup_zone *mz, struct scan_control *sc, * proportional to the fraction of recently scanned pages on * each list that were recently referenced and in active use. 
*/ - ap = (anon_prio + 1) * (reclaim_stat->recent_scanned[0] + 1); + ap = anon_prio * (reclaim_stat->recent_scanned[0] + 1); ap /= reclaim_stat->recent_rotated[0] + 1; - fp = (file_prio + 1) * (reclaim_stat->recent_scanned[1] + 1); + fp = file_prio * (reclaim_stat->recent_scanned[1] + 1); fp /= reclaim_stat->recent_rotated[1] + 1; - spin_unlock_irq(&mz->zone->lru_lock); + spin_unlock_irq(&zone->lru_lock); fraction[0] = ap; fraction[1] = fp; @@ -1998,9 +1673,9 @@ out: int file = is_file_lru(lru); unsigned long scan; - scan = zone_nr_lru_pages(mz, lru); - if (priority || noswap) { - scan >>= priority; + scan = get_lru_size(lruvec, lru); + if (sc->priority || noswap || !vmscan_swappiness(sc)) { + scan >>= sc->priority; if (!scan && force_scan) scan = SWAP_CLUSTER_MAX; scan = div64_u64(scan * fraction[file], denominator); @@ -2009,14 +1684,25 @@ out: } } +/* Use reclaim/compaction for costly allocs or under memory pressure */ +static bool in_reclaim_compaction(struct scan_control *sc) +{ + if (COMPACTION_BUILD && sc->order && + (sc->order > PAGE_ALLOC_COSTLY_ORDER || + sc->priority < DEF_PRIORITY - 2)) + return true; + + return false; +} + /* - * Reclaim/compaction depends on a number of pages being freed. To avoid - * disruption to the system, a small number of order-0 pages continue to be - * rotated and reclaimed in the normal fashion. However, by the time we get - * back to the allocator and call try_to_compact_zone(), we ensure that - * there are enough free pages for it to be likely successful + * Reclaim/compaction is used for high-order allocation requests. It reclaims + * order-0 pages before compacting the zone. should_continue_reclaim() returns + * true if more pages should be reclaimed such that when the page allocator + * calls try_to_compact_zone() that it will have enough free pages to succeed. + * It will give up earlier than that if there is difficulty reclaiming pages. 
*/ -static inline bool should_continue_reclaim(struct mem_cgroup_zone *mz, +static inline bool should_continue_reclaim(struct lruvec *lruvec, unsigned long nr_reclaimed, unsigned long nr_scanned, struct scan_control *sc) @@ -2025,7 +1711,7 @@ static inline bool should_continue_reclaim(struct mem_cgroup_zone *mz, unsigned long inactive_lru_pages; /* If not in reclaim/compaction mode, stop */ - if (!(sc->reclaim_mode & RECLAIM_MODE_COMPACTION)) + if (!in_reclaim_compaction(sc)) return false; /* Consider stopping depending on scan and reclaim activity */ @@ -2056,15 +1742,15 @@ static inline bool should_continue_reclaim(struct mem_cgroup_zone *mz, * inactive lists are large enough, continue reclaiming */ pages_for_compaction = (2UL << sc->order); - inactive_lru_pages = zone_nr_lru_pages(mz, LRU_INACTIVE_FILE); + inactive_lru_pages = get_lru_size(lruvec, LRU_INACTIVE_FILE); if (nr_swap_pages > 0) - inactive_lru_pages += zone_nr_lru_pages(mz, LRU_INACTIVE_ANON); + inactive_lru_pages += get_lru_size(lruvec, LRU_INACTIVE_ANON); if (sc->nr_reclaimed < pages_for_compaction && inactive_lru_pages > pages_for_compaction) return true; /* If compaction would go ahead or the allocation would succeed, stop */ - switch (compaction_suitable(mz->zone, sc->order)) { + switch (compaction_suitable(lruvec_zone(lruvec), sc->order)) { case COMPACT_PARTIAL: case COMPACT_CONTINUE: return false; @@ -2076,8 +1762,7 @@ static inline bool should_continue_reclaim(struct mem_cgroup_zone *mz, /* * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. 
*/ -static void shrink_mem_cgroup_zone(int priority, struct mem_cgroup_zone *mz, - struct scan_control *sc) +static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) { unsigned long nr[NR_LRU_LISTS]; unsigned long nr_to_scan; @@ -2089,7 +1774,7 @@ static void shrink_mem_cgroup_zone(int priority, struct mem_cgroup_zone *mz, restart: nr_reclaimed = 0; nr_scanned = sc->nr_scanned; - get_scan_count(mz, sc, nr, priority); + get_scan_count(lruvec, sc, nr); blk_start_plug(&plug); while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || @@ -2101,7 +1786,7 @@ restart: nr[lru] -= nr_to_scan; nr_reclaimed += shrink_list(lru, nr_to_scan, - mz, sc, priority); + lruvec, sc); } } /* @@ -2112,7 +1797,8 @@ restart: * with multiple processes reclaiming pages, the total * freeing target can get unreasonably large. */ - if (nr_reclaimed >= nr_to_reclaim && priority < DEF_PRIORITY) + if (nr_reclaimed >= nr_to_reclaim && + sc->priority < DEF_PRIORITY) break; } blk_finish_plug(&plug); @@ -2122,35 +1808,33 @@ restart: * Even if we did not try to evict anon pages at all, we want to * rebalance the anon lru active/inactive ratio. 
*/ - if (inactive_anon_is_low(mz)) - shrink_active_list(SWAP_CLUSTER_MAX, mz, sc, priority, 0); + if (inactive_anon_is_low(lruvec)) + shrink_active_list(SWAP_CLUSTER_MAX, lruvec, + sc, LRU_ACTIVE_ANON); /* reclaim/compaction might need reclaim to continue */ - if (should_continue_reclaim(mz, nr_reclaimed, - sc->nr_scanned - nr_scanned, sc)) + if (should_continue_reclaim(lruvec, nr_reclaimed, + sc->nr_scanned - nr_scanned, sc)) goto restart; throttle_vm_writeout(sc->gfp_mask); } -static void shrink_zone(int priority, struct zone *zone, - struct scan_control *sc) +static void shrink_zone(struct zone *zone, struct scan_control *sc) { struct mem_cgroup *root = sc->target_mem_cgroup; struct mem_cgroup_reclaim_cookie reclaim = { .zone = zone, - .priority = priority, + .priority = sc->priority, }; struct mem_cgroup *memcg; memcg = mem_cgroup_iter(root, NULL, &reclaim); do { - struct mem_cgroup_zone mz = { - .mem_cgroup = memcg, - .zone = zone, - }; + struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg); + + shrink_lruvec(lruvec, sc); - shrink_mem_cgroup_zone(priority, &mz, sc); /* * Limit reclaim has historically picked one memcg and * scanned it with decreasing priority levels until @@ -2226,8 +1910,7 @@ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc) * the caller that it should consider retrying the allocation instead of * further reclaim. 
*/ -static bool shrink_zones(int priority, struct zonelist *zonelist, - struct scan_control *sc) +static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) { struct zoneref *z; struct zone *zone; @@ -2254,7 +1937,8 @@ static bool shrink_zones(int priority, struct zonelist *zonelist, if (global_reclaim(sc)) { if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) continue; - if (zone->all_unreclaimable && priority != DEF_PRIORITY) + if (zone->all_unreclaimable && + sc->priority != DEF_PRIORITY) continue; /* Let kswapd poll it */ if (COMPACTION_BUILD) { /* @@ -2286,7 +1970,7 @@ static bool shrink_zones(int priority, struct zonelist *zonelist, /* need some check for avoid more shrink_zone() */ } - shrink_zone(priority, zone, sc); + shrink_zone(zone, sc); } return aborted_reclaim; @@ -2337,7 +2021,6 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, struct scan_control *sc, struct shrink_control *shrink) { - int priority; unsigned long total_scanned = 0; struct reclaim_state *reclaim_state = current->reclaim_state; struct zoneref *z; @@ -2350,11 +2033,9 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, if (global_reclaim(sc)) count_vm_event(ALLOCSTALL); - for (priority = DEF_PRIORITY; priority >= 0; priority--) { + do { sc->nr_scanned = 0; - if (!priority) - disable_swap_token(sc->target_mem_cgroup); - aborted_reclaim = shrink_zones(priority, zonelist, sc); + aborted_reclaim = shrink_zones(zonelist, sc); /* * Don't shrink slabs when reclaiming memory from @@ -2396,7 +2077,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, /* Take a nap, wait for some writeback to complete */ if (!sc->hibernation_mode && sc->nr_scanned && - priority < DEF_PRIORITY - 2) { + sc->priority < DEF_PRIORITY - 2) { struct zone *preferred_zone; first_zones_zonelist(zonelist, gfp_zone(sc->gfp_mask), @@ -2404,7 +2085,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, &preferred_zone); 
wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/10); } - } + } while (--sc->priority >= 0); out: delayacct_freepages_end(); @@ -2442,6 +2123,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, .may_unmap = 1, .may_swap = 1, .order = order, + .priority = DEF_PRIORITY, .target_mem_cgroup = NULL, .nodemask = nodemask, }; @@ -2474,17 +2156,15 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg, .may_unmap = 1, .may_swap = !noswap, .order = 0, + .priority = 0, .target_mem_cgroup = memcg, }; - struct mem_cgroup_zone mz = { - .mem_cgroup = memcg, - .zone = zone, - }; + struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg); sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); - trace_mm_vmscan_memcg_softlimit_reclaim_begin(0, + trace_mm_vmscan_memcg_softlimit_reclaim_begin(sc.order, sc.may_writepage, sc.gfp_mask); @@ -2495,7 +2175,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg, * will pick up pages from other mem cgroup's as well. We hack * the priority and make it zero. 
*/ - shrink_mem_cgroup_zone(0, &mz, &sc); + shrink_lruvec(lruvec, &sc); trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed); @@ -2516,6 +2196,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, .may_swap = !noswap, .nr_to_reclaim = SWAP_CLUSTER_MAX, .order = 0, + .priority = DEF_PRIORITY, .target_mem_cgroup = memcg, .nodemask = NULL, /* we don't care the placement */ .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | @@ -2546,8 +2227,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, } #endif -static void age_active_anon(struct zone *zone, struct scan_control *sc, - int priority) +static void age_active_anon(struct zone *zone, struct scan_control *sc) { struct mem_cgroup *memcg; @@ -2556,14 +2236,11 @@ static void age_active_anon(struct zone *zone, struct scan_control *sc, memcg = mem_cgroup_iter(NULL, NULL, NULL); do { - struct mem_cgroup_zone mz = { - .mem_cgroup = memcg, - .zone = zone, - }; + struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg); - if (inactive_anon_is_low(&mz)) - shrink_active_list(SWAP_CLUSTER_MAX, &mz, - sc, priority, 0); + if (inactive_anon_is_low(lruvec)) + shrink_active_list(SWAP_CLUSTER_MAX, lruvec, + sc, LRU_ACTIVE_ANON); memcg = mem_cgroup_iter(NULL, memcg, NULL); } while (memcg); @@ -2672,7 +2349,6 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, { int all_zones_ok; unsigned long balanced; - int priority; int i; int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ unsigned long total_scanned; @@ -2696,18 +2372,15 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, }; loop_again: total_scanned = 0; + sc.priority = DEF_PRIORITY; sc.nr_reclaimed = 0; sc.may_writepage = !laptop_mode; count_vm_event(PAGEOUTRUN); - for (priority = DEF_PRIORITY; priority >= 0; priority--) { + do { unsigned long lru_pages = 0; int has_under_min_watermark_zone = 0; - /* The swap token gets in the way of swapout... 
*/ - if (!priority) - disable_swap_token(NULL); - all_zones_ok = 1; balanced = 0; @@ -2721,14 +2394,15 @@ loop_again: if (!populated_zone(zone)) continue; - if (zone->all_unreclaimable && priority != DEF_PRIORITY) + if (zone->all_unreclaimable && + sc.priority != DEF_PRIORITY) continue; /* * Do some background aging of the anon list, to give * pages a chance to be referenced before reclaiming. */ - age_active_anon(zone, &sc, priority); + age_active_anon(zone, &sc); /* * If the number of buffer_heads in the machine @@ -2776,7 +2450,8 @@ loop_again: if (!populated_zone(zone)) continue; - if (zone->all_unreclaimable && priority != DEF_PRIORITY) + if (zone->all_unreclaimable && + sc.priority != DEF_PRIORITY) continue; sc.nr_scanned = 0; @@ -2820,7 +2495,7 @@ loop_again: !zone_watermark_ok_safe(zone, testorder, high_wmark_pages(zone) + balance_gap, end_zone, 0)) { - shrink_zone(priority, zone, &sc); + shrink_zone(zone, &sc); reclaim_state->reclaimed_slab = 0; nr_slab = shrink_slab(&shrink, sc.nr_scanned, lru_pages); @@ -2863,7 +2538,7 @@ loop_again: * consider it to be no longer congested. It's * possible there are dirty pages backed by * congested BDIs but as pressure is relieved, - * spectulatively avoid congestion waits + * speculatively avoid congestion waits */ zone_clear_flag(zone, ZONE_CONGESTED); if (i <= *classzone_idx) @@ -2877,7 +2552,7 @@ loop_again: * OK, kswapd is getting into trouble. Take a nap, then take * another pass across the zones. 
*/ - if (total_scanned && (priority < DEF_PRIORITY - 2)) { + if (total_scanned && (sc.priority < DEF_PRIORITY - 2)) { if (has_under_min_watermark_zone) count_vm_event(KSWAPD_SKIP_CONGESTION_WAIT); else @@ -2892,7 +2567,7 @@ loop_again: */ if (sc.nr_reclaimed >= SWAP_CLUSTER_MAX) break; - } + } while (--sc.priority >= 0); out: /* @@ -2942,7 +2617,8 @@ out: if (!populated_zone(zone)) continue; - if (zone->all_unreclaimable && priority != DEF_PRIORITY) + if (zone->all_unreclaimable && + sc.priority != DEF_PRIORITY) continue; /* Would compaction fail due to lack of free memory? */ @@ -3013,7 +2689,10 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx) * them before going back to sleep. */ set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold); - schedule(); + + if (!kthread_should_stop()) + schedule(); + set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold); } else { if (remaining) @@ -3209,6 +2888,7 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim) .nr_to_reclaim = nr_to_reclaim, .hibernation_mode = 1, .order = 0, + .priority = DEF_PRIORITY, }; struct shrink_control shrink = { .gfp_mask = sc.gfp_mask, @@ -3279,14 +2959,17 @@ int kswapd_run(int nid) } /* - * Called by memory hotplug when all memory in a node is offlined. + * Called by memory hotplug when all memory in a node is offlined. Caller must + * hold lock_memory_hotplug(). 
*/ void kswapd_stop(int nid) { struct task_struct *kswapd = NODE_DATA(nid)->kswapd; - if (kswapd) + if (kswapd) { kthread_stop(kswapd); + NODE_DATA(nid)->kswapd = NULL; + } } static int __init kswapd_init(void) @@ -3386,7 +3069,6 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) const unsigned long nr_pages = 1 << order; struct task_struct *p = current; struct reclaim_state reclaim_state; - int priority; struct scan_control sc = { .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE), .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP), @@ -3395,6 +3077,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) SWAP_CLUSTER_MAX), .gfp_mask = gfp_mask, .order = order, + .priority = ZONE_RECLAIM_PRIORITY, }; struct shrink_control shrink = { .gfp_mask = sc.gfp_mask, @@ -3417,11 +3100,9 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) * Free memory by calling shrink zone with increasing * priorities until we have enough memory freed. 
*/ - priority = ZONE_RECLAIM_PRIORITY; do { - shrink_zone(priority, zone, &sc); - priority--; - } while (priority >= 0 && sc.nr_reclaimed < nr_pages); + shrink_zone(zone, &sc); + } while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0); } nr_slab_pages0 = zone_page_state(zone, NR_SLAB_RECLAIMABLE); @@ -3536,7 +3217,7 @@ int page_evictable(struct page *page, struct vm_area_struct *vma) if (mapping_unevictable(page_mapping(page))) return 0; - if (PageMlocked(page) || (vma && is_mlocked_vma(vma, page))) + if (PageMlocked(page) || (vma && mlocked_vma_newpage(vma, page))) return 0; return 1; @@ -3572,6 +3253,7 @@ void check_move_unevictable_pages(struct page **pages, int nr_pages) zone = pagezone; spin_lock_irq(&zone->lru_lock); } + lruvec = mem_cgroup_page_lruvec(page, zone); if (!PageLRU(page) || !PageUnevictable(page)) continue; @@ -3581,11 +3263,8 @@ void check_move_unevictable_pages(struct page **pages, int nr_pages) VM_BUG_ON(PageActive(page)); ClearPageUnevictable(page); - __dec_zone_state(zone, NR_UNEVICTABLE); - lruvec = mem_cgroup_lru_move_lists(zone, page, - LRU_UNEVICTABLE, lru); - list_move(&page->lru, &lruvec->lists[lru]); - __inc_zone_state(zone, NR_INACTIVE_ANON + lru); + del_page_from_lru_list(page, lruvec, LRU_UNEVICTABLE); + add_page_to_lru_list(page, lruvec, lru); pgrescued++; } } diff --git a/mm/vmstat.c b/mm/vmstat.c index 7db1b9bab492..1bbbbd9776ad 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -613,6 +613,9 @@ static char * const migratetype_names[MIGRATE_TYPES] = { "Reclaimable", "Movable", "Reserve", +#ifdef CONFIG_CMA + "CMA", +#endif "Isolate", }; @@ -1220,7 +1223,6 @@ module_init(setup_vmstat) #if defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION) #include <linux/debugfs.h> -static struct dentry *extfrag_debug_root; /* * Return an index indicating how much of the available free memory is @@ -1358,19 +1360,24 @@ static const struct file_operations extfrag_file_ops = { static int __init extfrag_debug_init(void) { + struct dentry 
*extfrag_debug_root; + extfrag_debug_root = debugfs_create_dir("extfrag", NULL); if (!extfrag_debug_root) return -ENOMEM; if (!debugfs_create_file("unusable_index", 0444, extfrag_debug_root, NULL, &unusable_file_ops)) - return -ENOMEM; + goto fail; if (!debugfs_create_file("extfrag_index", 0444, extfrag_debug_root, NULL, &extfrag_file_ops)) - return -ENOMEM; + goto fail; return 0; +fail: + debugfs_remove_recursive(extfrag_debug_root); + return -ENOMEM; } module_init(extfrag_debug_init); |