diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig | 9 | ||||
-rw-r--r-- | mm/Makefile | 2 | ||||
-rw-r--r-- | mm/filemap.c | 40 | ||||
-rw-r--r-- | mm/filemap.h | 4 | ||||
-rw-r--r-- | mm/filemap_xip.c | 2 | ||||
-rw-r--r-- | mm/highmem.c | 6 | ||||
-rw-r--r-- | mm/memory.c | 8 | ||||
-rw-r--r-- | mm/memory_hotplug.c | 127 | ||||
-rw-r--r-- | mm/mempolicy.c | 12 | ||||
-rw-r--r-- | mm/mmap.c | 2 | ||||
-rw-r--r-- | mm/mmzone.c | 1 | ||||
-rw-r--r-- | mm/nommu.c | 2 | ||||
-rw-r--r-- | mm/page-writeback.c | 97 | ||||
-rw-r--r-- | mm/page_alloc.c | 483 | ||||
-rw-r--r-- | mm/page_io.c | 4 | ||||
-rw-r--r-- | mm/readahead.c | 4 | ||||
-rw-r--r-- | mm/rmap.c | 7 | ||||
-rw-r--r-- | mm/shmem.c | 14 | ||||
-rw-r--r-- | mm/slab.c | 145 | ||||
-rw-r--r-- | mm/slob.c | 1 | ||||
-rw-r--r-- | mm/sparse.c | 3 | ||||
-rw-r--r-- | mm/swap.c | 7 | ||||
-rw-r--r-- | mm/swap_state.c | 6 | ||||
-rw-r--r-- | mm/swapfile.c | 1 | ||||
-rw-r--r-- | mm/tiny-shmem.c | 4 | ||||
-rw-r--r-- | mm/vmscan.c | 107 | ||||
-rw-r--r-- | mm/vmstat.c | 614 |
27 files changed, 979 insertions, 733 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index 66e65ab39426..8f5b45615f7b 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -115,7 +115,8 @@ config SPARSEMEM_EXTREME # eventually, we can have this option just 'select SPARSEMEM' config MEMORY_HOTPLUG bool "Allow for memory hot-add" - depends on SPARSEMEM && HOTPLUG && !SOFTWARE_SUSPEND + depends on SPARSEMEM && HOTPLUG && !SOFTWARE_SUSPEND && ARCH_ENABLE_MEMORY_HOTPLUG + depends on (IA64 || X86 || PPC64) comment "Memory hotplug is currently incompatible with Software Suspend" depends on SPARSEMEM && HOTPLUG && SOFTWARE_SUSPEND @@ -145,3 +146,9 @@ config MIGRATION while the virtual addresses are not changed. This is useful for example on NUMA systems to put pages nearer to the processors accessing the page. + +config RESOURCES_64BIT + bool "64 bit Memory and IO resources (EXPERIMENTAL)" if (!64BIT && EXPERIMENTAL) + default 64BIT + help + This option allows memory and IO resources to be 64 bit. diff --git a/mm/Makefile b/mm/Makefile index 0b8f73f2ed16..9dd824c11eeb 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -10,7 +10,7 @@ mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \ obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ page_alloc.o page-writeback.o pdflush.o \ readahead.o swap.o truncate.o vmscan.o \ - prio_tree.o util.o mmzone.o $(mmu-y) + prio_tree.o util.o mmzone.o vmstat.o $(mmu-y) obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o obj-$(CONFIG_HUGETLBFS) += hugetlb.o diff --git a/mm/filemap.c b/mm/filemap.c index 9c7334bafda8..d087fc3d3281 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -9,7 +9,6 @@ * most "normal" filesystems (but you don't /have/ to use this: * the NFS filesystem used to do this differently, for example) */ -#include <linux/config.h> #include <linux/module.h> #include <linux/slab.h> #include <linux/compiler.h> @@ -120,7 +119,7 @@ void __remove_from_page_cache(struct page *page) radix_tree_delete(&mapping->page_tree, page->index); page->mapping = NULL; mapping->nrpages--; - pagecache_acct(-1); + __dec_zone_page_state(page, NR_FILE_PAGES); } void remove_from_page_cache(struct page *page) @@ -449,7 +448,7 @@ int add_to_page_cache(struct page *page, struct address_space *mapping, page->mapping = mapping; page->index = offset; mapping->nrpages++; - pagecache_acct(1); + __inc_zone_page_state(page, NR_FILE_PAGES); } write_unlock_irq(&mapping->tree_lock); radix_tree_preload_end(); @@ -1416,7 +1415,7 @@ retry_find: */ if (!did_readaround) { majmin = VM_FAULT_MAJOR; - inc_page_state(pgmajfault); + count_vm_event(PGMAJFAULT); } did_readaround = 1; ra_pages = max_sane_readahead(file->f_ra.ra_pages); @@ -1487,7 +1486,7 @@ no_cached_page: page_not_uptodate: if (!did_readaround) { majmin = VM_FAULT_MAJOR; - inc_page_state(pgmajfault); + count_vm_event(PGMAJFAULT); } lock_page(page); @@ -2069,7 +2068,7 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, { struct file *file = iocb->ki_filp; struct address_space * mapping = file->f_mapping; - struct address_space_operations *a_ops = mapping->a_ops; + const struct address_space_operations *a_ops = mapping->a_ops; struct inode *inode = mapping->host; long status = 0; struct page *page; @@ -2095,14 +2094,21 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, do { unsigned long index; unsigned long offset; - unsigned long maxlen; size_t copied; offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */ index = pos >> PAGE_CACHE_SHIFT; bytes = PAGE_CACHE_SIZE - offset; - if (bytes > count) - bytes = count; + + /* Limit the size of the copy to the caller's write size */ + bytes = min(bytes, count); + + /* + * Limit the size of the copy to that of the current segment, + * because fault_in_pages_readable() doesn't know how to walk + * segments. + */ + bytes = min(bytes, cur_iov->iov_len - iov_base); /* * Bring in the user page that we will copy from _first_. @@ -2110,10 +2116,7 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, * same page as we're writing to, without it being marked * up-to-date. */ - maxlen = cur_iov->iov_len - iov_base; - if (maxlen > bytes) - maxlen = bytes; - fault_in_pages_readable(buf, maxlen); + fault_in_pages_readable(buf, bytes); page = __grab_cache_page(mapping,index,&cached_page,&lru_pvec); if (!page) { @@ -2121,6 +2124,12 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, break; } + if (unlikely(bytes == 0)) { + status = 0; + copied = 0; + goto zero_length_segment; + } + status = a_ops->prepare_write(file, page, offset, offset+bytes); if (unlikely(status)) { loff_t isize = i_size_read(inode); @@ -2150,7 +2159,8 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, page_cache_release(page); continue; } - if (likely(copied > 0)) { +zero_length_segment: + if (likely(copied >= 0)) { if (!status) status = copied; @@ -2215,7 +2225,7 @@ __generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, loff_t *ppos) { struct file *file = iocb->ki_filp; - struct address_space * mapping = file->f_mapping; + const struct address_space * mapping = file->f_mapping; size_t ocount; /* original count */ size_t count; /* after file limit checks */ struct inode *inode = mapping->host; diff --git a/mm/filemap.h b/mm/filemap.h index 536979fb4ba7..3f2a343c6015 100644 --- a/mm/filemap.h +++ b/mm/filemap.h @@ -88,7 +88,7 @@ filemap_set_next_iovec(const struct iovec **iovp, size_t *basep, size_t bytes) const struct iovec *iov = *iovp; size_t base = *basep; - while (bytes) { + do { int copy = min(bytes, iov->iov_len - base); bytes -= copy; @@ -97,7 +97,7 @@ filemap_set_next_iovec(const struct iovec **iovp, size_t *basep, size_t bytes) iov++; base = 0; } - } + } while (bytes); *iovp = iov; *basep = base; } diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c index b960ac8e5918..b4fd0d7c9bfb 100644 --- a/mm/filemap_xip.c +++ b/mm/filemap_xip.c @@ -273,7 +273,7 @@ __xip_file_write(struct file *filp, const char __user *buf, size_t count, loff_t pos, loff_t *ppos) { struct address_space * mapping = filp->f_mapping; - struct address_space_operations *a_ops = mapping->a_ops; + const struct address_space_operations *a_ops = mapping->a_ops; struct inode *inode = mapping->host; long status = 0; struct page *page; diff --git a/mm/highmem.c b/mm/highmem.c index 9b274fdf9d08..9b2a5403c447 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -315,8 +315,8 @@ static void bounce_end_io(struct bio *bio, mempool_t *pool, int err) if (bvec->bv_page == org_vec->bv_page) continue; - mempool_free(bvec->bv_page, pool); - dec_page_state(nr_bounce); + dec_zone_page_state(bvec->bv_page, NR_BOUNCE); + mempool_free(bvec->bv_page, pool); } bio_endio(bio_orig, bio_orig->bi_size, err); @@ -397,7 +397,7 @@ static void __blk_queue_bounce(request_queue_t *q, struct bio **bio_orig, to->bv_page = mempool_alloc(pool, q->bounce_gfp); to->bv_len = from->bv_len; to->bv_offset = from->bv_offset; - inc_page_state(nr_bounce); + inc_zone_page_state(to->bv_page, NR_BOUNCE); if (rw == WRITE) { char *vto, *vfrom; diff --git a/mm/memory.c b/mm/memory.c index 247b5c312b9b..7e2a4b1580e3 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -126,7 +126,7 @@ static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd) pmd_clear(pmd); pte_lock_deinit(page); pte_free_tlb(tlb, page); - dec_page_state(nr_page_table_pages); + dec_zone_page_state(page, NR_PAGETABLE); tlb->mm->nr_ptes--; } @@ -311,7 +311,7 @@ int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address) pte_free(new); } else { mm->nr_ptes++; - inc_page_state(nr_page_table_pages); + inc_zone_page_state(new, NR_PAGETABLE); pmd_populate(mm, pmd, new); } spin_unlock(&mm->page_table_lock); @@ -1951,7 +1951,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, /* Had to read the page from swap area: Major fault */ ret = VM_FAULT_MAJOR; - inc_page_state(pgmajfault); + count_vm_event(PGMAJFAULT); grab_swap_token(); } @@ -2324,7 +2324,7 @@ int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, __set_current_state(TASK_RUNNING); - inc_page_state(pgfault); + count_vm_event(PGFAULT); if (unlikely(is_vm_hugetlb_page(vma))) return hugetlb_fault(mm, vma, address, write_access); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 841a077d5aeb..01c9fb97c619 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -4,7 +4,6 @@ * Copyright (C) */ -#include <linux/config.h> #include <linux/stddef.h> #include <linux/mm.h> #include <linux/swap.h> @@ -21,6 +20,7 @@ #include <linux/memory_hotplug.h> #include <linux/highmem.h> #include <linux/vmalloc.h> +#include <linux/ioport.h> #include <asm/tlbflush.h> @@ -126,6 +126,9 @@ int online_pages(unsigned long pfn, unsigned long nr_pages) unsigned long i; unsigned long flags; unsigned long onlined_pages = 0; + struct resource res; + u64 section_end; + unsigned long start_pfn; struct zone *zone; int need_zonelists_rebuild = 0; @@ -148,10 +151,27 @@ int online_pages(unsigned long pfn, unsigned long nr_pages) if (!populated_zone(zone)) need_zonelists_rebuild = 1; - for (i = 0; i < nr_pages; i++) { - struct page *page = pfn_to_page(pfn + i); - online_page(page); - onlined_pages++; + res.start = (u64)pfn << PAGE_SHIFT; + res.end = res.start + ((u64)nr_pages << PAGE_SHIFT) - 1; + res.flags = IORESOURCE_MEM; /* we just need system ram */ + section_end = res.end; + + while (find_next_system_ram(&res) >= 0) { + start_pfn = (unsigned long)(res.start >> PAGE_SHIFT); + nr_pages = (unsigned long) + ((res.end + 1 - res.start) >> PAGE_SHIFT); + + if (PageReserved(pfn_to_page(start_pfn))) { + /* this region's page is not onlined now */ + for (i = 0; i < nr_pages; i++) { + struct page *page = pfn_to_page(start_pfn + i); + online_page(page); + onlined_pages++; + } + } + + res.start = res.end + 1; + res.end = section_end; } zone->present_pages += onlined_pages; zone->zone_pgdat->node_present_pages += onlined_pages; @@ -163,3 +183,100 @@ int online_pages(unsigned long pfn, unsigned long nr_pages) vm_total_pages = nr_free_pagecache_pages(); return 0; } + +static pg_data_t *hotadd_new_pgdat(int nid, u64 start) +{ + struct pglist_data *pgdat; + unsigned long zones_size[MAX_NR_ZONES] = {0}; + unsigned long zholes_size[MAX_NR_ZONES] = {0}; + unsigned long start_pfn = start >> PAGE_SHIFT; + + pgdat = arch_alloc_nodedata(nid); + if (!pgdat) + return NULL; + + arch_refresh_nodedata(nid, pgdat); + + /* we can use NODE_DATA(nid) from here */ + + /* init node's zones as empty zones, we don't have any present pages.*/ + free_area_init_node(nid, pgdat, zones_size, start_pfn, zholes_size); + + return pgdat; +} + +static void rollback_node_hotadd(int nid, pg_data_t *pgdat) +{ + arch_refresh_nodedata(nid, NULL); + arch_free_nodedata(pgdat); + return; +} + +/* add this memory to iomem resource */ +static void register_memory_resource(u64 start, u64 size) +{ + struct resource *res; + + res = kzalloc(sizeof(struct resource), GFP_KERNEL); + BUG_ON(!res); + + res->name = "System RAM"; + res->start = start; + res->end = start + size - 1; + res->flags = IORESOURCE_MEM; + if (request_resource(&iomem_resource, res) < 0) { + printk("System RAM resource %llx - %llx cannot be added\n", + (unsigned long long)res->start, (unsigned long long)res->end); + kfree(res); + } +} + + + +int add_memory(int nid, u64 start, u64 size) +{ + pg_data_t *pgdat = NULL; + int new_pgdat = 0; + int ret; + + if (!node_online(nid)) { + pgdat = hotadd_new_pgdat(nid, start); + if (!pgdat) + return -ENOMEM; + new_pgdat = 1; + ret = kswapd_run(nid); + if (ret) + goto error; + } + + /* call arch's memory hotadd */ + ret = arch_add_memory(nid, start, size); + + if (ret < 0) + goto error; + + /* we online node here. we can't roll back from here. */ + node_set_online(nid); + + if (new_pgdat) { + ret = register_one_node(nid); + /* + * If sysfs file of new node can't create, cpu on the node + * can't be hot-added. There is no rollback way now. + * So, check by BUG_ON() to catch it reluctantly.. + */ + BUG_ON(ret); + } + + /* register this memory as resource */ + register_memory_resource(start, size); + + return ret; +error: + /* rollback pgdat allocation and others */ + if (new_pgdat) + rollback_node_hotadd(nid, pgdat); + + return ret; +} +EXPORT_SYMBOL_GPL(add_memory); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 73e0f23b7f51..e07e27e846a2 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1209,10 +1209,8 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, zl = NODE_DATA(nid)->node_zonelists + gfp_zone(gfp); page = __alloc_pages(gfp, order, zl); - if (page && page_zone(page) == zl->zones[0]) { - zone_pcp(zl->zones[0],get_cpu())->interleave_hit++; - put_cpu(); - } + if (page && page_zone(page) == zl->zones[0]) + inc_zone_page_state(page, NUMA_INTERLEAVE_HIT); return page; } @@ -1821,7 +1819,7 @@ static inline void check_huge_range(struct vm_area_struct *vma, int show_numa_map(struct seq_file *m, void *v) { - struct task_struct *task = m->private; + struct proc_maps_private *priv = m->private; struct vm_area_struct *vma = v; struct numa_maps *md; struct file *file = vma->vm_file; @@ -1837,7 +1835,7 @@ int show_numa_map(struct seq_file *m, void *v) return 0; mpol_to_str(buffer, sizeof(buffer), - get_vma_policy(task, vma, vma->vm_start)); + get_vma_policy(priv->task, vma, vma->vm_start)); seq_printf(m, "%08lx %s", vma->vm_start, buffer); @@ -1891,7 +1889,7 @@ out: kfree(md); if (m->count < m->size) - m->version = (vma != get_gate_vma(task)) ? vma->vm_start : 0; + m->version = (vma != priv->tail_vma) ? vma->vm_start : 0; return 0; } diff --git a/mm/mmap.c b/mm/mmap.c index 6446c6134b04..c1868ecdbc5f 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -96,7 +96,7 @@ int __vm_enough_memory(long pages, int cap_sys_admin) if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) { unsigned long n; - free = get_page_cache_size(); + free = global_page_state(NR_FILE_PAGES); free += nr_swap_pages; /* diff --git a/mm/mmzone.c b/mm/mmzone.c index b022370e612e..0959ee1a4795 100644 --- a/mm/mmzone.c +++ b/mm/mmzone.c @@ -5,7 +5,6 @@ */ -#include <linux/config.h> #include <linux/stddef.h> #include <linux/mmzone.h> #include <linux/module.h> diff --git a/mm/nommu.c b/mm/nommu.c index 029fadac0fb5..5151c44a8257 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1122,7 +1122,7 @@ int __vm_enough_memory(long pages, int cap_sys_admin) if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) { unsigned long n; - free = get_page_cache_size(); + free = global_page_state(NR_FILE_PAGES); free += nr_swap_pages; /* diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 8ccf6f1b1473..e630188ccc40 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -99,22 +99,6 @@ EXPORT_SYMBOL(laptop_mode); static void background_writeout(unsigned long _min_pages); -struct writeback_state -{ - unsigned long nr_dirty; - unsigned long nr_unstable; - unsigned long nr_mapped; - unsigned long nr_writeback; -}; - -static void get_writeback_state(struct writeback_state *wbs) -{ - wbs->nr_dirty = read_page_state(nr_dirty); - wbs->nr_unstable = read_page_state(nr_unstable); - wbs->nr_mapped = read_page_state(nr_mapped); - wbs->nr_writeback = read_page_state(nr_writeback); -} - /* * Work out the current dirty-memory clamping and background writeout * thresholds. @@ -133,8 +117,8 @@ static void get_writeback_state(struct writeback_state *wbs) * clamping level. */ static void -get_dirty_limits(struct writeback_state *wbs, long *pbackground, long *pdirty, - struct address_space *mapping) +get_dirty_limits(long *pbackground, long *pdirty, + struct address_space *mapping) { int background_ratio; /* Percentages */ int dirty_ratio; @@ -144,8 +128,6 @@ get_dirty_limits(struct writeback_state *wbs, long *pbackground, long *pdirty, unsigned long available_memory = total_pages; struct task_struct *tsk; - get_writeback_state(wbs); - #ifdef CONFIG_HIGHMEM /* * If this mapping can only allocate from low memory, @@ -156,7 +138,9 @@ get_dirty_limits(struct writeback_state *wbs, long *pbackground, long *pdirty, #endif - unmapped_ratio = 100 - (wbs->nr_mapped * 100) / total_pages; + unmapped_ratio = 100 - ((global_page_state(NR_FILE_MAPPED) + + global_page_state(NR_ANON_PAGES)) * 100) / + total_pages; dirty_ratio = vm_dirty_ratio; if (dirty_ratio > unmapped_ratio / 2) @@ -189,7 +173,6 @@ get_dirty_limits(struct writeback_state *wbs, long *pbackground, long *pdirty, */ static void balance_dirty_pages(struct address_space *mapping) { - struct writeback_state wbs; long nr_reclaimable; long background_thresh; long dirty_thresh; @@ -207,11 +190,12 @@ static void balance_dirty_pages(struct address_space *mapping) .range_cyclic = 1, }; - get_dirty_limits(&wbs, &background_thresh, - &dirty_thresh, mapping); - nr_reclaimable = wbs.nr_dirty + wbs.nr_unstable; - if (nr_reclaimable + wbs.nr_writeback <= dirty_thresh) - break; + get_dirty_limits(&background_thresh, &dirty_thresh, mapping); + nr_reclaimable = global_page_state(NR_FILE_DIRTY) + + global_page_state(NR_UNSTABLE_NFS); + if (nr_reclaimable + global_page_state(NR_WRITEBACK) <= + dirty_thresh) + break; if (!dirty_exceeded) dirty_exceeded = 1; @@ -224,11 +208,14 @@ static void balance_dirty_pages(struct address_space *mapping) */ if (nr_reclaimable) { writeback_inodes(&wbc); - get_dirty_limits(&wbs, &background_thresh, - &dirty_thresh, mapping); - nr_reclaimable = wbs.nr_dirty + wbs.nr_unstable; - if (nr_reclaimable + wbs.nr_writeback <= dirty_thresh) - break; + get_dirty_limits(&background_thresh, + &dirty_thresh, mapping); + nr_reclaimable = global_page_state(NR_FILE_DIRTY) + + global_page_state(NR_UNSTABLE_NFS); + if (nr_reclaimable + + global_page_state(NR_WRITEBACK) + <= dirty_thresh) + break; pages_written += write_chunk - wbc.nr_to_write; if (pages_written >= write_chunk) break; /* We've done our duty */ @@ -236,8 +223,9 @@ static void balance_dirty_pages(struct address_space *mapping) blk_congestion_wait(WRITE, HZ/10); } - if (nr_reclaimable + wbs.nr_writeback <= dirty_thresh && dirty_exceeded) - dirty_exceeded = 0; + if (nr_reclaimable + global_page_state(NR_WRITEBACK) + <= dirty_thresh && dirty_exceeded) + dirty_exceeded = 0; if (writeback_in_progress(bdi)) return; /* pdflush is already working this queue */ @@ -299,12 +287,11 @@ EXPORT_SYMBOL(balance_dirty_pages_ratelimited_nr); void throttle_vm_writeout(void) { - struct writeback_state wbs; long background_thresh; long dirty_thresh; for ( ; ; ) { - get_dirty_limits(&wbs, &background_thresh, &dirty_thresh, NULL); + get_dirty_limits(&background_thresh, &dirty_thresh, NULL); /* * Boost the allowable dirty threshold a bit for page @@ -312,8 +299,9 @@ void throttle_vm_writeout(void) */ dirty_thresh += dirty_thresh / 10; /* wheeee... */ - if (wbs.nr_unstable + wbs.nr_writeback <= dirty_thresh) - break; + if (global_page_state(NR_UNSTABLE_NFS) + + global_page_state(NR_WRITEBACK) <= dirty_thresh) + break; blk_congestion_wait(WRITE, HZ/10); } } @@ -336,12 +324,12 @@ static void background_writeout(unsigned long _min_pages) }; for ( ; ; ) { - struct writeback_state wbs; long background_thresh; long dirty_thresh; - get_dirty_limits(&wbs, &background_thresh, &dirty_thresh, NULL); - if (wbs.nr_dirty + wbs.nr_unstable < background_thresh + get_dirty_limits(&background_thresh, &dirty_thresh, NULL); + if (global_page_state(NR_FILE_DIRTY) + + global_page_state(NR_UNSTABLE_NFS) < background_thresh && min_pages <= 0) break; wbc.encountered_congestion = 0; @@ -365,12 +353,9 @@ static void background_writeout(unsigned long _min_pages) */ int wakeup_pdflush(long nr_pages) { - if (nr_pages == 0) { - struct writeback_state wbs; - - get_writeback_state(&wbs); - nr_pages = wbs.nr_dirty + wbs.nr_unstable; - } + if (nr_pages == 0) + nr_pages = global_page_state(NR_FILE_DIRTY) + + global_page_state(NR_UNSTABLE_NFS); return pdflush_operation(background_writeout, nr_pages); } @@ -401,7 +386,6 @@ static void wb_kupdate(unsigned long arg) unsigned long start_jif; unsigned long next_jif; long nr_to_write; - struct writeback_state wbs; struct writeback_control wbc = { .bdi = NULL, .sync_mode = WB_SYNC_NONE, @@ -414,11 +398,11 @@ static void wb_kupdate(unsigned long arg) sync_supers(); - get_writeback_state(&wbs); oldest_jif = jiffies - dirty_expire_interval; start_jif = jiffies; next_jif = start_jif + dirty_writeback_interval; - nr_to_write = wbs.nr_dirty + wbs.nr_unstable + + nr_to_write = global_page_state(NR_FILE_DIRTY) + + global_page_state(NR_UNSTABLE_NFS) + (inodes_stat.nr_inodes - inodes_stat.nr_unused); while (nr_to_write > 0) { wbc.encountered_congestion = 0; @@ -516,14 +500,14 @@ static void set_ratelimit(void) ratelimit_pages = (4096 * 1024) / PAGE_CACHE_SIZE; } -static int +static int __cpuinit ratelimit_handler(struct notifier_block *self, unsigned long u, void *v) { set_ratelimit(); return 0; } -static struct notifier_block ratelimit_nb = { +static struct notifier_block __cpuinitdata ratelimit_nb = { .notifier_call = ratelimit_handler, .next = NULL, }; @@ -640,7 +624,8 @@ int __set_page_dirty_nobuffers(struct page *page) if (mapping2) { /* Race with truncate? */ BUG_ON(mapping2 != mapping); if (mapping_cap_account_dirty(mapping)) - inc_page_state(nr_dirty); + __inc_zone_page_state(page, + NR_FILE_DIRTY); radix_tree_tag_set(&mapping->page_tree, page_index(page), PAGECACHE_TAG_DIRTY); } @@ -727,9 +712,9 @@ int test_clear_page_dirty(struct page *page) radix_tree_tag_clear(&mapping->page_tree, page_index(page), PAGECACHE_TAG_DIRTY); - write_unlock_irqrestore(&mapping->tree_lock, flags); if (mapping_cap_account_dirty(mapping)) - dec_page_state(nr_dirty); + __dec_zone_page_state(page, NR_FILE_DIRTY); + write_unlock_irqrestore(&mapping->tree_lock, flags); return 1; } write_unlock_irqrestore(&mapping->tree_lock, flags); @@ -760,7 +745,7 @@ int clear_page_dirty_for_io(struct page *page) if (mapping) { if (TestClearPageDirty(page)) { if (mapping_cap_account_dirty(mapping)) - dec_page_state(nr_dirty); + dec_zone_page_state(page, NR_FILE_DIRTY); return 1; } return 0; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 6c1174fcf52c..3e792a583f3b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -14,7 +14,6 @@ * (lots of bits borrowed from Ingo Molnar & Andrew Morton) */ -#include <linux/config.h> #include <linux/stddef.h> #include <linux/mm.h> #include <linux/swap.h> @@ -266,7 +265,7 @@ static inline void rmv_page_order(struct page *page) * satisfies the following equation: * P = B & ~(1 << O) * - * Assumption: *_mem_map is contigious at least up to MAX_ORDER + * Assumption: *_mem_map is contiguous at least up to MAX_ORDER */ static inline struct page * __page_find_buddy(struct page *page, unsigned long page_idx, unsigned int order) @@ -446,8 +445,8 @@ static void __free_pages_ok(struct page *page, unsigned int order) arch_free_page(page, order); if (!PageHighMem(page)) - mutex_debug_check_no_locks_freed(page_address(page), - PAGE_SIZE<<order); + debug_check_no_locks_freed(page_address(page), + PAGE_SIZE<<order); for (i = 0 ; i < (1 << order) ; ++i) reserved += free_pages_check(page + i); @@ -456,7 +455,7 @@ static void __free_pages_ok(struct page *page, unsigned int order) kernel_map_pages(page, 1 << order, 0); local_irq_save(flags); - __mod_page_state(pgfree, 1 << order); + __count_vm_events(PGFREE, 1 << order); free_one_page(page_zone(page), page, order); local_irq_restore(flags); } @@ -709,27 +708,6 @@ void drain_local_pages(void) } #endif /* CONFIG_PM */ -static void zone_statistics(struct zonelist *zonelist, struct zone *z, int cpu) -{ -#ifdef CONFIG_NUMA - pg_data_t *pg = z->zone_pgdat; - pg_data_t *orig = zonelist->zones[0]->zone_pgdat; - struct per_cpu_pageset *p; - - p = zone_pcp(z, cpu); - if (pg == orig) { - p->numa_hit++; - } else { - p->numa_miss++; - zone_pcp(zonelist->zones[0], cpu)->numa_foreign++; - } - if (pg == NODE_DATA(numa_node_id())) - p->local_node++; - else - p->other_node++; -#endif -} - /* * Free a 0-order page */ @@ -750,7 +728,7 @@ static void fastcall free_hot_cold_page(struct page *page, int cold) pcp = &zone_pcp(zone, get_cpu())->pcp[cold]; local_irq_save(flags); - __inc_page_state(pgfree); + __count_vm_event(PGFREE); list_add(&page->lru, &pcp->list); pcp->count++; if (pcp->count >= pcp->high) { @@ -826,8 +804,8 @@ again: goto failed; } - __mod_page_state_zone(zone, pgalloc, 1 << order); - zone_statistics(zonelist, zone, cpu); + __count_zone_vm_events(PGALLOC, zone, 1 << order); + zone_statistics(zonelist, zone); local_irq_restore(flags); put_cpu(); @@ -1231,141 +1209,6 @@ static void show_node(struct zone *zone) #define show_node(zone) do { } while (0) #endif -/* - * Accumulate the page_state information across all CPUs. - * The result is unavoidably approximate - it can change - * during and after execution of this function. - */ -static DEFINE_PER_CPU(struct page_state, page_states) = {0}; - -atomic_t nr_pagecache = ATOMIC_INIT(0); -EXPORT_SYMBOL(nr_pagecache); -#ifdef CONFIG_SMP -DEFINE_PER_CPU(long, nr_pagecache_local) = 0; -#endif - -static void __get_page_state(struct page_state *ret, int nr, cpumask_t *cpumask) -{ - unsigned cpu; - - memset(ret, 0, nr * sizeof(unsigned long)); - cpus_and(*cpumask, *cpumask, cpu_online_map); - - for_each_cpu_mask(cpu, *cpumask) { - unsigned long *in; - unsigned long *out; - unsigned off; - unsigned next_cpu; - - in = (unsigned long *)&per_cpu(page_states, cpu); - - next_cpu = next_cpu(cpu, *cpumask); - if (likely(next_cpu < NR_CPUS)) - prefetch(&per_cpu(page_states, next_cpu)); - - out = (unsigned long *)ret; - for (off = 0; off < nr; off++) - *out++ += *in++; - } -} - -void get_page_state_node(struct page_state *ret, int node) -{ - int nr; - cpumask_t mask = node_to_cpumask(node); - - nr = offsetof(struct page_state, GET_PAGE_STATE_LAST); - nr /= sizeof(unsigned long); - - __get_page_state(ret, nr+1, &mask); -} - -void get_page_state(struct page_state *ret) -{ - int nr; - cpumask_t mask = CPU_MASK_ALL; - - nr = offsetof(struct page_state, GET_PAGE_STATE_LAST); - nr /= sizeof(unsigned long); - - __get_page_state(ret, nr + 1, &mask); -} - -void get_full_page_state(struct page_state *ret) -{ - cpumask_t mask = CPU_MASK_ALL; - - __get_page_state(ret, sizeof(*ret) / sizeof(unsigned long), &mask); -} - -unsigned long read_page_state_offset(unsigned long offset) -{ - unsigned long ret = 0; - int cpu; - - for_each_online_cpu(cpu) { - unsigned long in; - - in = (unsigned long)&per_cpu(page_states, cpu) + offset; - ret += *((unsigned long *)in); - } - return ret; -} - -void __mod_page_state_offset(unsigned long offset, unsigned long delta) -{ - void *ptr; - - ptr = &__get_cpu_var(page_states); - *(unsigned long *)(ptr + offset) += delta; -} -EXPORT_SYMBOL(__mod_page_state_offset); - -void mod_page_state_offset(unsigned long offset, unsigned long delta) -{ - unsigned long flags; - void *ptr; - - local_irq_save(flags); - ptr = &__get_cpu_var(page_states); - *(unsigned long *)(ptr + offset) += delta; - local_irq_restore(flags); -} -EXPORT_SYMBOL(mod_page_state_offset); - -void __get_zone_counts(unsigned long *active, unsigned long *inactive, - unsigned long *free, struct pglist_data *pgdat) -{ - struct zone *zones = pgdat->node_zones; - int i; - - *active = 0; - *inactive = 0; - *free = 0; - for (i = 0; i < MAX_NR_ZONES; i++) { - *active += zones[i].nr_active; - *inactive += zones[i].nr_inactive; - *free += zones[i].free_pages; - } -} - -void get_zone_counts(unsigned long *active, - unsigned long *inactive, unsigned long *free) -{ - struct pglist_data *pgdat; - - *active = 0; - *inactive = 0; - *free = 0; - for_each_online_pgdat(pgdat) { - unsigned long l, m, n; - __get_zone_counts(&l, &m, &n, pgdat); - *active += l; - *inactive += m; - *free += n; - } -} - void si_meminfo(struct sysinfo *val) { val->totalram = totalram_pages; @@ -1406,7 +1249,6 @@ void si_meminfo_node(struct sysinfo *val, int nid) */ void show_free_areas(void) { - struct page_state ps; int cpu, temperature; unsigned long active; unsigned long inactive; @@ -1438,7 +1280,6 @@ void show_free_areas(void) } } - get_page_state(&ps); get_zone_counts(&active, &inactive, &free); printk("Free pages: %11ukB (%ukB HighMem)\n", @@ -1449,13 +1290,13 @@ void show_free_areas(void) "unstable:%lu free:%u slab:%lu mapped:%lu pagetables:%lu\n", active, inactive, - ps.nr_dirty, - ps.nr_writeback, - ps.nr_unstable, + global_page_state(NR_FILE_DIRTY), + global_page_state(NR_WRITEBACK), + global_page_state(NR_UNSTABLE_NFS), nr_free_pages(), - ps.nr_slab, - ps.nr_mapped, - ps.nr_page_table_pages); + global_page_state(NR_SLAB), + global_page_state(NR_FILE_MAPPED), + global_page_state(NR_PAGETABLE)); for_each_zone(zone) { int i; @@ -2009,7 +1850,7 @@ static inline void free_zone_pagesets(int cpu) } } -static int pageset_cpuup_callback(struct notifier_block *nfb, +static int __cpuinit pageset_cpuup_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { @@ -2031,7 +1872,7 @@ static int pageset_cpuup_callback(struct notifier_block *nfb, return ret; } -static struct notifier_block pageset_notifier = +static struct notifier_block __cpuinitdata pageset_notifier = { &pageset_cpuup_callback, NULL, 0 }; void __init setup_per_cpu_pageset(void) @@ -2180,6 +2021,7 @@ static void __meminit free_area_init_core(struct pglist_data *pgdat, zone->nr_scan_inactive = 0; zone->nr_active = 0; zone->nr_inactive = 0; + zap_zone_vm_stats(zone); atomic_set(&zone->reclaim_in_progress, 0); if (!size) continue; @@ -2253,307 +2095,18 @@ void __init free_area_init(unsigned long *zones_size) __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL); } -#ifdef CONFIG_PROC_FS - -#include <linux/seq_file.h> - -static void *frag_start(struct seq_file *m, loff_t *pos) -{ - pg_data_t *pgdat; - loff_t node = *pos; - for (pgdat = first_online_pgdat(); - pgdat && node; - pgdat = next_online_pgdat(pgdat)) - --node; - - return pgdat; -} - -static void *frag_next(struct seq_file *m, void *arg, loff_t *pos) -{ - pg_data_t *pgdat = (pg_data_t *)arg; - - (*pos)++; - return next_online_pgdat(pgdat); -} - -static void frag_stop(struct seq_file *m, void *arg) -{ -} - -/* - * This walks the free areas for each zone. - */ -static int frag_show(struct seq_file *m, void *arg) -{ - pg_data_t *pgdat = (pg_data_t *)arg; - struct zone *zone; - struct zone *node_zones = pgdat->node_zones; - unsigned long flags; - int order; - - for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) { - if (!populated_zone(zone)) - continue; - - spin_lock_irqsave(&zone->lock, flags); - seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name); - for (order = 0; order < MAX_ORDER; ++order) - seq_printf(m, "%6lu ", zone->free_area[order].nr_free); - spin_unlock_irqrestore(&zone->lock, flags); - seq_putc(m, '\n'); - } - return 0; -} - -struct seq_operations fragmentation_op = { - .start = frag_start, - .next = frag_next, - .stop = frag_stop, - .show = frag_show, -}; - -/* - * Output information about zones in @pgdat. - */ -static int zoneinfo_show(struct seq_file *m, void *arg) -{ - pg_data_t *pgdat = arg; - struct zone *zone; - struct zone *node_zones = pgdat->node_zones; - unsigned long flags; - - for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; zone++) { - int i; - - if (!populated_zone(zone)) - continue; - - spin_lock_irqsave(&zone->lock, flags); - seq_printf(m, "Node %d, zone %8s", pgdat->node_id, zone->name); - seq_printf(m, - "\n pages free %lu" - "\n min %lu" - "\n low %lu" - "\n high %lu" - "\n active %lu" - "\n inactive %lu" - "\n scanned %lu (a: %lu i: %lu)" - "\n spanned %lu" - "\n present %lu", - zone->free_pages, - zone->pages_min, - zone->pages_low, - zone->pages_high, - zone->nr_active, - zone->nr_inactive, - zone->pages_scanned, - zone->nr_scan_active, zone->nr_scan_inactive, - zone->spanned_pages, - zone->present_pages); - seq_printf(m, - "\n protection: (%lu", - zone->lowmem_reserve[0]); - for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++) - seq_printf(m, ", %lu", zone->lowmem_reserve[i]); - seq_printf(m, - ")" - "\n pagesets"); - for_each_online_cpu(i) { - struct per_cpu_pageset *pageset; - int j; - - pageset = zone_pcp(zone, i); - for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) { - if (pageset->pcp[j].count) - break; - } - if (j == ARRAY_SIZE(pageset->pcp)) - continue; - for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) { - seq_printf(m, - "\n cpu: %i pcp: %i" - "\n count: %i" - "\n high: %i" - "\n batch: %i", - i, j, - pageset->pcp[j].count, - pageset->pcp[j].high, - pageset->pcp[j].batch); - } -#ifdef CONFIG_NUMA - seq_printf(m, - "\n numa_hit: %lu" - "\n numa_miss: %lu" - "\n numa_foreign: %lu" - "\n interleave_hit: %lu" - "\n local_node: %lu" - "\n other_node: %lu", - pageset->numa_hit, - pageset->numa_miss, - pageset->numa_foreign, - pageset->interleave_hit, - pageset->local_node, - pageset->other_node); -#endif - } - seq_printf(m, - "\n all_unreclaimable: %u" - "\n prev_priority: %i" - "\n temp_priority: %i" - "\n start_pfn: %lu", - zone->all_unreclaimable, - zone->prev_priority, - zone->temp_priority, - zone->zone_start_pfn); - spin_unlock_irqrestore(&zone->lock, flags); - seq_putc(m, '\n'); - } - return 0; -} - -struct seq_operations zoneinfo_op = { - .start = frag_start, /* iterate over all zones. The same as in - * fragmentation. */ - .next = frag_next, - .stop = frag_stop, - .show = zoneinfo_show, -}; - -static char *vmstat_text[] = { - "nr_dirty", - "nr_writeback", - "nr_unstable", - "nr_page_table_pages", - "nr_mapped", - "nr_slab", - - "pgpgin", - "pgpgout", - "pswpin", - "pswpout", - - "pgalloc_high", - "pgalloc_normal", - "pgalloc_dma32", - "pgalloc_dma", - - "pgfree", - "pgactivate", - "pgdeactivate", - - "pgfault", - "pgmajfault", - - "pgrefill_high", - "pgrefill_normal", - "pgrefill_dma32", - "pgrefill_dma", - - "pgsteal_high", - "pgsteal_normal", - "pgsteal_dma32", - "pgsteal_dma", - - "pgscan_kswapd_high", - "pgscan_kswapd_normal", - "pgscan_kswapd_dma32", - "pgscan_kswapd_dma", - - "pgscan_direct_high", - "pgscan_direct_normal", - "pgscan_direct_dma32", - "pgscan_direct_dma", - - "pginodesteal", - "slabs_scanned", - "kswapd_steal", - "kswapd_inodesteal", - "pageoutrun", - "allocstall", - - "pgrotated", - "nr_bounce", -}; - -static void *vmstat_start(struct seq_file *m, loff_t *pos) -{ - struct page_state *ps; - - if (*pos >= ARRAY_SIZE(vmstat_text)) - return NULL; - - ps = kmalloc(sizeof(*ps), GFP_KERNEL); - m->private = ps; - if (!ps) - return ERR_PTR(-ENOMEM); - get_full_page_state(ps); - ps->pgpgin /= 2; /* sectors -> kbytes */ - ps->pgpgout /= 2; - return (unsigned long *)ps + *pos; -} - -static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos) -{ - (*pos)++; - if (*pos >= ARRAY_SIZE(vmstat_text)) - return NULL; - return (unsigned long *)m->private + *pos; -} - -static int vmstat_show(struct seq_file *m, void *arg) -{ - unsigned long *l = arg; - unsigned long off = l - (unsigned long *)m->private; - - seq_printf(m, "%s %lu\n", vmstat_text[off], *l); - return 0; -} - -static void vmstat_stop(struct seq_file *m, void *arg) -{ - kfree(m->private); - m->private = NULL; -} - -struct seq_operations vmstat_op = { - .start = vmstat_start, - .next = vmstat_next, - .stop = vmstat_stop, - .show = vmstat_show, -}; - -#endif /* CONFIG_PROC_FS */ - #ifdef CONFIG_HOTPLUG_CPU static int page_alloc_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) { int cpu = (unsigned long)hcpu; - long *count; - unsigned long *src, *dest; if (action == CPU_DEAD) { - int i; - - /* Drain local pagecache count. */ - count = &per_cpu(nr_pagecache_local, cpu); - atomic_add(*count, &nr_pagecache); - *count = 0; local_irq_disable(); __drain_pages(cpu); - - /* Add dead cpu's page_states to our own. */ - dest = (unsigned long *)&__get_cpu_var(page_states); - src = (unsigned long *)&per_cpu(page_states, cpu); - - for (i = 0; i < sizeof(struct page_state)/sizeof(unsigned long); - i++) { - dest[i] += src[i]; - src[i] = 0; - } - + vm_events_fold_cpu(cpu); local_irq_enable(); + refresh_cpu_vm_stats(cpu); } return NOTIFY_OK; } diff --git a/mm/page_io.c b/mm/page_io.c index bb2b0d53889c..88029948d00a 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -101,7 +101,7 @@ int swap_writepage(struct page *page, struct writeback_control *wbc) } if (wbc->sync_mode == WB_SYNC_ALL) rw |= (1 << BIO_RW_SYNC); - inc_page_state(pswpout); + count_vm_event(PSWPOUT); set_page_writeback(page); unlock_page(page); submit_bio(rw, bio); @@ -123,7 +123,7 @@ int swap_readpage(struct file *file, struct page *page) ret = -ENOMEM; goto out; } - inc_page_state(pswpin); + count_vm_event(PSWPIN); submit_bio(READ, bio); out: return ret; diff --git a/mm/readahead.c b/mm/readahead.c index e39e416860d7..aa7ec424656a 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -390,8 +390,8 @@ int do_page_cache_readahead(struct address_space *mapping, struct file *filp, * Read 'nr_to_read' pages starting at page 'offset'. If the flag 'block' * is set wait till the read completes. Otherwise attempt to read without * blocking. - * Returns 1 meaning 'success' if read is succesfull without switching off - * readhaead mode. Otherwise return failure. + * Returns 1 meaning 'success' if read is successful without switching off + * readahead mode. Otherwise return failure. */ static int blockable_page_cache_readahead(struct address_space *mapping, struct file *filp, diff --git a/mm/rmap.c b/mm/rmap.c index e76909e880ca..40158b59729e 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -455,7 +455,7 @@ static void __page_set_anon_rmap(struct page *page, * nr_mapped state can be updated without turning off * interrupts because it is not modified via interrupt. */ - __inc_page_state(nr_mapped); + __inc_zone_page_state(page, NR_ANON_PAGES); } /** @@ -499,7 +499,7 @@ void page_add_new_anon_rmap(struct page *page, void page_add_file_rmap(struct page *page) { if (atomic_inc_and_test(&page->_mapcount)) - __inc_page_state(nr_mapped); + __inc_zone_page_state(page, NR_FILE_MAPPED); } /** @@ -531,7 +531,8 @@ void page_remove_rmap(struct page *page) */ if (page_test_and_clear_dirty(page)) set_page_dirty(page); - __dec_page_state(nr_mapped); + __dec_zone_page_state(page, + PageAnon(page) ? NR_ANON_PAGES : NR_FILE_MAPPED); } } diff --git a/mm/shmem.c b/mm/shmem.c index 38bc3334f263..db21c51531ca 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -23,10 +23,8 @@ * which makes it a completely usable filesystem. */ -#include <linux/config.h> #include <linux/module.h> #include <linux/init.h> -#include <linux/devfs_fs_kernel.h> #include <linux/fs.h> #include <linux/mm.h> #include <linux/mman.h> @@ -174,7 +172,7 @@ static inline void shmem_unacct_blocks(unsigned long flags, long pages) } static struct super_operations shmem_ops; -static struct address_space_operations shmem_aops; +static const struct address_space_operations shmem_aops; static struct file_operations shmem_file_operations; static struct inode_operations shmem_inode_operations; static struct inode_operations shmem_dir_inode_operations; @@ -1046,12 +1044,12 @@ repeat: swappage = lookup_swap_cache(swap); if (!swappage) { shmem_swp_unmap(entry); - spin_unlock(&info->lock); /* here we actually do the io */ if (type && *type == VM_FAULT_MINOR) { - inc_page_state(pgmajfault); + __count_vm_event(PGMAJFAULT); *type = VM_FAULT_MAJOR; } + spin_unlock(&info->lock); swappage = shmem_swapin(info, swap, idx); if (!swappage) { spin_lock(&info->lock); @@ -2162,7 +2160,7 @@ static void destroy_inodecache(void) printk(KERN_INFO "shmem_inode_cache: not all structures were freed\n"); } -static struct address_space_operations shmem_aops = { +static const struct address_space_operations shmem_aops = { .writepage = shmem_writepage, .set_page_dirty = __set_page_dirty_nobuffers, #ifdef CONFIG_TMPFS @@ -2252,9 +2250,7 @@ static int __init init_tmpfs(void) printk(KERN_ERR "Could not register tmpfs\n"); goto out2; } -#ifdef CONFIG_TMPFS - devfs_mk_dir("shm"); -#endif + shm_mnt = vfs_kern_mount(&tmpfs_fs_type, MS_NOUSER, tmpfs_fs_type.name, NULL); if (IS_ERR(shm_mnt)) { diff --git a/mm/slab.c b/mm/slab.c index 98ac20bc0de9..3936af344542 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -89,6 +89,7 @@ #include <linux/config.h> #include <linux/slab.h> #include <linux/mm.h> +#include <linux/poison.h> #include <linux/swap.h> #include <linux/cache.h> #include <linux/interrupt.h> @@ -106,6 +107,7 @@ #include <linux/nodemask.h> #include <linux/mempolicy.h> #include <linux/mutex.h> +#include <linux/rtmutex.h> #include <asm/uaccess.h> #include <asm/cacheflush.h> @@ -307,6 +309,13 @@ struct kmem_list3 __initdata initkmem_list3[NUM_INIT_LISTS]; #define SIZE_AC 1 #define SIZE_L3 (1 + MAX_NUMNODES) +static int drain_freelist(struct kmem_cache *cache, + struct kmem_list3 *l3, int tofree); +static void free_block(struct kmem_cache *cachep, void **objpp, int len, + int node); +static void enable_cpucache(struct kmem_cache *cachep); +static void cache_reap(void *unused); + /* * This function must be completely optimized away if a constant is passed to * it. Mostly the same as what is in linux/slab.h except it returns an index. @@ -454,7 +463,7 @@ struct kmem_cache { #define STATS_DEC_ACTIVE(x) ((x)->num_active--) #define STATS_INC_ALLOCED(x) ((x)->num_allocations++) #define STATS_INC_GROWN(x) ((x)->grown++) -#define STATS_INC_REAPED(x) ((x)->reaped++) +#define STATS_ADD_REAPED(x,y) ((x)->reaped += (y)) #define STATS_SET_HIGH(x) \ do { \ if ((x)->num_active > (x)->high_mark) \ @@ -478,7 +487,7 @@ struct kmem_cache { #define STATS_DEC_ACTIVE(x) do { } while (0) #define STATS_INC_ALLOCED(x) do { } while (0) #define STATS_INC_GROWN(x) do { } while (0) -#define STATS_INC_REAPED(x) do { } while (0) +#define STATS_ADD_REAPED(x,y) do { } while (0) #define STATS_SET_HIGH(x) do { } while (0) #define STATS_INC_ERR(x) do { } while (0) #define STATS_INC_NODEALLOCS(x) do { } while (0) @@ -492,17 +501,6 @@ struct kmem_cache { #endif #if DEBUG -/* - * Magic nums for obj red zoning. - * Placed in the first word before and the first word after an obj. - */ -#define RED_INACTIVE 0x5A2CF071UL /* when obj is inactive */ -#define RED_ACTIVE 0x170FC2A5UL /* when obj is active */ - -/* ...and for poisoning */ -#define POISON_INUSE 0x5a /* for use-uninitialised poisoning */ -#define POISON_FREE 0x6b /* for use-after-free poisoning */ -#define POISON_END 0xa5 /* end-byte of poisoning */ /* * memory layout of objects: @@ -709,12 +707,6 @@ int slab_is_available(void) static DEFINE_PER_CPU(struct work_struct, reap_work); -static void free_block(struct kmem_cache *cachep, void **objpp, int len, - int node); -static void enable_cpucache(struct kmem_cache *cachep); -static void cache_reap(void *unused); -static int __node_shrink(struct kmem_cache *cachep, int node); - static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep) { return cachep->array[smp_processor_id()]; @@ -1083,7 +1075,7 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) #endif -static int cpuup_callback(struct notifier_block *nfb, +static int __devinit cpuup_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { long cpu = (long)hcpu; @@ -1250,10 +1242,7 @@ free_array_cache: l3 = cachep->nodelists[node]; if (!l3) continue; - spin_lock_irq(&l3->list_lock); - /* free slabs belonging to this node */ - __node_shrink(cachep, node); - spin_unlock_irq(&l3->list_lock); + drain_freelist(cachep, l3, l3->free_objects); } mutex_unlock(&cache_chain_mutex); break; @@ -1265,7 +1254,9 @@ bad: return NOTIFY_BAD; } -static struct notifier_block cpucache_notifier = { &cpuup_callback, NULL, 0 }; +static struct notifier_block __cpuinitdata cpucache_notifier = { + &cpuup_callback, NULL, 0 +}; /* * swap the static kmem_list3 with kmalloced memory @@ -1514,7 +1505,7 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid) nr_pages = (1 << cachep->gfporder); if (cachep->flags & SLAB_RECLAIM_ACCOUNT) atomic_add(nr_pages, &slab_reclaim_pages); - add_page_state(nr_slab, nr_pages); + add_zone_page_state(page_zone(page), NR_SLAB, nr_pages); for (i = 0; i < nr_pages; i++) __SetPageSlab(page + i); return page_address(page); @@ -1529,12 +1520,12 @@ static void kmem_freepages(struct kmem_cache *cachep, void *addr) struct page *page = virt_to_page(addr); const unsigned long nr_freed = i; + sub_zone_page_state(page_zone(page), NR_SLAB, nr_freed); while (i--) { BUG_ON(!PageSlab(page)); __ClearPageSlab(page); page++; } - sub_page_state(nr_slab, nr_freed); if (current->reclaim_state) current->reclaim_state->reclaimed_slab += nr_freed; free_pages((unsigned long)addr, cachep->gfporder); @@ -2255,32 +2246,45 @@ static void drain_cpu_caches(struct kmem_cache *cachep) } } -static int __node_shrink(struct kmem_cache *cachep, int node) +/* + * Remove slabs from the list of free slabs. + * Specify the number of slabs to drain in tofree. + * + * Returns the actual number of slabs released. + */ +static int drain_freelist(struct kmem_cache *cache, + struct kmem_list3 *l3, int tofree) { + struct list_head *p; + int nr_freed; struct slab *slabp; - struct kmem_list3 *l3 = cachep->nodelists[node]; - int ret; - for (;;) { - struct list_head *p; + nr_freed = 0; + while (nr_freed < tofree && !list_empty(&l3->slabs_free)) { + spin_lock_irq(&l3->list_lock); p = l3->slabs_free.prev; - if (p == &l3->slabs_free) - break; + if (p == &l3->slabs_free) { + spin_unlock_irq(&l3->list_lock); + goto out; + } - slabp = list_entry(l3->slabs_free.prev, struct slab, list); + slabp = list_entry(p, struct slab, list); #if DEBUG BUG_ON(slabp->inuse); #endif list_del(&slabp->list); - - l3->free_objects -= cachep->num; + /* + * Safe to drop the lock. The slab is no longer linked + * to the cache. + */ + l3->free_objects -= cache->num; spin_unlock_irq(&l3->list_lock); - slab_destroy(cachep, slabp); - spin_lock_irq(&l3->list_lock); + slab_destroy(cache, slabp); + nr_freed++; } - ret = !list_empty(&l3->slabs_full) || !list_empty(&l3->slabs_partial); - return ret; +out: + return nr_freed; } static int __cache_shrink(struct kmem_cache *cachep) @@ -2293,11 +2297,13 @@ static int __cache_shrink(struct kmem_cache *cachep) check_irq_on(); for_each_online_node(i) { l3 = cachep->nodelists[i]; - if (l3) { - spin_lock_irq(&l3->list_lock); - ret += __node_shrink(cachep, i); - spin_unlock_irq(&l3->list_lock); - } + if (!l3) + continue; + + drain_freelist(cachep, l3, l3->free_objects); + + ret += !list_empty(&l3->slabs_full) || + !list_empty(&l3->slabs_partial); } return (ret ? 1 : 0); } @@ -3405,7 +3411,7 @@ void kfree(const void *objp) local_irq_save(flags); kfree_debugcheck(objp); c = virt_to_cache(objp); - mutex_debug_check_no_locks_freed(objp, obj_size(c)); + debug_check_no_locks_freed(objp, obj_size(c)); __cache_free(c, (void *)objp); local_irq_restore(flags); } @@ -3701,10 +3707,6 @@ static void cache_reap(void *unused) } list_for_each_entry(searchp, &cache_chain, next) { - struct list_head *p; - int tofree; - struct slab *slabp; - check_irq_on(); /* @@ -3729,47 +3731,22 @@ static void cache_reap(void *unused) drain_array(searchp, l3, l3->shared, 0, node); - if (l3->free_touched) { + if (l3->free_touched) l3->free_touched = 0; - goto next; - } - - tofree = (l3->free_limit + 5 * searchp->num - 1) / - (5 * searchp->num); - do { - /* - * Do not lock if there are no free blocks. - */ - if (list_empty(&l3->slabs_free)) - break; - - spin_lock_irq(&l3->list_lock); - p = l3->slabs_free.next; - if (p == &(l3->slabs_free)) { - spin_unlock_irq(&l3->list_lock); - break; - } + else { + int freed; - slabp = list_entry(p, struct slab, list); - BUG_ON(slabp->inuse); - list_del(&slabp->list); - STATS_INC_REAPED(searchp); - - /* - * Safe to drop the lock. The slab is no longer linked - * to the cache. searchp cannot disappear, we hold - * cache_chain_lock - */ - l3->free_objects -= searchp->num; - spin_unlock_irq(&l3->list_lock); - slab_destroy(searchp, slabp); - } while (--tofree > 0); + freed = drain_freelist(searchp, l3, (l3->free_limit + + 5 * searchp->num - 1) / (5 * searchp->num)); + STATS_ADD_REAPED(searchp, freed); + } next: cond_resched(); } check_irq_on(); mutex_unlock(&cache_chain_mutex); next_reap_node(); + refresh_cpu_vm_stats(smp_processor_id()); /* Set up the next iteration */ schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC); } diff --git a/mm/slob.c b/mm/slob.c index a68255ba4553..7b52b20b9607 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -29,7 +29,6 @@ * essentially no allocation space overhead. */ -#include <linux/config.h> #include <linux/slab.h> #include <linux/mm.h> #include <linux/cache.h> diff --git a/mm/sparse.c b/mm/sparse.c index e0a3fe48aa37..86c52ab80878 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -1,7 +1,6 @@ /* * sparse memory mappings. */ -#include <linux/config.h> #include <linux/mm.h> #include <linux/mmzone.h> #include <linux/bootmem.h> @@ -45,7 +44,7 @@ static struct mem_section *sparse_index_alloc(int nid) static int sparse_index_init(unsigned long section_nr, int nid) { - static spinlock_t index_init_lock = SPIN_LOCK_UNLOCKED; + static DEFINE_SPINLOCK(index_init_lock); unsigned long root = SECTION_NR_TO_ROOT(section_nr); struct mem_section *section; int ret = 0; diff --git a/mm/swap.c b/mm/swap.c index 03ae2076f92f..8fd095c4ae51 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -86,9 +86,8 @@ int rotate_reclaimable_page(struct page *page) zone = page_zone(page); spin_lock_irqsave(&zone->lru_lock, flags); if (PageLRU(page) && !PageActive(page)) { - list_del(&page->lru); - list_add_tail(&page->lru, &zone->inactive_list); - inc_page_state(pgrotated); + list_move_tail(&page->lru, &zone->inactive_list); + __count_vm_event(PGROTATED); } if (!test_clear_page_writeback(page)) BUG(); @@ -108,7 +107,7 @@ void fastcall activate_page(struct page *page) del_page_from_inactive_list(zone, page); SetPageActive(page); add_page_to_active_list(zone, page); - inc_page_state(pgactivate); + __count_vm_event(PGACTIVATE); } spin_unlock_irq(&zone->lru_lock); } diff --git a/mm/swap_state.c b/mm/swap_state.c index e0e1583f32c2..fccbd9bba77b 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -24,7 +24,7 @@ * vmscan's shrink_list, to make sync_page look nicer, and to allow * future use of radix_tree tags in the swap cache. */ -static struct address_space_operations swap_aops = { +static const struct address_space_operations swap_aops = { .writepage = swap_writepage, .sync_page = block_sync_page, .set_page_dirty = __set_page_dirty_nobuffers, @@ -87,7 +87,7 @@ static int __add_to_swap_cache(struct page *page, swp_entry_t entry, SetPageSwapCache(page); set_page_private(page, entry.val); total_swapcache_pages++; - pagecache_acct(1); + __inc_zone_page_state(page, NR_FILE_PAGES); } write_unlock_irq(&swapper_space.tree_lock); radix_tree_preload_end(); @@ -132,7 +132,7 @@ void __delete_from_swap_cache(struct page *page) set_page_private(page, 0); ClearPageSwapCache(page); total_swapcache_pages--; - pagecache_acct(-1); + __dec_zone_page_state(page, NR_FILE_PAGES); INC_CACHE_INFO(del_total); } diff --git a/mm/swapfile.c b/mm/swapfile.c index cc367f7e75d8..e70d6c6d6fee 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -5,7 +5,6 @@ * Swap reorganised 29.12.95, Stephen Tweedie */ -#include <linux/config.h> #include <linux/mm.h> #include <linux/hugetlb.h> #include <linux/mman.h> diff --git a/mm/tiny-shmem.c b/mm/tiny-shmem.c index f9d6a9cc91c4..5f2cbf0f153c 100644 --- a/mm/tiny-shmem.c +++ b/mm/tiny-shmem.c @@ -12,7 +12,6 @@ #include <linux/fs.h> #include <linux/init.h> -#include <linux/devfs_fs_kernel.h> #include <linux/vfs.h> #include <linux/mount.h> #include <linux/file.h> @@ -33,9 +32,6 @@ static int __init init_tmpfs(void) { BUG_ON(register_filesystem(&tmpfs_fs_type) != 0); -#ifdef CONFIG_TMPFS - devfs_mk_dir("shm"); -#endif shm_mnt = kern_mount(&tmpfs_fs_type); BUG_ON(IS_ERR(shm_mnt)); diff --git a/mm/vmscan.c b/mm/vmscan.c index 72babac71dea..ff2ebe9458a3 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -34,6 +34,7 @@ #include <linux/notifier.h> #include <linux/rwsem.h> #include <linux/delay.h> +#include <linux/kthread.h> #include <asm/tlbflush.h> #include <asm/div64.h> @@ -46,8 +47,6 @@ struct scan_control { /* Incremented by the number of inactive pages that were scanned */ unsigned long nr_scanned; - unsigned long nr_mapped; /* From page_state */ - /* This context's GFP mask */ gfp_t gfp_mask; @@ -216,7 +215,7 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask, break; if (shrink_ret < nr_before) ret += nr_before - shrink_ret; - mod_page_state(slabs_scanned, this_scan); + count_vm_events(SLABS_SCANNED, this_scan); total_scan -= this_scan; cond_resched(); @@ -570,7 +569,7 @@ keep: list_splice(&ret_pages, page_list); if (pagevec_count(&freed_pvec)) __pagevec_release_nonlru(&freed_pvec); - mod_page_state(pgactivate, pgactivate); + count_vm_events(PGACTIVATE, pgactivate); return nr_reclaimed; } @@ -660,11 +659,11 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, nr_reclaimed += nr_freed; local_irq_disable(); if (current_is_kswapd()) { - __mod_page_state_zone(zone, pgscan_kswapd, nr_scan); - __mod_page_state(kswapd_steal, nr_freed); + __count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scan); + __count_vm_events(KSWAPD_STEAL, nr_freed); } else - __mod_page_state_zone(zone, pgscan_direct, nr_scan); - __mod_page_state_zone(zone, pgsteal, nr_freed); + __count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scan); + __count_vm_events(PGACTIVATE, nr_freed); if (nr_taken == 0) goto done; @@ -743,7 +742,9 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, * how much memory * is mapped. */ - mapped_ratio = (sc->nr_mapped * 100) / vm_total_pages; + mapped_ratio = ((global_page_state(NR_FILE_MAPPED) + + global_page_state(NR_ANON_PAGES)) * 100) / + vm_total_pages; /* * Now decide how much we really want to unmap some pages. The @@ -840,11 +841,10 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, } } zone->nr_active += pgmoved; - spin_unlock(&zone->lru_lock); - __mod_page_state_zone(zone, pgrefill, pgscanned); - __mod_page_state(pgdeactivate, pgdeactivate); - local_irq_enable(); + __count_zone_vm_events(PGREFILL, zone, pgscanned); + __count_vm_events(PGDEACTIVATE, pgdeactivate); + spin_unlock_irq(&zone->lru_lock); pagevec_release(&pvec); } @@ -976,7 +976,7 @@ unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask) .swappiness = vm_swappiness, }; - inc_page_state(allocstall); + count_vm_event(ALLOCSTALL); for (i = 0; zones[i] != NULL; i++) { struct zone *zone = zones[i]; @@ -989,7 +989,6 @@ unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask) } for (priority = DEF_PRIORITY; priority >= 0; priority--) { - sc.nr_mapped = read_page_state(nr_mapped); sc.nr_scanned = 0; if (!priority) disable_swap_token(); @@ -1074,9 +1073,7 @@ loop_again: total_scanned = 0; nr_reclaimed = 0; sc.may_writepage = !laptop_mode; - sc.nr_mapped = read_page_state(nr_mapped); - - inc_page_state(pageoutrun); + count_vm_event(PAGEOUTRUN); for (i = 0; i < pgdat->nr_zones; i++) { struct zone *zone = pgdat->node_zones + i; @@ -1223,7 +1220,6 @@ static int kswapd(void *p) }; cpumask_t cpumask; - daemonize("kswapd%d", pgdat->node_id); cpumask = node_to_cpumask(pgdat->node_id); if (!cpus_empty(cpumask)) set_cpus_allowed(tsk, cpumask); @@ -1365,7 +1361,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages) for_each_zone(zone) lru_pages += zone->nr_active + zone->nr_inactive; - nr_slab = read_page_state(nr_slab); + nr_slab = global_page_state(NR_SLAB); /* If slab caches are huge, it's better to hit them first */ while (nr_slab >= lru_pages) { reclaim_state.reclaimed_slab = 0; @@ -1407,9 +1403,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages) for (prio = DEF_PRIORITY; prio >= 0; prio--) { unsigned long nr_to_scan = nr_pages - ret; - sc.nr_mapped = read_page_state(nr_mapped); sc.nr_scanned = 0; - ret += shrink_all_zones(nr_to_scan, prio, pass, &sc); if (ret >= nr_pages) goto out; @@ -1450,7 +1444,7 @@ out: not required for correctness. So if the last cpu in a node goes away, we get changed to run anywhere: as the first one comes back, restore their cpu bindings. */ -static int cpu_callback(struct notifier_block *nfb, +static int __devinit cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { pg_data_t *pgdat; @@ -1468,20 +1462,35 @@ static int cpu_callback(struct notifier_block *nfb, } #endif /* CONFIG_HOTPLUG_CPU */ +/* + * This kswapd start function will be called by init and node-hot-add. + * On node-hot-add, kswapd will moved to proper cpus if cpus are hot-added. + */ +int kswapd_run(int nid) +{ + pg_data_t *pgdat = NODE_DATA(nid); + int ret = 0; + + if (pgdat->kswapd) + return 0; + + pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid); + if (IS_ERR(pgdat->kswapd)) { + /* failure at boot is fatal */ + BUG_ON(system_state == SYSTEM_BOOTING); + printk("Failed to start kswapd on node %d\n",nid); + ret = -1; + } + return ret; +} + static int __init kswapd_init(void) { - pg_data_t *pgdat; + int nid; swap_setup(); - for_each_online_pgdat(pgdat) { - pid_t pid; - - pid = kernel_thread(kswapd, pgdat, CLONE_KERNEL); - BUG_ON(pid < 0); - read_lock(&tasklist_lock); - pgdat->kswapd = find_task_by_pid(pid); - read_unlock(&tasklist_lock); - } + for_each_online_node(nid) + kswapd_run(nid); hotcpu_notifier(cpu_callback, 0); return 0; } @@ -1508,11 +1517,6 @@ int zone_reclaim_mode __read_mostly; #define RECLAIM_SLAB (1<<3) /* Do a global slab shrink if the zone is out of memory */ /* - * Mininum time between zone reclaim scans - */ -int zone_reclaim_interval __read_mostly = 30*HZ; - -/* * Priority for ZONE_RECLAIM. This determines the fraction of pages * of a node considered for each zone_reclaim. 4 scans 1/16th of * a zone. @@ -1533,7 +1537,6 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) struct scan_control sc = { .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE), .may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP), - .nr_mapped = read_page_state(nr_mapped), .swap_cluster_max = max_t(unsigned long, nr_pages, SWAP_CLUSTER_MAX), .gfp_mask = gfp_mask, @@ -1578,16 +1581,6 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) p->reclaim_state = NULL; current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE); - - if (nr_reclaimed == 0) { - /* - * We were unable to reclaim enough pages to stay on node. We - * now allow off node accesses for a certain time period before - * trying again to reclaim pages from the local zone. - */ - zone->last_unsuccessful_zone_reclaim = jiffies; - } - return nr_reclaimed >= nr_pages; } @@ -1597,13 +1590,17 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) int node_id; /* - * Do not reclaim if there was a recent unsuccessful attempt at zone - * reclaim. In that case we let allocations go off node for the - * zone_reclaim_interval. Otherwise we would scan for each off-node - * page allocation. + * Do not reclaim if there are not enough reclaimable pages in this + * zone that would satify this allocations. + * + * All unmapped pagecache pages are reclaimable. + * + * Both counters may be temporarily off a bit so we use + * SWAP_CLUSTER_MAX as the boundary. It may also be good to + * leave a few frequently used unmapped pagecache pages around. */ - if (time_before(jiffies, - zone->last_unsuccessful_zone_reclaim + zone_reclaim_interval)) + if (zone_page_state(zone, NR_FILE_PAGES) - + zone_page_state(zone, NR_FILE_MAPPED) < SWAP_CLUSTER_MAX) return 0; /* diff --git a/mm/vmstat.c b/mm/vmstat.c new file mode 100644 index 000000000000..73b83d67bab6 --- /dev/null +++ b/mm/vmstat.c @@ -0,0 +1,614 @@ +/* + * linux/mm/vmstat.c + * + * Manages VM statistics + * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds + * + * zoned VM statistics + * Copyright (C) 2006 Silicon Graphics, Inc., + * Christoph Lameter <christoph@lameter.com> + */ + +#include <linux/config.h> +#include <linux/mm.h> +#include <linux/module.h> + +void __get_zone_counts(unsigned long *active, unsigned long *inactive, + unsigned long *free, struct pglist_data *pgdat) +{ + struct zone *zones = pgdat->node_zones; + int i; + + *active = 0; + *inactive = 0; + *free = 0; + for (i = 0; i < MAX_NR_ZONES; i++) { + *active += zones[i].nr_active; + *inactive += zones[i].nr_inactive; + *free += zones[i].free_pages; + } +} + +void get_zone_counts(unsigned long *active, + unsigned long *inactive, unsigned long *free) +{ + struct pglist_data *pgdat; + + *active = 0; + *inactive = 0; + *free = 0; + for_each_online_pgdat(pgdat) { + unsigned long l, m, n; + __get_zone_counts(&l, &m, &n, pgdat); + *active += l; + *inactive += m; + *free += n; + } +} + +#ifdef CONFIG_VM_EVENT_COUNTERS +DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}}; +EXPORT_PER_CPU_SYMBOL(vm_event_states); + +static void sum_vm_events(unsigned long *ret, cpumask_t *cpumask) +{ + int cpu = 0; + int i; + + memset(ret, 0, NR_VM_EVENT_ITEMS * sizeof(unsigned long)); + + cpu = first_cpu(*cpumask); + while (cpu < NR_CPUS) { + struct vm_event_state *this = &per_cpu(vm_event_states, cpu); + + cpu = next_cpu(cpu, *cpumask); + + if (cpu < NR_CPUS) + prefetch(&per_cpu(vm_event_states, cpu)); + + + for (i = 0; i < NR_VM_EVENT_ITEMS; i++) + ret[i] += this->event[i]; + } +} + +/* + * Accumulate the vm event counters across all CPUs. + * The result is unavoidably approximate - it can change + * during and after execution of this function. +*/ +void all_vm_events(unsigned long *ret) +{ + sum_vm_events(ret, &cpu_online_map); +} + +#ifdef CONFIG_HOTPLUG +/* + * Fold the foreign cpu events into our own. + * + * This is adding to the events on one processor + * but keeps the global counts constant. + */ +void vm_events_fold_cpu(int cpu) +{ + struct vm_event_state *fold_state = &per_cpu(vm_event_states, cpu); + int i; + + for (i = 0; i < NR_VM_EVENT_ITEMS; i++) { + count_vm_events(i, fold_state->event[i]); + fold_state->event[i] = 0; + } +} +#endif /* CONFIG_HOTPLUG */ + +#endif /* CONFIG_VM_EVENT_COUNTERS */ + +/* + * Manage combined zone based / global counters + * + * vm_stat contains the global counters + */ +atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS]; +EXPORT_SYMBOL(vm_stat); + +#ifdef CONFIG_SMP + +#define STAT_THRESHOLD 32 + +/* + * Determine pointer to currently valid differential byte given a zone and + * the item number. + * + * Preemption must be off + */ +static inline s8 *diff_pointer(struct zone *zone, enum zone_stat_item item) +{ + return &zone_pcp(zone, smp_processor_id())->vm_stat_diff[item]; +} + +/* + * For use when we know that interrupts are disabled. + */ +void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item, + int delta) +{ + s8 *p; + long x; + + p = diff_pointer(zone, item); + x = delta + *p; + + if (unlikely(x > STAT_THRESHOLD || x < -STAT_THRESHOLD)) { + zone_page_state_add(x, zone, item); + x = 0; + } + + *p = x; +} +EXPORT_SYMBOL(__mod_zone_page_state); + +/* + * For an unknown interrupt state + */ +void mod_zone_page_state(struct zone *zone, enum zone_stat_item item, + int delta) +{ + unsigned long flags; + + local_irq_save(flags); + __mod_zone_page_state(zone, item, delta); + local_irq_restore(flags); +} +EXPORT_SYMBOL(mod_zone_page_state); + +/* + * Optimized increment and decrement functions. + * + * These are only for a single page and therefore can take a struct page * + * argument instead of struct zone *. This allows the inclusion of the code + * generated for page_zone(page) into the optimized functions. + * + * No overflow check is necessary and therefore the differential can be + * incremented or decremented in place which may allow the compilers to + * generate better code. + * + * The increment or decrement is known and therefore one boundary check can + * be omitted. + * + * Some processors have inc/dec instructions that are atomic vs an interrupt. + * However, the code must first determine the differential location in a zone + * based on the processor number and then inc/dec the counter. There is no + * guarantee without disabling preemption that the processor will not change + * in between and therefore the atomicity vs. interrupt cannot be exploited + * in a useful way here. + */ +static void __inc_zone_state(struct zone *zone, enum zone_stat_item item) +{ + s8 *p = diff_pointer(zone, item); + + (*p)++; + + if (unlikely(*p > STAT_THRESHOLD)) { + zone_page_state_add(*p, zone, item); + *p = 0; + } +} + +void __inc_zone_page_state(struct page *page, enum zone_stat_item item) +{ + __inc_zone_state(page_zone(page), item); +} +EXPORT_SYMBOL(__inc_zone_page_state); + +void __dec_zone_page_state(struct page *page, enum zone_stat_item item) +{ + struct zone *zone = page_zone(page); + s8 *p = diff_pointer(zone, item); + + (*p)--; + + if (unlikely(*p < -STAT_THRESHOLD)) { + zone_page_state_add(*p, zone, item); + *p = 0; + } +} +EXPORT_SYMBOL(__dec_zone_page_state); + +void inc_zone_state(struct zone *zone, enum zone_stat_item item) +{ + unsigned long flags; + + local_irq_save(flags); + __inc_zone_state(zone, item); + local_irq_restore(flags); +} + +void inc_zone_page_state(struct page *page, enum zone_stat_item item) +{ + unsigned long flags; + struct zone *zone; + + zone = page_zone(page); + local_irq_save(flags); + __inc_zone_state(zone, item); + local_irq_restore(flags); +} +EXPORT_SYMBOL(inc_zone_page_state); + +void dec_zone_page_state(struct page *page, enum zone_stat_item item) +{ + unsigned long flags; + struct zone *zone; + s8 *p; + + zone = page_zone(page); + local_irq_save(flags); + p = diff_pointer(zone, item); + + (*p)--; + + if (unlikely(*p < -STAT_THRESHOLD)) { + zone_page_state_add(*p, zone, item); + *p = 0; + } + local_irq_restore(flags); +} +EXPORT_SYMBOL(dec_zone_page_state); + +/* + * Update the zone counters for one cpu. + */ +void refresh_cpu_vm_stats(int cpu) +{ + struct zone *zone; + int i; + unsigned long flags; + + for_each_zone(zone) { + struct per_cpu_pageset *pcp; + + pcp = zone_pcp(zone, cpu); + + for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) + if (pcp->vm_stat_diff[i]) { + local_irq_save(flags); + zone_page_state_add(pcp->vm_stat_diff[i], + zone, i); + pcp->vm_stat_diff[i] = 0; + local_irq_restore(flags); + } + } +} + +static void __refresh_cpu_vm_stats(void *dummy) +{ + refresh_cpu_vm_stats(smp_processor_id()); +} + +/* + * Consolidate all counters. + * + * Note that the result is less inaccurate but still inaccurate + * if concurrent processes are allowed to run. + */ +void refresh_vm_stats(void) +{ + on_each_cpu(__refresh_cpu_vm_stats, NULL, 0, 1); +} +EXPORT_SYMBOL(refresh_vm_stats); + +#endif + +#ifdef CONFIG_NUMA +/* + * zonelist = the list of zones passed to the allocator + * z = the zone from which the allocation occurred. + * + * Must be called with interrupts disabled. + */ +void zone_statistics(struct zonelist *zonelist, struct zone *z) +{ + if (z->zone_pgdat == zonelist->zones[0]->zone_pgdat) { + __inc_zone_state(z, NUMA_HIT); + } else { + __inc_zone_state(z, NUMA_MISS); + __inc_zone_state(zonelist->zones[0], NUMA_FOREIGN); + } + if (z->zone_pgdat == NODE_DATA(numa_node_id())) + __inc_zone_state(z, NUMA_LOCAL); + else + __inc_zone_state(z, NUMA_OTHER); +} +#endif + +#ifdef CONFIG_PROC_FS + +#include <linux/seq_file.h> + +static void *frag_start(struct seq_file *m, loff_t *pos) +{ + pg_data_t *pgdat; + loff_t node = *pos; + for (pgdat = first_online_pgdat(); + pgdat && node; + pgdat = next_online_pgdat(pgdat)) + --node; + + return pgdat; +} + +static void *frag_next(struct seq_file *m, void *arg, loff_t *pos) +{ + pg_data_t *pgdat = (pg_data_t *)arg; + + (*pos)++; + return next_online_pgdat(pgdat); +} + +static void frag_stop(struct seq_file *m, void *arg) +{ +} + +/* + * This walks the free areas for each zone. + */ +static int frag_show(struct seq_file *m, void *arg) +{ + pg_data_t *pgdat = (pg_data_t *)arg; + struct zone *zone; + struct zone *node_zones = pgdat->node_zones; + unsigned long flags; + int order; + + for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) { + if (!populated_zone(zone)) + continue; + + spin_lock_irqsave(&zone->lock, flags); + seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name); + for (order = 0; order < MAX_ORDER; ++order) + seq_printf(m, "%6lu ", zone->free_area[order].nr_free); + spin_unlock_irqrestore(&zone->lock, flags); + seq_putc(m, '\n'); + } + return 0; +} + +struct seq_operations fragmentation_op = { + .start = frag_start, + .next = frag_next, + .stop = frag_stop, + .show = frag_show, +}; + +static char *vmstat_text[] = { + /* Zoned VM counters */ + "nr_anon_pages", + "nr_mapped", + "nr_file_pages", + "nr_slab", + "nr_page_table_pages", + "nr_dirty", + "nr_writeback", + "nr_unstable", + "nr_bounce", + +#ifdef CONFIG_NUMA + "numa_hit", + "numa_miss", + "numa_foreign", + "numa_interleave", + "numa_local", + "numa_other", +#endif + +#ifdef CONFIG_VM_EVENT_COUNTERS + "pgpgin", + "pgpgout", + "pswpin", + "pswpout", + + "pgalloc_dma", + "pgalloc_dma32", + "pgalloc_normal", + "pgalloc_high", + + "pgfree", + "pgactivate", + "pgdeactivate", + + "pgfault", + "pgmajfault", + + "pgrefill_dma", + "pgrefill_dma32", + "pgrefill_normal", + "pgrefill_high", + + "pgsteal_dma", + "pgsteal_dma32", + "pgsteal_normal", + "pgsteal_high", + + "pgscan_kswapd_dma", + "pgscan_kswapd_dma32", + "pgscan_kswapd_normal", + "pgscan_kswapd_high", + + "pgscan_direct_dma", + "pgscan_direct_dma32", + "pgscan_direct_normal", + "pgscan_direct_high", + + "pginodesteal", + "slabs_scanned", + "kswapd_steal", + "kswapd_inodesteal", + "pageoutrun", + "allocstall", + + "pgrotated", +#endif +}; + +/* + * Output information about zones in @pgdat. + */ +static int zoneinfo_show(struct seq_file *m, void *arg) +{ + pg_data_t *pgdat = arg; + struct zone *zone; + struct zone *node_zones = pgdat->node_zones; + unsigned long flags; + + for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; zone++) { + int i; + + if (!populated_zone(zone)) + continue; + + spin_lock_irqsave(&zone->lock, flags); + seq_printf(m, "Node %d, zone %8s", pgdat->node_id, zone->name); + seq_printf(m, + "\n pages free %lu" + "\n min %lu" + "\n low %lu" + "\n high %lu" + "\n active %lu" + "\n inactive %lu" + "\n scanned %lu (a: %lu i: %lu)" + "\n spanned %lu" + "\n present %lu", + zone->free_pages, + zone->pages_min, + zone->pages_low, + zone->pages_high, + zone->nr_active, + zone->nr_inactive, + zone->pages_scanned, + zone->nr_scan_active, zone->nr_scan_inactive, + zone->spanned_pages, + zone->present_pages); + + for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) + seq_printf(m, "\n %-12s %lu", vmstat_text[i], + zone_page_state(zone, i)); + + seq_printf(m, + "\n protection: (%lu", + zone->lowmem_reserve[0]); + for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++) + seq_printf(m, ", %lu", zone->lowmem_reserve[i]); + seq_printf(m, + ")" + "\n pagesets"); + for_each_online_cpu(i) { + struct per_cpu_pageset *pageset; + int j; + + pageset = zone_pcp(zone, i); + for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) { + if (pageset->pcp[j].count) + break; + } + if (j == ARRAY_SIZE(pageset->pcp)) + continue; + for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) { + seq_printf(m, + "\n cpu: %i pcp: %i" + "\n count: %i" + "\n high: %i" + "\n batch: %i", + i, j, + pageset->pcp[j].count, + pageset->pcp[j].high, + pageset->pcp[j].batch); + } + } + seq_printf(m, + "\n all_unreclaimable: %u" + "\n prev_priority: %i" + "\n temp_priority: %i" + "\n start_pfn: %lu", + zone->all_unreclaimable, + zone->prev_priority, + zone->temp_priority, + zone->zone_start_pfn); + spin_unlock_irqrestore(&zone->lock, flags); + seq_putc(m, '\n'); + } + return 0; +} + +struct seq_operations zoneinfo_op = { + .start = frag_start, /* iterate over all zones. The same as in + * fragmentation. */ + .next = frag_next, + .stop = frag_stop, + .show = zoneinfo_show, +}; + +static void *vmstat_start(struct seq_file *m, loff_t *pos) +{ + unsigned long *v; +#ifdef CONFIG_VM_EVENT_COUNTERS + unsigned long *e; +#endif + int i; + + if (*pos >= ARRAY_SIZE(vmstat_text)) + return NULL; + +#ifdef CONFIG_VM_EVENT_COUNTERS + v = kmalloc(NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long) + + sizeof(struct vm_event_state), GFP_KERNEL); +#else + v = kmalloc(NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long), + GFP_KERNEL); +#endif + m->private = v; + if (!v) + return ERR_PTR(-ENOMEM); + for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) + v[i] = global_page_state(i); +#ifdef CONFIG_VM_EVENT_COUNTERS + e = v + NR_VM_ZONE_STAT_ITEMS; + all_vm_events(e); + e[PGPGIN] /= 2; /* sectors -> kbytes */ + e[PGPGOUT] /= 2; +#endif + return v + *pos; +} + +static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos) +{ + (*pos)++; + if (*pos >= ARRAY_SIZE(vmstat_text)) + return NULL; + return (unsigned long *)m->private + *pos; +} + +static int vmstat_show(struct seq_file *m, void *arg) +{ + unsigned long *l = arg; + unsigned long off = l - (unsigned long *)m->private; + + seq_printf(m, "%s %lu\n", vmstat_text[off], *l); + return 0; +} + +static void vmstat_stop(struct seq_file *m, void *arg) +{ + kfree(m->private); + m->private = NULL; +} + +struct seq_operations vmstat_op = { + .start = vmstat_start, + .next = vmstat_next, + .stop = vmstat_stop, + .show = vmstat_show, +}; + +#endif /* CONFIG_PROC_FS */ + |