diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/page-writeback.c | 59 | ||||
-rw-r--r-- | mm/page_alloc.c | 2 | ||||
-rw-r--r-- | mm/slub.c | 159 | ||||
-rw-r--r-- | mm/thrash.c | 5 | ||||
-rw-r--r-- | mm/vmstat.c | 2 |
5 files changed, 164 insertions, 63 deletions
diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 63cd88840eb2..eec1481ba44f 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -588,31 +588,27 @@ void __init page_writeback_init(void) } /** - * generic_writepages - walk the list of dirty pages of the given address space and writepage() all of them. + * write_cache_pages - walk the list of dirty pages of the given address space and write all of them. * @mapping: address space structure to write * @wbc: subtract the number of written pages from *@wbc->nr_to_write + * @writepage: function called for each page + * @data: data passed to writepage function * - * This is a library function, which implements the writepages() - * address_space_operation. - * - * If a page is already under I/O, generic_writepages() skips it, even + * If a page is already under I/O, write_cache_pages() skips it, even * if it's dirty. This is desirable behaviour for memory-cleaning writeback, * but it is INCORRECT for data-integrity system calls such as fsync(). fsync() * and msync() need to guarantee that all the data which was dirty at the time * the call was made get new I/O started against them. If wbc->sync_mode is * WB_SYNC_ALL then we were called for data integrity and we must wait for * existing IO to complete. - * - * Derived from mpage_writepages() - if you fix this you should check that - * also! */ -int generic_writepages(struct address_space *mapping, - struct writeback_control *wbc) +int write_cache_pages(struct address_space *mapping, + struct writeback_control *wbc, writepage_t writepage, + void *data) { struct backing_dev_info *bdi = mapping->backing_dev_info; int ret = 0; int done = 0; - int (*writepage)(struct page *page, struct writeback_control *wbc); struct pagevec pvec; int nr_pages; pgoff_t index; @@ -625,12 +621,6 @@ int generic_writepages(struct address_space *mapping, return 0; } - writepage = mapping->a_ops->writepage; - - /* deal with chardevs and other special file */ - if (!writepage) - return 0; - pagevec_init(&pvec, 0); if (wbc->range_cyclic) { index = mapping->writeback_index; /* Start from prev offset */ @@ -682,8 +672,7 @@ retry: continue; } - ret = (*writepage)(page, wbc); - mapping_set_error(mapping, ret); + ret = (*writepage)(page, wbc, data); if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) unlock_page(page); @@ -710,6 +699,38 @@ retry: mapping->writeback_index = index; return ret; } +EXPORT_SYMBOL(write_cache_pages); + +/* + * Function used by generic_writepages to call the real writepage + * function and set the mapping flags on error + */ +static int __writepage(struct page *page, struct writeback_control *wbc, + void *data) +{ + struct address_space *mapping = data; + int ret = mapping->a_ops->writepage(page, wbc); + mapping_set_error(mapping, ret); + return ret; +} + +/** + * generic_writepages - walk the list of dirty pages of the given address space and writepage() all of them. + * @mapping: address space structure to write + * @wbc: subtract the number of written pages from *@wbc->nr_to_write + * + * This is a library function, which implements the writepages() + * address_space_operation. + */ +int generic_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + /* deal with chardevs and other special file */ + if (!mapping->a_ops->writepage) + return 0; + + return write_cache_pages(mapping, wbc, __writepage, mapping); +} EXPORT_SYMBOL(generic_writepages); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index f9b5d6d5f4d6..ae96dd844432 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2284,7 +2284,7 @@ static int __meminit next_active_region_index_in_nid(int index, int nid) * was used and there are no special requirements, this is a convenient * alternative */ -int __init early_pfn_to_nid(unsigned long pfn) +int __meminit early_pfn_to_nid(unsigned long pfn) { int i; diff --git a/mm/slub.c b/mm/slub.c index bd2efae02bcd..b39c8a69a4ff 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -81,10 +81,14 @@ * PageActive The slab is used as a cpu cache. Allocations * may be performed from the slab. The slab is not * on any slab list and cannot be moved onto one. + * The cpu slab may be equipped with an additioanl + * lockless_freelist that allows lockless access to + * free objects in addition to the regular freelist + * that requires the slab lock. * * PageError Slab requires special handling due to debug * options set. This moves slab handling out of - * the fast path. + * the fast path and disables lockless freelists. */ static inline int SlabDebug(struct page *page) @@ -1014,6 +1018,7 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) set_freepointer(s, last, NULL); page->freelist = start; + page->lockless_freelist = NULL; page->inuse = 0; out: if (flags & __GFP_WAIT) @@ -1276,6 +1281,23 @@ static void putback_slab(struct kmem_cache *s, struct page *page) */ static void deactivate_slab(struct kmem_cache *s, struct page *page, int cpu) { + /* + * Merge cpu freelist into freelist. Typically we get here + * because both freelists are empty. So this is unlikely + * to occur. + */ + while (unlikely(page->lockless_freelist)) { + void **object; + + /* Retrieve object from cpu_freelist */ + object = page->lockless_freelist; + page->lockless_freelist = page->lockless_freelist[page->offset]; + + /* And put onto the regular freelist */ + object[page->offset] = page->freelist; + page->freelist = object; + page->inuse--; + } s->cpu_slab[cpu] = NULL; ClearPageActive(page); @@ -1322,47 +1344,46 @@ static void flush_all(struct kmem_cache *s) } /* - * slab_alloc is optimized to only modify two cachelines on the fast path - * (aside from the stack): + * Slow path. The lockless freelist is empty or we need to perform + * debugging duties. + * + * Interrupts are disabled. * - * 1. The page struct - * 2. The first cacheline of the object to be allocated. + * Processing is still very fast if new objects have been freed to the + * regular freelist. In that case we simply take over the regular freelist + * as the lockless freelist and zap the regular freelist. * - * The only other cache lines that are read (apart from code) is the - * per cpu array in the kmem_cache struct. + * If that is not working then we fall back to the partial lists. We take the + * first element of the freelist as the object to allocate now and move the + * rest of the freelist to the lockless freelist. * - * Fastpath is not possible if we need to get a new slab or have - * debugging enabled (which means all slabs are marked with SlabDebug) + * And if we were unable to get a new slab from the partial slab lists then + * we need to allocate a new slab. This is slowest path since we may sleep. */ -static void *slab_alloc(struct kmem_cache *s, - gfp_t gfpflags, int node, void *addr) +static void *__slab_alloc(struct kmem_cache *s, + gfp_t gfpflags, int node, void *addr, struct page *page) { - struct page *page; void **object; - unsigned long flags; - int cpu; + int cpu = smp_processor_id(); - local_irq_save(flags); - cpu = smp_processor_id(); - page = s->cpu_slab[cpu]; if (!page) goto new_slab; slab_lock(page); if (unlikely(node != -1 && page_to_nid(page) != node)) goto another_slab; -redo: +load_freelist: object = page->freelist; if (unlikely(!object)) goto another_slab; if (unlikely(SlabDebug(page))) goto debug; -have_object: - page->inuse++; - page->freelist = object[page->offset]; + object = page->freelist; + page->lockless_freelist = object[page->offset]; + page->inuse = s->objects; + page->freelist = NULL; slab_unlock(page); - local_irq_restore(flags); return object; another_slab: @@ -1370,11 +1391,11 @@ another_slab: new_slab: page = get_partial(s, gfpflags, node); - if (likely(page)) { + if (page) { have_slab: s->cpu_slab[cpu] = page; SetPageActive(page); - goto redo; + goto load_freelist; } page = new_slab(s, gfpflags, node); @@ -1397,7 +1418,7 @@ have_slab: discard_slab(s, page); page = s->cpu_slab[cpu]; slab_lock(page); - goto redo; + goto load_freelist; } /* New slab does not fit our expectations */ flush_slab(s, s->cpu_slab[cpu], cpu); @@ -1405,16 +1426,52 @@ have_slab: slab_lock(page); goto have_slab; } - local_irq_restore(flags); return NULL; debug: + object = page->freelist; if (!alloc_object_checks(s, page, object)) goto another_slab; if (s->flags & SLAB_STORE_USER) set_track(s, object, TRACK_ALLOC, addr); trace(s, page, object, 1); init_object(s, object, 1); - goto have_object; + + page->inuse++; + page->freelist = object[page->offset]; + slab_unlock(page); + return object; +} + +/* + * Inlined fastpath so that allocation functions (kmalloc, kmem_cache_alloc) + * have the fastpath folded into their functions. So no function call + * overhead for requests that can be satisfied on the fastpath. + * + * The fastpath works by first checking if the lockless freelist can be used. + * If not then __slab_alloc is called for slow processing. + * + * Otherwise we can simply pick the next object from the lockless free list. + */ +static void __always_inline *slab_alloc(struct kmem_cache *s, + gfp_t gfpflags, int node, void *addr) +{ + struct page *page; + void **object; + unsigned long flags; + + local_irq_save(flags); + page = s->cpu_slab[smp_processor_id()]; + if (unlikely(!page || !page->lockless_freelist || + (node != -1 && page_to_nid(page) != node))) + + object = __slab_alloc(s, gfpflags, node, addr, page); + + else { + object = page->lockless_freelist; + page->lockless_freelist = object[page->offset]; + } + local_irq_restore(flags); + return object; } void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags) @@ -1432,20 +1489,19 @@ EXPORT_SYMBOL(kmem_cache_alloc_node); #endif /* - * The fastpath only writes the cacheline of the page struct and the first - * cacheline of the object. + * Slow patch handling. This may still be called frequently since objects + * have a longer lifetime than the cpu slabs in most processing loads. * - * We read the cpu_slab cacheline to check if the slab is the per cpu - * slab for this processor. + * So we still attempt to reduce cache line usage. Just take the slab + * lock and free the item. If there is no additional partial page + * handling required then we can return immediately. */ -static void slab_free(struct kmem_cache *s, struct page *page, +static void __slab_free(struct kmem_cache *s, struct page *page, void *x, void *addr) { void *prior; void **object = (void *)x; - unsigned long flags; - local_irq_save(flags); slab_lock(page); if (unlikely(SlabDebug(page))) @@ -1475,7 +1531,6 @@ checks_ok: out_unlock: slab_unlock(page); - local_irq_restore(flags); return; slab_empty: @@ -1487,7 +1542,6 @@ slab_empty: slab_unlock(page); discard_slab(s, page); - local_irq_restore(flags); return; debug: @@ -1502,6 +1556,34 @@ debug: goto checks_ok; } +/* + * Fastpath with forced inlining to produce a kfree and kmem_cache_free that + * can perform fastpath freeing without additional function calls. + * + * The fastpath is only possible if we are freeing to the current cpu slab + * of this processor. This typically the case if we have just allocated + * the item before. + * + * If fastpath is not possible then fall back to __slab_free where we deal + * with all sorts of special processing. + */ +static void __always_inline slab_free(struct kmem_cache *s, + struct page *page, void *x, void *addr) +{ + void **object = (void *)x; + unsigned long flags; + + local_irq_save(flags); + if (likely(page == s->cpu_slab[smp_processor_id()] && + !SlabDebug(page))) { + object[page->offset] = page->lockless_freelist; + page->lockless_freelist = object; + } else + __slab_free(s, page, x, addr); + + local_irq_restore(flags); +} + void kmem_cache_free(struct kmem_cache *s, void *x) { struct page *page; @@ -2363,9 +2445,8 @@ void __init kmem_cache_init(void) register_cpu_notifier(&slab_notifier); #endif - if (nr_cpu_ids) /* Remove when nr_cpu_ids is fixed upstream ! */ - kmem_size = offsetof(struct kmem_cache, cpu_slab) - + nr_cpu_ids * sizeof(struct page *); + kmem_size = offsetof(struct kmem_cache, cpu_slab) + + nr_cpu_ids * sizeof(struct page *); printk(KERN_INFO "SLUB: Genslabs=%d, HWalign=%d, Order=%d-%d, MinObjects=%d," " Processors=%d, Nodes=%d\n", diff --git a/mm/thrash.c b/mm/thrash.c index 9ef9071f99bc..c4c5205a9c35 100644 --- a/mm/thrash.c +++ b/mm/thrash.c @@ -48,9 +48,8 @@ void grab_swap_token(void) if (current_interval < current->mm->last_interval) current->mm->token_priority++; else { - current->mm->token_priority--; - if (unlikely(current->mm->token_priority < 0)) - current->mm->token_priority = 0; + if (likely(current->mm->token_priority > 0)) + current->mm->token_priority--; } /* Check if we deserve the token */ if (current->mm->token_priority > diff --git a/mm/vmstat.c b/mm/vmstat.c index 9832d9a41d8c..8faf27e5aa98 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -698,7 +698,7 @@ static void __devinit start_cpu_timer(int cpu) { struct delayed_work *vmstat_work = &per_cpu(vmstat_work, cpu); - INIT_DELAYED_WORK(vmstat_work, vmstat_update); + INIT_DELAYED_WORK_DEFERRABLE(vmstat_work, vmstat_update); schedule_delayed_work_on(cpu, vmstat_work, HZ + cpu); } |