Diffstat (limited to 'mm')
-rw-r--r--  mm/compaction.c     |  17
-rw-r--r--  mm/fremap.c         |   7
-rw-r--r--  mm/huge_memory.c    |   2
-rw-r--r--  mm/hugetlb.c        |   2
-rw-r--r--  mm/memcontrol.c     |  86
-rw-r--r--  mm/memory-failure.c |   6
-rw-r--r--  mm/memory.c         |   8
-rw-r--r--  mm/mempolicy.c      | 259
-rw-r--r--  mm/page_alloc.c     |  56
-rw-r--r--  mm/pagewalk.c       | 375
-rw-r--r--  mm/rmap.c           |  14
-rw-r--r--  mm/slab.c           |   7
-rw-r--r--  mm/slab.h           |  29
-rw-r--r--  mm/slab_common.c    |  19
-rw-r--r--  mm/slub.c           |  30
-rw-r--r--  mm/util.c           |  30
-rw-r--r--  mm/vmacache.c       |  19
-rw-r--r--  mm/vmscan.c         |  68
-rw-r--r--  mm/vmstat.c         |   4
19 files changed, 604 insertions, 434 deletions
diff --git a/mm/compaction.c b/mm/compaction.c index 37f976287068..9635083cd8ec 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -293,14 +293,14 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, /* Found a free page, break it into order-0 pages */ isolated = split_free_page(page); - total_isolated += isolated; - for (i = 0; i < isolated; i++) { - list_add(&page->lru, freelist); - page++; - } - - /* If a page was split, advance to the end of it */ if (isolated) { + total_isolated += isolated; + for (i = 0; i < isolated; i++) { + list_add(&page->lru, freelist); + page++; + } + + /* If a page was split, advance to the end of it */ blockpfn += isolated - 1; cursor += isolated - 1; continue; @@ -309,9 +309,6 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, isolate_fail: if (strict) break; - else - continue; - } trace_mm_compaction_isolate_freepages(nr_scanned, total_isolated); diff --git a/mm/fremap.c b/mm/fremap.c index 34feba60a17e..2c5646f11f41 100644 --- a/mm/fremap.c +++ b/mm/fremap.c @@ -82,13 +82,10 @@ static int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma, ptfile = pgoff_to_pte(pgoff); - if (!pte_none(*pte)) { - if (pte_present(*pte) && pte_soft_dirty(*pte)) - pte_file_mksoft_dirty(ptfile); + if (!pte_none(*pte)) zap_pte(mm, vma, addr, pte); - } - set_pte_at(mm, addr, pte, ptfile); + set_pte_at(mm, addr, pte, pte_file_mksoft_dirty(ptfile)); /* * We don't need to run update_mmu_cache() here because the "file pte" * being installed by install_file_pte() is not a real pte - it's a diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 64635f5278ff..7577c40f2ad7 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1800,7 +1800,7 @@ static void __split_huge_page(struct page *page, struct list_head *list) { int mapcount, mapcount2; - pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); + pgoff_t pgoff = page_pgoff(page); struct anon_vma_chain *avc; BUG_ON(!PageHead(page)); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index dd30f22b35e0..86c7c5bbc2f7 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -31,6 +31,7 @@ #include <linux/io.h> #include <linux/hugetlb.h> +#include <linux/hugetlb_inline.h> #include <linux/hugetlb_cgroup.h> #include <linux/node.h> #include "internal.h" @@ -1172,6 +1173,7 @@ static void return_unused_surplus_pages(struct hstate *h, while (nr_pages--) { if (!free_pool_huge_page(h, &node_states[N_MEMORY], 1)) break; + cond_resched_lock(&hugetlb_lock); } } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 29501f040568..e59f5729e5e6 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2944,7 +2944,7 @@ static int mem_cgroup_slabinfo_read(struct seq_file *m, void *v) } #endif -static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size) +int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size) { struct res_counter *fail_res; int ret = 0; @@ -2982,7 +2982,7 @@ static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size) return ret; } -static void memcg_uncharge_kmem(struct mem_cgroup *memcg, u64 size) +void memcg_uncharge_kmem(struct mem_cgroup *memcg, u64 size) { res_counter_uncharge(&memcg->res, size); if (do_swap_account) @@ -3531,11 +3531,12 @@ __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order) /* * Disabling accounting is only relevant for some specific memcg * internal allocations. 
Therefore we would initially not have such - * check here, since direct calls to the page allocator that are marked - * with GFP_KMEMCG only happen outside memcg core. We are mostly - * concerned with cache allocations, and by having this test at - * memcg_kmem_get_cache, we are already able to relay the allocation to - * the root cache and bypass the memcg cache altogether. + * check here, since direct calls to the page allocator that are + * accounted to kmemcg (alloc_kmem_pages and friends) only happen + * outside memcg core. We are mostly concerned with cache allocations, + * and by having this test at memcg_kmem_get_cache, we are already able + * to relay the allocation to the root cache and bypass the memcg cache + * altogether. * * There is one exception, though: the SLUB allocator does not create * large order caches, but rather service large kmallocs directly from @@ -6777,30 +6778,29 @@ static inline enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma, } #endif -static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, +static int mem_cgroup_count_precharge_pte(pte_t *pte, unsigned long addr, unsigned long end, struct mm_walk *walk) { - struct vm_area_struct *vma = walk->private; - pte_t *pte; + if (get_mctgt_type(walk->vma, addr, *pte, NULL)) + mc.precharge++; /* increment precharge temporarily */ + return 0; +} + +static int mem_cgroup_count_precharge_pmd(pmd_t *pmd, + unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + struct vm_area_struct *vma = walk->vma; spinlock_t *ptl; if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE) mc.precharge += HPAGE_PMD_NR; spin_unlock(ptl); - return 0; + /* don't call mem_cgroup_count_precharge_pte() */ + walk->skip = 1; } - - if (pmd_trans_unstable(pmd)) - return 0; - pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); - for (; addr != end; pte++, addr += PAGE_SIZE) - if (get_mctgt_type(vma, addr, *pte, NULL)) - mc.precharge++; /* increment precharge temporarily */ - pte_unmap_unlock(pte - 1, ptl); - cond_resched(); - return 0; } @@ -6809,18 +6809,14 @@ static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm) unsigned long precharge; struct vm_area_struct *vma; + struct mm_walk mem_cgroup_count_precharge_walk = { + .pmd_entry = mem_cgroup_count_precharge_pmd, + .pte_entry = mem_cgroup_count_precharge_pte, + .mm = mm, + }; down_read(&mm->mmap_sem); - for (vma = mm->mmap; vma; vma = vma->vm_next) { - struct mm_walk mem_cgroup_count_precharge_walk = { - .pmd_entry = mem_cgroup_count_precharge_pte_range, - .mm = mm, - .private = vma, - }; - if (is_vm_hugetlb_page(vma)) - continue; - walk_page_range(vma->vm_start, vma->vm_end, - &mem_cgroup_count_precharge_walk); - } + for (vma = mm->mmap; vma; vma = vma->vm_next) + walk_page_vma(vma, &mem_cgroup_count_precharge_walk); up_read(&mm->mmap_sem); precharge = mc.precharge; @@ -6959,7 +6955,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, struct mm_walk *walk) { int ret = 0; - struct vm_area_struct *vma = walk->private; + struct vm_area_struct *vma = walk->vma; pte_t *pte; spinlock_t *ptl; enum mc_target_type target_type; @@ -7060,6 +7056,10 @@ put: /* get_mctgt_type() gets the page */ static void mem_cgroup_move_charge(struct mm_struct *mm) { struct vm_area_struct *vma; + struct mm_walk mem_cgroup_move_charge_walk = { + .pmd_entry = mem_cgroup_move_charge_pte_range, + .mm = mm, + }; lru_add_drain_all(); retry: @@ -7075,24 +7075,8 @@ retry: cond_resched(); goto retry; } - for (vma = 
mm->mmap; vma; vma = vma->vm_next) { - int ret; - struct mm_walk mem_cgroup_move_charge_walk = { - .pmd_entry = mem_cgroup_move_charge_pte_range, - .mm = mm, - .private = vma, - }; - if (is_vm_hugetlb_page(vma)) - continue; - ret = walk_page_range(vma->vm_start, vma->vm_end, - &mem_cgroup_move_charge_walk); - if (ret) - /* - * means we have consumed all precharges and failed in - * doing additional charge. Just abandon here. - */ - break; - } + for (vma = mm->mmap; vma; vma = vma->vm_next) + walk_page_vma(vma, &mem_cgroup_move_charge_walk); up_read(&mm->mmap_sem); } diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 35ef28acf137..12ac5df4d49a 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -202,7 +202,7 @@ static int kill_proc(struct task_struct *t, unsigned long addr, int trapno, #ifdef __ARCH_SI_TRAPNO si.si_trapno = trapno; #endif - si.si_addr_lsb = compound_order(compound_head(page)) + PAGE_SHIFT; + si.si_addr_lsb = page_size_order(page) + PAGE_SHIFT; if ((flags & MF_ACTION_REQUIRED) && t == current) { si.si_code = BUS_MCEERR_AR; @@ -404,7 +404,7 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill, if (av == NULL) /* Not actually mapped anymore */ return; - pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); + pgoff = page_pgoff(page); read_lock(&tasklist_lock); for_each_process (tsk) { struct anon_vma_chain *vmac; @@ -437,7 +437,7 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill, mutex_lock(&mapping->i_mmap_mutex); read_lock(&tasklist_lock); for_each_process(tsk) { - pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); + pgoff_t pgoff = page_pgoff(page); if (!task_early_kill(tsk)) continue; diff --git a/mm/memory.c b/mm/memory.c index d0f0bef3be48..9f562946f5e7 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3578,6 +3578,8 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma, int dirtied = 0; int ret, tmp; + WARN_ON_ONCE(!rwsem_is_locked(&mm->mmap_sem)); + ret = __do_fault(vma, address, pgoff, flags, &fault_page); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) return ret; @@ -3608,6 +3610,12 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (set_page_dirty(fault_page)) dirtied = 1; + /* + * Take a local copy of the address_space - page.mapping may be zeroed + * by truncate after unlock_page(). The address_space itself remains + * pinned by vma->vm_file's reference. We rely on unlock_page()'s + * release semantics to prevent the compiler from undoing this copying. + */ mapping = fault_page->mapping; unlock_page(fault_page); if ((dirtied || vma->vm_ops->page_mkwrite) && mapping) { diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 78e1472933ea..9d2ef4111a4c 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -476,140 +476,70 @@ static const struct mempolicy_operations mpol_ops[MPOL_MAX] = { static void migrate_page_add(struct page *page, struct list_head *pagelist, unsigned long flags); +struct queue_pages { + struct list_head *pagelist; + unsigned long flags; + nodemask_t *nmask; + struct vm_area_struct *prev; +}; + /* * Scan through pages checking if pages follow certain conditions, * and move them to the pagelist if they do. 
*/ -static int queue_pages_pte_range(struct vm_area_struct *vma, pmd_t *pmd, - unsigned long addr, unsigned long end, - const nodemask_t *nodes, unsigned long flags, - void *private) +static int queue_pages_pte(pte_t *pte, unsigned long addr, + unsigned long next, struct mm_walk *walk) { - pte_t *orig_pte; - pte_t *pte; - spinlock_t *ptl; - - orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); - do { - struct page *page; - int nid; + struct vm_area_struct *vma = walk->vma; + struct page *page; + struct queue_pages *qp = walk->private; + unsigned long flags = qp->flags; + int nid; - if (!pte_present(*pte)) - continue; - page = vm_normal_page(vma, addr, *pte); - if (!page) - continue; - /* - * vm_normal_page() filters out zero pages, but there might - * still be PageReserved pages to skip, perhaps in a VDSO. - */ - if (PageReserved(page)) - continue; - nid = page_to_nid(page); - if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT)) - continue; + if (!pte_present(*pte)) + return 0; + page = vm_normal_page(vma, addr, *pte); + if (!page) + return 0; + /* + * vm_normal_page() filters out zero pages, but there might + * still be PageReserved pages to skip, perhaps in a VDSO. + */ + if (PageReserved(page)) + return 0; + nid = page_to_nid(page); + if (node_isset(nid, *qp->nmask) == !!(flags & MPOL_MF_INVERT)) + return 0; - if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) - migrate_page_add(page, private, flags); - else - break; - } while (pte++, addr += PAGE_SIZE, addr != end); - pte_unmap_unlock(orig_pte, ptl); - return addr != end; + if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) + migrate_page_add(page, qp->pagelist, flags); + return 0; } -static void queue_pages_hugetlb_pmd_range(struct vm_area_struct *vma, - pmd_t *pmd, const nodemask_t *nodes, unsigned long flags, - void *private) +static int queue_pages_hugetlb(pte_t *pte, unsigned long addr, + unsigned long next, struct mm_walk *walk) { #ifdef CONFIG_HUGETLB_PAGE + struct queue_pages *qp = walk->private; + unsigned long flags = qp->flags; int nid; struct page *page; - spinlock_t *ptl; + pte_t entry; - ptl = huge_pte_lock(hstate_vma(vma), vma->vm_mm, (pte_t *)pmd); - page = pte_page(huge_ptep_get((pte_t *)pmd)); + entry = huge_ptep_get(pte); + if (!pte_present(entry)) + return 0; + page = pte_page(entry); nid = page_to_nid(page); - if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT)) - goto unlock; + if (node_isset(nid, *qp->nmask) == !!(flags & MPOL_MF_INVERT)) + return 0; /* With MPOL_MF_MOVE, we migrate only unshared hugepage. 
*/ if (flags & (MPOL_MF_MOVE_ALL) || (flags & MPOL_MF_MOVE && page_mapcount(page) == 1)) - isolate_huge_page(page, private); -unlock: - spin_unlock(ptl); + isolate_huge_page(page, qp->pagelist); #else BUG(); #endif -} - -static inline int queue_pages_pmd_range(struct vm_area_struct *vma, pud_t *pud, - unsigned long addr, unsigned long end, - const nodemask_t *nodes, unsigned long flags, - void *private) -{ - pmd_t *pmd; - unsigned long next; - - pmd = pmd_offset(pud, addr); - do { - next = pmd_addr_end(addr, end); - if (!pmd_present(*pmd)) - continue; - if (pmd_huge(*pmd) && is_vm_hugetlb_page(vma)) { - queue_pages_hugetlb_pmd_range(vma, pmd, nodes, - flags, private); - continue; - } - split_huge_page_pmd(vma, addr, pmd); - if (pmd_none_or_trans_huge_or_clear_bad(pmd)) - continue; - if (queue_pages_pte_range(vma, pmd, addr, next, nodes, - flags, private)) - return -EIO; - } while (pmd++, addr = next, addr != end); - return 0; -} - -static inline int queue_pages_pud_range(struct vm_area_struct *vma, pgd_t *pgd, - unsigned long addr, unsigned long end, - const nodemask_t *nodes, unsigned long flags, - void *private) -{ - pud_t *pud; - unsigned long next; - - pud = pud_offset(pgd, addr); - do { - next = pud_addr_end(addr, end); - if (pud_huge(*pud) && is_vm_hugetlb_page(vma)) - continue; - if (pud_none_or_clear_bad(pud)) - continue; - if (queue_pages_pmd_range(vma, pud, addr, next, nodes, - flags, private)) - return -EIO; - } while (pud++, addr = next, addr != end); - return 0; -} - -static inline int queue_pages_pgd_range(struct vm_area_struct *vma, - unsigned long addr, unsigned long end, - const nodemask_t *nodes, unsigned long flags, - void *private) -{ - pgd_t *pgd; - unsigned long next; - - pgd = pgd_offset(vma->vm_mm, addr); - do { - next = pgd_addr_end(addr, end); - if (pgd_none_or_clear_bad(pgd)) - continue; - if (queue_pages_pud_range(vma, pgd, addr, next, nodes, - flags, private)) - return -EIO; - } while (pgd++, addr = next, addr != end); return 0; } @@ -642,6 +572,45 @@ static unsigned long change_prot_numa(struct vm_area_struct *vma, } #endif /* CONFIG_NUMA_BALANCING */ +static int queue_pages_test_walk(unsigned long start, unsigned long end, + struct mm_walk *walk) +{ + struct vm_area_struct *vma = walk->vma; + struct queue_pages *qp = walk->private; + unsigned long endvma = vma->vm_end; + unsigned long flags = qp->flags; + + if (endvma > end) + endvma = end; + if (vma->vm_start > start) + start = vma->vm_start; + + if (!(flags & MPOL_MF_DISCONTIG_OK)) { + if (!vma->vm_next && vma->vm_end < end) + return -EFAULT; + if (qp->prev && qp->prev->vm_end < vma->vm_start) + return -EFAULT; + } + + qp->prev = vma; + walk->skip = 1; + + if (vma->vm_flags & VM_PFNMAP) + return 0; + + if (flags & MPOL_MF_LAZY) { + change_prot_numa(vma, start, endvma); + return 0; + } + + if ((flags & MPOL_MF_STRICT) || + ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) && + vma_migratable(vma))) + /* queue pages from current vma */ + walk->skip = 0; + return 0; +} + /* * Walk through page tables and collect pages to be migrated. 
* @@ -651,51 +620,29 @@ static unsigned long change_prot_numa(struct vm_area_struct *vma, */ static struct vm_area_struct * queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end, - const nodemask_t *nodes, unsigned long flags, void *private) + nodemask_t *nodes, unsigned long flags, + struct list_head *pagelist) { int err; - struct vm_area_struct *first, *vma, *prev; - - - first = find_vma(mm, start); - if (!first) - return ERR_PTR(-EFAULT); - prev = NULL; - for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) { - unsigned long endvma = vma->vm_end; - - if (endvma > end) - endvma = end; - if (vma->vm_start > start) - start = vma->vm_start; - - if (!(flags & MPOL_MF_DISCONTIG_OK)) { - if (!vma->vm_next && vma->vm_end < end) - return ERR_PTR(-EFAULT); - if (prev && prev->vm_end < vma->vm_start) - return ERR_PTR(-EFAULT); - } - - if (flags & MPOL_MF_LAZY) { - change_prot_numa(vma, start, endvma); - goto next; - } - - if ((flags & MPOL_MF_STRICT) || - ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) && - vma_migratable(vma))) { - - err = queue_pages_pgd_range(vma, start, endvma, nodes, - flags, private); - if (err) { - first = ERR_PTR(err); - break; - } - } -next: - prev = vma; - } - return first; + struct queue_pages qp = { + .pagelist = pagelist, + .flags = flags, + .nmask = nodes, + .prev = NULL, + }; + struct mm_walk queue_pages_walk = { + .hugetlb_entry = queue_pages_hugetlb, + .pte_entry = queue_pages_pte, + .test_walk = queue_pages_test_walk, + .mm = mm, + .private = &qp, + }; + + err = walk_page_range(start, end, &queue_pages_walk); + if (err < 0) + return ERR_PTR(err); + else + return find_vma(mm, start); } /* diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5dba2933c9c0..7cfdcd808f52 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2697,7 +2697,6 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int migratetype = allocflags_to_migratetype(gfp_mask); unsigned int cpuset_mems_cookie; int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR; - struct mem_cgroup *memcg = NULL; gfp_mask &= gfp_allowed_mask; @@ -2716,13 +2715,6 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, if (unlikely(!zonelist->_zonerefs->zone)) return NULL; - /* - * Will only have any effect when __GFP_KMEMCG is set. This is - * verified in the (always inline) callee - */ - if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order)) - return NULL; - retry_cpuset: cpuset_mems_cookie = read_mems_allowed_begin(); @@ -2782,8 +2774,6 @@ out: if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) goto retry_cpuset; - memcg_kmem_commit_charge(page, memcg, order); - return page; } EXPORT_SYMBOL(__alloc_pages_nodemask); @@ -2837,27 +2827,51 @@ void free_pages(unsigned long addr, unsigned int order) EXPORT_SYMBOL(free_pages); /* - * __free_memcg_kmem_pages and free_memcg_kmem_pages will free - * pages allocated with __GFP_KMEMCG. + * alloc_kmem_pages charges newly allocated pages to the kmem resource counter + * of the current memory cgroup. * - * Those pages are accounted to a particular memcg, embedded in the - * corresponding page_cgroup. To avoid adding a hit in the allocator to search - * for that information only to find out that it is NULL for users who have no - * interest in that whatsoever, we provide these functions. - * - * The caller knows better which flags it relies on. + * It should be used when the caller would like to use kmalloc, but since the + * allocation is large, it has to fall back to the page allocator. 
+ */ +struct page *alloc_kmem_pages(gfp_t gfp_mask, unsigned int order) +{ + struct page *page; + struct mem_cgroup *memcg = NULL; + + if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order)) + return NULL; + page = alloc_pages(gfp_mask, order); + memcg_kmem_commit_charge(page, memcg, order); + return page; +} + +struct page *alloc_kmem_pages_node(int nid, gfp_t gfp_mask, unsigned int order) +{ + struct page *page; + struct mem_cgroup *memcg = NULL; + + if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order)) + return NULL; + page = alloc_pages_node(nid, gfp_mask, order); + memcg_kmem_commit_charge(page, memcg, order); + return page; +} + +/* + * __free_kmem_pages and free_kmem_pages will free pages allocated with + * alloc_kmem_pages. */ -void __free_memcg_kmem_pages(struct page *page, unsigned int order) +void __free_kmem_pages(struct page *page, unsigned int order) { memcg_kmem_uncharge_pages(page, order); __free_pages(page, order); } -void free_memcg_kmem_pages(unsigned long addr, unsigned int order) +void free_kmem_pages(unsigned long addr, unsigned int order) { if (addr != 0) { VM_BUG_ON(!virt_addr_valid((void *)addr)); - __free_memcg_kmem_pages(virt_to_page((void *)addr), order); + __free_kmem_pages(virt_to_page((void *)addr), order); } } diff --git a/mm/pagewalk.c b/mm/pagewalk.c index 2beeabf502c5..b2a075ffb96e 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -3,29 +3,58 @@ #include <linux/sched.h> #include <linux/hugetlb.h> -static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, - struct mm_walk *walk) +/* + * Check the current skip status of page table walker. + * + * Here what I mean by skip is to skip lower level walking, and that was + * determined for each entry independently. For example, when walk_pmd_range + * handles a pmd_trans_huge we don't have to walk over ptes under that pmd, + * and the skipping does not affect the walking over ptes under other pmds. + * That's why we reset @walk->skip after tested. + */ +static bool skip_lower_level_walking(struct mm_walk *walk) { + if (walk->skip) { + walk->skip = 0; + return true; + } + return false; +} + +static int walk_pte_range(pmd_t *pmd, unsigned long addr, + unsigned long end, struct mm_walk *walk) +{ + struct mm_struct *mm = walk->mm; pte_t *pte; + pte_t *orig_pte; + spinlock_t *ptl; int err = 0; - pte = pte_offset_map(pmd, addr); - for (;;) { + orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl); + do { + if (pte_none(*pte)) { + if (walk->pte_hole) + err = walk->pte_hole(addr, addr + PAGE_SIZE, + walk); + if (err) + break; + continue; + } + /* + * Callers should have their own way to handle swap entries + * in walk->pte_entry(). + */ err = walk->pte_entry(pte, addr, addr + PAGE_SIZE, walk); if (err) break; - addr += PAGE_SIZE; - if (addr == end) - break; - pte++; - } - - pte_unmap(pte); - return err; + } while (pte++, addr += PAGE_SIZE, addr < end); + pte_unmap_unlock(orig_pte, ptl); + cond_resched(); + return addr == end ? 
0 : err; } -static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end, - struct mm_walk *walk) +static int walk_pmd_range(pud_t *pud, unsigned long addr, + unsigned long end, struct mm_walk *walk) { pmd_t *pmd; unsigned long next; @@ -35,6 +64,7 @@ static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end, do { again: next = pmd_addr_end(addr, end); + if (pmd_none(*pmd)) { if (walk->pte_hole) err = walk->pte_hole(addr, next, walk); @@ -42,35 +72,32 @@ again: break; continue; } - /* - * This implies that each ->pmd_entry() handler - * needs to know about pmd_trans_huge() pmds - */ - if (walk->pmd_entry) - err = walk->pmd_entry(pmd, addr, next, walk); - if (err) - break; - /* - * Check this here so we only break down trans_huge - * pages when we _need_ to - */ - if (!walk->pte_entry) - continue; + if (walk->pmd_entry) { + err = walk->pmd_entry(pmd, addr, next, walk); + if (skip_lower_level_walking(walk)) + continue; + if (err) + break; + } - split_huge_page_pmd_mm(walk->mm, addr, pmd); - if (pmd_none_or_trans_huge_or_clear_bad(pmd)) - goto again; - err = walk_pte_range(pmd, addr, next, walk); - if (err) - break; - } while (pmd++, addr = next, addr != end); + if (walk->pte_entry) { + if (walk->vma) { + split_huge_page_pmd(walk->vma, addr, pmd); + if (pmd_trans_unstable(pmd)) + goto again; + } + err = walk_pte_range(pmd, addr, next, walk); + if (err) + break; + } + } while (pmd++, addr = next, addr < end); return err; } -static int walk_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end, - struct mm_walk *walk) +static int walk_pud_range(pgd_t *pgd, unsigned long addr, + unsigned long end, struct mm_walk *walk) { pud_t *pud; unsigned long next; @@ -79,6 +106,7 @@ static int walk_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end, pud = pud_offset(pgd, addr); do { next = pud_addr_end(addr, end); + if (pud_none_or_clear_bad(pud)) { if (walk->pte_hole) err = walk->pte_hole(addr, next, walk); @@ -86,13 +114,58 @@ static int walk_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end, break; continue; } - if (walk->pud_entry) + + if (walk->pud_entry) { err = walk->pud_entry(pud, addr, next, walk); - if (!err && (walk->pmd_entry || walk->pte_entry)) + if (skip_lower_level_walking(walk)) + continue; + if (err) + break; + } + + if (walk->pmd_entry || walk->pte_entry) { err = walk_pmd_range(pud, addr, next, walk); - if (err) - break; - } while (pud++, addr = next, addr != end); + if (err) + break; + } + } while (pud++, addr = next, addr < end); + + return err; +} + +static int walk_pgd_range(unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + pgd_t *pgd; + unsigned long next; + int err = 0; + + pgd = pgd_offset(walk->mm, addr); + do { + next = pgd_addr_end(addr, end); + + if (pgd_none_or_clear_bad(pgd)) { + if (walk->pte_hole) + err = walk->pte_hole(addr, next, walk); + if (err) + break; + continue; + } + + if (walk->pgd_entry) { + err = walk->pgd_entry(pgd, addr, next, walk); + if (skip_lower_level_walking(walk)) + continue; + if (err) + break; + } + + if (walk->pud_entry || walk->pmd_entry || walk->pte_entry) { + err = walk_pud_range(pgd, addr, next, walk); + if (err) + break; + } + } while (pgd++, addr = next, addr < end); return err; } @@ -105,144 +178,180 @@ static unsigned long hugetlb_entry_end(struct hstate *h, unsigned long addr, return boundary < end ? 
boundary : end; } -static int walk_hugetlb_range(struct vm_area_struct *vma, - unsigned long addr, unsigned long end, - struct mm_walk *walk) +static int walk_hugetlb_range(unsigned long addr, unsigned long end, + struct mm_walk *walk) { + struct mm_struct *mm = walk->mm; + struct vm_area_struct *vma = walk->vma; struct hstate *h = hstate_vma(vma); unsigned long next; unsigned long hmask = huge_page_mask(h); pte_t *pte; int err = 0; + spinlock_t *ptl; do { next = hugetlb_entry_end(h, addr, end); pte = huge_pte_offset(walk->mm, addr & hmask); - if (pte && walk->hugetlb_entry) - err = walk->hugetlb_entry(pte, hmask, addr, next, walk); + if (!pte) + continue; + ptl = huge_pte_lock(h, mm, pte); + /* + * Callers should have their own way to handle swap entries + * in walk->hugetlb_entry(). + */ + if (walk->hugetlb_entry) + err = walk->hugetlb_entry(pte, addr, next, walk); + spin_unlock(ptl); if (err) - return err; + break; } while (addr = next, addr != end); - - return 0; + cond_resched(); + return err; } #else /* CONFIG_HUGETLB_PAGE */ -static int walk_hugetlb_range(struct vm_area_struct *vma, - unsigned long addr, unsigned long end, - struct mm_walk *walk) +static inline int walk_hugetlb_range(unsigned long addr, unsigned long end, + struct mm_walk *walk) { return 0; } #endif /* CONFIG_HUGETLB_PAGE */ +/* + * Decide whether we really walk over the current vma on [@start, @end) + * or skip it. When we skip it, we set @walk->skip to 1. + * The return value is used to control the page table walking to + * continue (for zero) or not (for non-zero). + * + * Default check (only VM_PFNMAP check for now) is used when the caller + * doesn't define test_walk() callback. + */ +static int walk_page_test(unsigned long start, unsigned long end, + struct mm_walk *walk) +{ + struct vm_area_struct *vma = walk->vma; + + if (walk->test_walk) + return walk->test_walk(start, end, walk); + /* + * Do not walk over vma(VM_PFNMAP), because we have no valid struct + * page backing a VM_PFNMAP range. See also commit a9ff785e4437. + */ + if (vma->vm_flags & VM_PFNMAP) + walk->skip = 1; + return 0; +} + +static int __walk_page_range(unsigned long start, unsigned long end, + struct mm_walk *walk) +{ + int err = 0; + struct vm_area_struct *vma = walk->vma; + + if (vma && is_vm_hugetlb_page(vma)) { + if (walk->hugetlb_entry) + err = walk_hugetlb_range(start, end, walk); + } else + err = walk_pgd_range(start, end, walk); + + return err; +} /** - * walk_page_range - walk a memory map's page tables with a callback - * @addr: starting address - * @end: ending address - * @walk: set of callbacks to invoke for each level of the tree + * walk_page_range - walk page table with caller specific callbacks + * + * Recursively walk the page table tree of the process represented by + * @walk->mm within the virtual address range [@start, @end). In walking, + * we can call caller-specific callback functions against each entry. * - * Recursively walk the page table for the memory area in a VMA, - * calling supplied callbacks. Callbacks are called in-order (first - * PGD, first PUD, first PMD, first PTE, second PTE... second PMD, - * etc.). If lower-level callbacks are omitted, walking depth is reduced. + * Before starting to walk page table, some callers want to check whether + * they really want to walk over the vma (for example by checking vm_flags.) + * walk_page_test() and @walk->test_walk() do that check. 
* - * Each callback receives an entry pointer and the start and end of the - * associated range, and a copy of the original mm_walk for access to - * the ->private or ->mm fields. + * If any callback returns a non-zero value, the page table walk is aborted + * immediately and the return value is propagated back to the caller. + * Note that the meaning of the positive returned value can be defined + * by the caller for its own purpose. * - * Usually no locks are taken, but splitting transparent huge page may - * take page table lock. And the bottom level iterator will map PTE - * directories from highmem if necessary. + * If the caller defines multiple callbacks in different levels, the + * callbacks are called in depth-first manner. It could happen that + * multiple callbacks are called on a address. For example if some caller + * defines test_walk(), pmd_entry(), and pte_entry(), then callbacks are + * called in the order of test_walk(), pmd_entry(), and pte_entry(). + * If you don't want to go down to lower level at some point and move to + * the next entry in the same level, you set @walk->skip to 1. + * For example if you succeed to handle some pmd entry as trans_huge entry, + * you need not call walk_pte_range() any more, so set it to avoid that. + * We can't determine whether to go down to lower level with the return + * value of the callback, because the whole range of return values (0, >0, + * and <0) are used up for other meanings. * - * If any callback returns a non-zero value, the walk is aborted and - * the return value is propagated back to the caller. Otherwise 0 is returned. + * Each callback can access to the vma over which it is doing page table + * walk right now via @walk->vma. @walk->vma is set to NULL in walking + * outside a vma. If you want to access to some caller-specific data from + * callbacks, @walk->private should be helpful. * - * walk->mm->mmap_sem must be held for at least read if walk->hugetlb_entry - * is !NULL. + * The callers should hold @walk->mm->mmap_sem. Note that the lower level + * iterators can take page table lock in lowest level iteration and/or + * in split_huge_page_pmd(). */ -int walk_page_range(unsigned long addr, unsigned long end, +int walk_page_range(unsigned long start, unsigned long end, struct mm_walk *walk) { - pgd_t *pgd; - unsigned long next; int err = 0; + struct vm_area_struct *vma; + unsigned long next; - if (addr >= end) - return err; + if (start >= end) + return -EINVAL; if (!walk->mm) return -EINVAL; VM_BUG_ON(!rwsem_is_locked(&walk->mm->mmap_sem)); - pgd = pgd_offset(walk->mm, addr); do { - struct vm_area_struct *vma = NULL; + vma = find_vma(walk->mm, start); + if (!vma) { /* after the last vma */ + walk->vma = NULL; + next = end; + } else if (start < vma->vm_start) { /* outside the found vma */ + walk->vma = NULL; + next = vma->vm_start; + } else { /* inside the found vma */ + walk->vma = vma; + next = min(end, vma->vm_end); - next = pgd_addr_end(addr, end); - - /* - * This function was not intended to be vma based. - * But there are vma special cases to be handled: - * - hugetlb vma's - * - VM_PFNMAP vma's - */ - vma = find_vma(walk->mm, addr); - if (vma) { - /* - * There are no page structures backing a VM_PFNMAP - * range, so do not allow split_huge_page_pmd(). 
- */ - if ((vma->vm_start <= addr) && - (vma->vm_flags & VM_PFNMAP)) { - next = vma->vm_end; - pgd = pgd_offset(walk->mm, next); - continue; - } - /* - * Handle hugetlb vma individually because pagetable - * walk for the hugetlb page is dependent on the - * architecture and we can't handled it in the same - * manner as non-huge pages. - */ - if (walk->hugetlb_entry && (vma->vm_start <= addr) && - is_vm_hugetlb_page(vma)) { - if (vma->vm_end < next) - next = vma->vm_end; - /* - * Hugepage is very tightly coupled with vma, - * so walk through hugetlb entries within a - * given vma. - */ - err = walk_hugetlb_range(vma, addr, next, walk); - if (err) - break; - pgd = pgd_offset(walk->mm, next); + err = walk_page_test(start, next, walk); + if (skip_lower_level_walking(walk)) continue; - } - } - - if (pgd_none_or_clear_bad(pgd)) { - if (walk->pte_hole) - err = walk->pte_hole(addr, next, walk); if (err) break; - pgd++; - continue; } - if (walk->pgd_entry) - err = walk->pgd_entry(pgd, addr, next, walk); - if (!err && - (walk->pud_entry || walk->pmd_entry || walk->pte_entry)) - err = walk_pud_range(pgd, addr, next, walk); + err = __walk_page_range(start, next, walk); if (err) break; - pgd++; - } while (addr = next, addr < end); - + } while (start = next, start < end); return err; } + +int walk_page_vma(struct vm_area_struct *vma, struct mm_walk *walk) +{ + int err; + + if (!walk->mm) + return -EINVAL; + + VM_BUG_ON(!rwsem_is_locked(&walk->mm->mmap_sem)); + VM_BUG_ON(!vma); + walk->vma = vma; + err = walk_page_test(vma->vm_start, vma->vm_end, walk); + if (skip_lower_level_walking(walk)) + return 0; + if (err) + return err; + return __walk_page_range(vma->vm_start, vma->vm_end, walk); +} diff --git a/mm/rmap.c b/mm/rmap.c index 9c3e77396d1a..e065ba798fde 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -515,11 +515,7 @@ void page_unlock_anon_vma_read(struct anon_vma *anon_vma) static inline unsigned long __vma_address(struct page *page, struct vm_area_struct *vma) { - pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); - - if (unlikely(is_vm_hugetlb_page(vma))) - pgoff = page->index << huge_page_order(page_hstate(page)); - + pgoff_t pgoff = page_pgoff(page); return vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); } @@ -1024,7 +1020,7 @@ void page_add_new_anon_rmap(struct page *page, __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, hpage_nr_pages(page)); __page_set_anon_rmap(page, vma, address, 1); - if (!mlocked_vma_newpage(vma, page)) { + if (!mlocked_vma_newpage(vma, page) && !PageUnevictable(page)) { SetPageActive(page); lru_cache_add(page); } else @@ -1359,7 +1355,7 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, if (page->index != linear_page_index(vma, address)) { pte_t ptfile = pgoff_to_pte(page->index); if (pte_soft_dirty(pteval)) - pte_file_mksoft_dirty(ptfile); + ptfile = pte_file_mksoft_dirty(ptfile); set_pte_at(mm, address, pte, ptfile); } @@ -1609,7 +1605,7 @@ static struct anon_vma *rmap_walk_anon_lock(struct page *page, static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc) { struct anon_vma *anon_vma; - pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); + pgoff_t pgoff = page_pgoff(page); struct anon_vma_chain *avc; int ret = SWAP_AGAIN; @@ -1650,7 +1646,7 @@ static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc) static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc) { struct address_space *mapping = page->mapping; - pgoff_t pgoff = page->index << 
compound_order(page); + pgoff_t pgoff = page_pgoff(page); struct vm_area_struct *vma; int ret = SWAP_AGAIN; diff --git a/mm/slab.c b/mm/slab.c index 388cb1ae6fbc..cbcd2fa7af2f 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1681,8 +1681,12 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, if (cachep->flags & SLAB_RECLAIM_ACCOUNT) flags |= __GFP_RECLAIMABLE; + if (memcg_charge_slab(cachep, flags, cachep->gfporder)) + return NULL; + page = alloc_pages_exact_node(nodeid, flags | __GFP_NOTRACK, cachep->gfporder); if (!page) { + memcg_uncharge_slab(cachep, cachep->gfporder); if (!(flags & __GFP_NOWARN) && printk_ratelimit()) slab_out_of_memory(cachep, flags, nodeid); return NULL; @@ -1741,7 +1745,8 @@ static void kmem_freepages(struct kmem_cache *cachep, struct page *page) memcg_release_pages(cachep, cachep->gfporder); if (current->reclaim_state) current->reclaim_state->reclaimed_slab += nr_freed; - __free_memcg_kmem_pages(page, cachep->gfporder); + __free_pages(page, cachep->gfporder); + memcg_uncharge_slab(cachep, cachep->gfporder); } static void kmem_rcu_free(struct rcu_head *head) diff --git a/mm/slab.h b/mm/slab.h index 3045316b7c9d..3db3c52f80a2 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -191,6 +191,26 @@ static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s) return s; return s->memcg_params->root_cache; } + +static __always_inline int memcg_charge_slab(struct kmem_cache *s, + gfp_t gfp, int order) +{ + if (!memcg_kmem_enabled()) + return 0; + if (is_root_cache(s)) + return 0; + return memcg_charge_kmem(s->memcg_params->memcg, gfp, + PAGE_SIZE << order); +} + +static __always_inline void memcg_uncharge_slab(struct kmem_cache *s, int order) +{ + if (!memcg_kmem_enabled()) + return; + if (is_root_cache(s)) + return; + memcg_uncharge_kmem(s->memcg_params->memcg, PAGE_SIZE << order); +} #else static inline bool is_root_cache(struct kmem_cache *s) { @@ -226,6 +246,15 @@ static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s) { return s; } + +static inline int memcg_charge_slab(struct kmem_cache *s, gfp_t gfp, int order) +{ + return 0; +} + +static inline void memcg_uncharge_slab(struct kmem_cache *s, int order) +{ +} #endif static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) diff --git a/mm/slab_common.c b/mm/slab_common.c index f3cfccf76dda..963f0376e4c1 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -290,12 +290,8 @@ void kmem_cache_create_memcg(struct mem_cgroup *memcg, struct kmem_cache *root_c root_cache->size, root_cache->align, root_cache->flags, root_cache->ctor, memcg, root_cache); - if (IS_ERR(s)) { + if (IS_ERR(s)) kfree(cache_name); - goto out_unlock; - } - - s->allocflags |= __GFP_KMEMCG; out_unlock: mutex_unlock(&slab_mutex); @@ -577,6 +573,19 @@ void __init create_kmalloc_caches(unsigned long flags) } #endif /* !CONFIG_SLOB */ +void *kmalloc_order(size_t size, gfp_t flags, unsigned int order) +{ + void *ret; + struct page *page; + + flags |= __GFP_COMP; + page = alloc_kmem_pages(flags, order); + ret = page ? 
page_address(page) : NULL; + kmemleak_alloc(ret, size, 1, flags); + return ret; +} +EXPORT_SYMBOL(kmalloc_order); + #ifdef CONFIG_TRACING void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order) { diff --git a/mm/slub.c b/mm/slub.c index 5e234f1f8853..fa7a1817835e 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1317,17 +1317,26 @@ static inline void slab_free_hook(struct kmem_cache *s, void *x) /* * Slab allocation and freeing */ -static inline struct page *alloc_slab_page(gfp_t flags, int node, - struct kmem_cache_order_objects oo) +static inline struct page *alloc_slab_page(struct kmem_cache *s, + gfp_t flags, int node, struct kmem_cache_order_objects oo) { + struct page *page; int order = oo_order(oo); flags |= __GFP_NOTRACK; + if (memcg_charge_slab(s, flags, order)) + return NULL; + if (node == NUMA_NO_NODE) - return alloc_pages(flags, order); + page = alloc_pages(flags, order); else - return alloc_pages_exact_node(node, flags, order); + page = alloc_pages_exact_node(node, flags, order); + + if (!page) + memcg_uncharge_slab(s, order); + + return page; } static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) @@ -1349,7 +1358,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) */ alloc_gfp = (flags | __GFP_NOWARN | __GFP_NORETRY) & ~__GFP_NOFAIL; - page = alloc_slab_page(alloc_gfp, node, oo); + page = alloc_slab_page(s, alloc_gfp, node, oo); if (unlikely(!page)) { oo = s->min; alloc_gfp = flags; @@ -1357,7 +1366,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) * Allocation may have failed due to fragmentation. * Try a lower order alloc if possible */ - page = alloc_slab_page(alloc_gfp, node, oo); + page = alloc_slab_page(s, alloc_gfp, node, oo); if (page) stat(s, ORDER_FALLBACK); @@ -1473,7 +1482,8 @@ static void __free_slab(struct kmem_cache *s, struct page *page) page_mapcount_reset(page); if (current->reclaim_state) current->reclaim_state->reclaimed_slab += pages; - __free_memcg_kmem_pages(page, order); + __free_pages(page, order); + memcg_uncharge_slab(s, order); } #define need_reserve_slab_rcu \ @@ -3325,8 +3335,8 @@ static void *kmalloc_large_node(size_t size, gfp_t flags, int node) struct page *page; void *ptr = NULL; - flags |= __GFP_COMP | __GFP_NOTRACK | __GFP_KMEMCG; - page = alloc_pages_node(node, flags, get_order(size)); + flags |= __GFP_COMP | __GFP_NOTRACK; + page = alloc_kmem_pages_node(node, flags, get_order(size)); if (page) ptr = page_address(page); @@ -3395,7 +3405,7 @@ void kfree(const void *x) if (unlikely(!PageSlab(page))) { BUG_ON(!PageCompound(page)); kfree_hook(x); - __free_memcg_kmem_pages(page, compound_order(page)); + __free_kmem_pages(page, compound_order(page)); return; } slab_free(page->slab_cache, page, object, _RET_IP_); diff --git a/mm/util.c b/mm/util.c index f380af7ea779..efadeaaef81e 100644 --- a/mm/util.c +++ b/mm/util.c @@ -3,6 +3,7 @@ #include <linux/string.h> #include <linux/compiler.h> #include <linux/export.h> +#include <linux/ctype.h> #include <linux/err.h> #include <linux/sched.h> #include <linux/security.h> @@ -64,6 +65,35 @@ char *kstrndup(const char *s, size_t max, gfp_t gfp) EXPORT_SYMBOL(kstrndup); /** + * kstrimdup - Trim and copy a %NUL terminated string. 
+ * @s: the string to trim and duplicate + * @gfp: the GFP mask used in the kmalloc() call when allocating memory + * + * Returns an address, which the caller must kfree, containing + * a duplicate of the passed string with leading and/or trailing + * whitespace (as defined by isspace) removed. + */ +char *kstrimdup(const char *s, gfp_t gfp) +{ + char *buf; + char *begin = skip_spaces(s); + size_t len = strlen(begin); + + while (len && isspace(begin[len - 1])) + len--; + + buf = kmalloc_track_caller(len + 1, gfp); + if (!buf) + return NULL; + + memcpy(buf, begin, len); + buf[len] = '\0'; + + return buf; +} +EXPORT_SYMBOL(kstrimdup); + +/** * kmemdup - duplicate region of memory * * @src: memory region to duplicate diff --git a/mm/vmacache.c b/mm/vmacache.c index d4224b397c0e..61c38ae9f54b 100644 --- a/mm/vmacache.c +++ b/mm/vmacache.c @@ -17,6 +17,16 @@ void vmacache_flush_all(struct mm_struct *mm) { struct task_struct *g, *p; + /* + * Single threaded tasks need not iterate the entire + * list of process. We can avoid the flushing as well + * since the mm's seqnum was increased and don't have + * to worry about other threads' seqnum. Current's + * flush will occur upon the next lookup. + */ + if (atomic_read(&mm->mm_users) == 1) + return; + rcu_read_lock(); for_each_process_thread(g, p) { /* @@ -78,11 +88,14 @@ struct vm_area_struct *vmacache_find(struct mm_struct *mm, unsigned long addr) if (!vmacache_valid(mm)) return NULL; + count_vm_vmacache_event(VMACACHE_FIND_CALLS); + for (i = 0; i < VMACACHE_SIZE; i++) { struct vm_area_struct *vma = current->vmacache[i]; if (vma && vma->vm_start <= addr && vma->vm_end > addr) { BUG_ON(vma->vm_mm != mm); + count_vm_vmacache_event(VMACACHE_FIND_HITS); return vma; } } @@ -100,11 +113,15 @@ struct vm_area_struct *vmacache_find_exact(struct mm_struct *mm, if (!vmacache_valid(mm)) return NULL; + count_vm_vmacache_event(VMACACHE_FIND_CALLS); + for (i = 0; i < VMACACHE_SIZE; i++) { struct vm_area_struct *vma = current->vmacache[i]; - if (vma && vma->vm_start == start && vma->vm_end == end) + if (vma && vma->vm_start == start && vma->vm_end == end) { + count_vm_vmacache_event(VMACACHE_FIND_HITS); return vma; + } } return NULL; diff --git a/mm/vmscan.c b/mm/vmscan.c index 9b6497eda806..6cdda104f629 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1158,7 +1158,7 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone, TTU_UNMAP|TTU_IGNORE_ACCESS, &dummy1, &dummy2, &dummy3, &dummy4, &dummy5, true); list_splice(&clean_pages, page_list); - __mod_zone_page_state(zone, NR_ISOLATED_FILE, -ret); + mod_zone_page_state(zone, NR_ISOLATED_FILE, -ret); return ret; } @@ -1866,6 +1866,8 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, bool force_scan = false; unsigned long ap, fp; enum lru_list lru; + bool some_scanned; + int pass; /* * If the zone or memcg is small, nr[l] can be 0. This @@ -1971,39 +1973,49 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, fraction[1] = fp; denominator = ap + fp + 1; out: - for_each_evictable_lru(lru) { - int file = is_file_lru(lru); - unsigned long size; - unsigned long scan; + some_scanned = false; + /* Only use force_scan on second pass. 
*/ + for (pass = 0; !some_scanned && pass < 2; pass++) { + for_each_evictable_lru(lru) { + int file = is_file_lru(lru); + unsigned long size; + unsigned long scan; - size = get_lru_size(lruvec, lru); - scan = size >> sc->priority; + size = get_lru_size(lruvec, lru); + scan = size >> sc->priority; - if (!scan && force_scan) - scan = min(size, SWAP_CLUSTER_MAX); + if (!scan && pass && force_scan) + scan = min(size, SWAP_CLUSTER_MAX); - switch (scan_balance) { - case SCAN_EQUAL: - /* Scan lists relative to size */ - break; - case SCAN_FRACT: + switch (scan_balance) { + case SCAN_EQUAL: + /* Scan lists relative to size */ + break; + case SCAN_FRACT: + /* + * Scan types proportional to swappiness and + * their relative recent reclaim efficiency. + */ + scan = div64_u64(scan * fraction[file], + denominator); + break; + case SCAN_FILE: + case SCAN_ANON: + /* Scan one type exclusively */ + if ((scan_balance == SCAN_FILE) != file) + scan = 0; + break; + default: + /* Look ma, no brain */ + BUG(); + } + nr[lru] = scan; /* - * Scan types proportional to swappiness and - * their relative recent reclaim efficiency. + * Skip the second pass and don't force_scan, + * if we found something to scan. */ - scan = div64_u64(scan * fraction[file], denominator); - break; - case SCAN_FILE: - case SCAN_ANON: - /* Scan one type exclusively */ - if ((scan_balance == SCAN_FILE) != file) - scan = 0; - break; - default: - /* Look ma, no brain */ - BUG(); + some_scanned |= !!scan; } - nr[lru] = scan; } } diff --git a/mm/vmstat.c b/mm/vmstat.c index 302dd076b8bf..82ce17ce58c4 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -866,6 +866,10 @@ const char * const vmstat_text[] = { "nr_tlb_local_flush_one", #endif /* CONFIG_DEBUG_TLBFLUSH */ +#ifdef CONFIG_DEBUG_VM_VMACACHE + "vmacache_find_calls", + "vmacache_find_hits", +#endif #endif /* CONFIG_VM_EVENTS_COUNTERS */ }; #endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA */ |
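
For reference, a minimal sketch of a caller written against the reworked page table walker above (per-walk ->vma, ->test_walk(), and the ->skip flag), in the style of the queue_pages_range() and mem_cgroup_count_precharge() conversions. The names count_resident(), count_pte() and count_test() are hypothetical and not part of this series:

#include <linux/mm.h>

/* Called only for non-none ptes; walk_pte_range() already holds the ptl. */
static int count_pte(pte_t *pte, unsigned long addr,
		     unsigned long next, struct mm_walk *walk)
{
	unsigned long *nr = walk->private;

	if (pte_present(*pte))
		(*nr)++;
	return 0;
}

/* Defining test_walk() overrides the walker's default VM_PFNMAP check. */
static int count_test(unsigned long start, unsigned long end,
		      struct mm_walk *walk)
{
	if (walk->vma->vm_flags & VM_PFNMAP)
		walk->skip = 1;	/* reset by the walker after each test */
	return 0;
}

static unsigned long count_resident(struct mm_struct *mm,
				    unsigned long start, unsigned long end)
{
	unsigned long nr = 0;
	struct mm_walk walk = {
		.pte_entry	= count_pte,
		.test_walk	= count_test,
		.mm		= mm,
		.private	= &nr,
	};

	/* Without a pmd_entry, huge pmds are split before walk_pte_range(). */
	down_read(&mm->mmap_sem);
	walk_page_range(start, end, &walk);
	up_read(&mm->mmap_sem);
	return nr;
}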
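Likewise, a sketch of the kmemcg-accounted page allocation entry points that replace __GFP_KMEMCG in this series, alloc_kmem_pages() and free_kmem_pages(); big_buf_alloc()/big_buf_free() are made-up callers:

#include <linux/gfp.h>

/* A large allocation charged to the current memcg's kmem counter. */
static void *big_buf_alloc(size_t size)
{
	struct page *page;

	/* No __GFP_KMEMCG: the charge is implied by the entry point. */
	page = alloc_kmem_pages(GFP_KERNEL, get_order(size));
	return page ? page_address(page) : NULL;
}

static void big_buf_free(void *buf, size_t size)
{
	if (buf)
		free_kmem_pages((unsigned long)buf, get_order(size));
}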
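And a short usage sketch for the new kstrimdup() helper added to mm/util.c; set_label() and its input are illustrative only:

#include <linux/printk.h>
#include <linux/slab.h>
#include <linux/string.h>

static int set_label(const char *input)
{
	/* "  foo\n" is duplicated as "foo"; the copy must be kfree'd. */
	char *label = kstrimdup(input, GFP_KERNEL);

	if (!label)
		return -ENOMEM;
	pr_info("label: '%s'\n", label);
	kfree(label);
	return 0;
}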