From 6b2dbba8b6ac4df26f72eda1e5ea7bab9f950e08 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Mon, 8 Oct 2012 16:31:25 -0700 Subject: mm: replace vma prio_tree with an interval tree Implement an interval tree as a replacement for the VMA prio_tree. The algorithms are similar to lib/interval_tree.c; however that code can't be directly reused as the interval endpoints are not explicitly stored in the VMA. So instead, the common algorithm is moved into a template and the details (node type, how to get interval endpoints from the node, etc) are filled in using the C preprocessor. Once the interval tree functions are available, using them as a replacement to the VMA prio tree is a relatively simple, mechanical job. Signed-off-by: Michel Lespinasse Cc: Rik van Riel Cc: Hillf Danton Cc: Peter Zijlstra Cc: Catalin Marinas Cc: Andrea Arcangeli Cc: David Woodhouse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/rmap.c | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) (limited to 'mm/rmap.c') diff --git a/mm/rmap.c b/mm/rmap.c index 0f3b7cda2a24..7b5b51d25fc5 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -820,7 +820,6 @@ static int page_referenced_file(struct page *page, struct address_space *mapping = page->mapping; pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); struct vm_area_struct *vma; - struct prio_tree_iter iter; int referenced = 0; /* @@ -846,7 +845,7 @@ static int page_referenced_file(struct page *page, */ mapcount = page_mapcount(page); - vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { + vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { unsigned long address = vma_address(page, vma); if (address == -EFAULT) continue; @@ -945,13 +944,12 @@ static int page_mkclean_file(struct address_space *mapping, struct page *page) { pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); struct vm_area_struct *vma; - struct prio_tree_iter iter; int ret = 0; BUG_ON(PageAnon(page)); mutex_lock(&mapping->i_mmap_mutex); - vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { + vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { if (vma->vm_flags & VM_SHARED) { unsigned long address = vma_address(page, vma); if (address == -EFAULT) @@ -1547,7 +1545,6 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) struct address_space *mapping = page->mapping; pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); struct vm_area_struct *vma; - struct prio_tree_iter iter; int ret = SWAP_AGAIN; unsigned long cursor; unsigned long max_nl_cursor = 0; @@ -1555,7 +1552,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) unsigned int mapcount; mutex_lock(&mapping->i_mmap_mutex); - vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { + vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { unsigned long address = vma_address(page, vma); if (address == -EFAULT) continue; @@ -1576,7 +1573,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) goto out; list_for_each_entry(vma, &mapping->i_mmap_nonlinear, - shared.vm_set.list) { + shared.nonlinear) { cursor = (unsigned long) vma->vm_private_data; if (cursor > max_nl_cursor) max_nl_cursor = cursor; @@ -1608,7 +1605,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) do { list_for_each_entry(vma, &mapping->i_mmap_nonlinear, - shared.vm_set.list) { + shared.nonlinear) { cursor = (unsigned long) vma->vm_private_data; while ( cursor < 
max_nl_cursor && cursor < vma->vm_end - vma->vm_start) { @@ -1631,7 +1628,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) * in locked vmas). Reset cursor on all unreserved nonlinear * vmas, now forgetting on which ones it had fallen behind. */ - list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list) + list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.nonlinear) vma->vm_private_data = NULL; out: mutex_unlock(&mapping->i_mmap_mutex); @@ -1748,13 +1745,12 @@ static int rmap_walk_file(struct page *page, int (*rmap_one)(struct page *, struct address_space *mapping = page->mapping; pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); struct vm_area_struct *vma; - struct prio_tree_iter iter; int ret = SWAP_AGAIN; if (!mapping) return ret; mutex_lock(&mapping->i_mmap_mutex); - vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { + vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { unsigned long address = vma_address(page, vma); if (address == -EFAULT) continue; -- cgit v1.2.3 From 108d6642ad81bb1d62b401490a334d2c12397517 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Mon, 8 Oct 2012 16:31:36 -0700 Subject: mm anon rmap: remove anon_vma_moveto_tail mremap() had a clever optimization where move_ptes() did not take the anon_vma lock to avoid a race with anon rmap users such as page migration. Instead, the avc's were ordered in such a way that the origin vma was always visited by rmap before the destination. This ordering and the use of page table locks made rmap usage safe. However, we want to replace the use of linked lists in anon rmap with an interval tree, and this will make it harder to impose such ordering as the interval tree will always be sorted by the avc->vma->vm_pgoff value. For now, let's replace the anon_vma_moveto_tail() ordering function with proper anon_vma locking in move_ptes(). Once we have the anon interval tree in place, we will re-introduce an optimization to avoid taking these locks in the most common cases.
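In outline, move_ptes() after this patch takes its locks as in the following simplified sketch (condensed from the mm/mremap.c hunk below; only the anon_vma lock/unlock calls are new in this patch, the PTE copy loop and page table spinlocks are omitted, and the function name is a placeholder):

	static void move_ptes_locking_sketch(struct vm_area_struct *vma)
	{
		struct address_space *mapping = NULL;
		struct anon_vma *anon_vma = vma->anon_vma;

		if (vma->vm_file) {
			/* serialize against file rmap walks on this mapping */
			mapping = vma->vm_file->f_mapping;
			mutex_lock(&mapping->i_mmap_mutex);
		}
		if (anon_vma)
			/* serialize against anon rmap walks (e.g. page migration) */
			anon_vma_lock(anon_vma);

		/* ... move the ptes under the old/new page table locks ... */

		if (anon_vma)
			anon_vma_unlock(anon_vma);
		if (mapping)
			mutex_unlock(&mapping->i_mmap_mutex);
	}

With the anon_vma lock held across the move, rmap walkers can no longer race with the PTE relocation, so the old src-before-dst ordering of the avc list is no longer needed.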
Signed-off-by: Michel Lespinasse Cc: Andrea Arcangeli Cc: Rik van Riel Cc: Peter Zijlstra Cc: Daniel Santos Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/rmap.h | 1 - mm/mmap.c | 3 +-- mm/mremap.c | 14 +++++--------- mm/rmap.c | 45 --------------------------------------------- 4 files changed, 6 insertions(+), 57 deletions(-) (limited to 'mm/rmap.c') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 3fce545df394..7f32cec57e67 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -120,7 +120,6 @@ void anon_vma_init(void); /* create anon_vma_cachep */ int anon_vma_prepare(struct vm_area_struct *); void unlink_anon_vmas(struct vm_area_struct *); int anon_vma_clone(struct vm_area_struct *, struct vm_area_struct *); -void anon_vma_moveto_tail(struct vm_area_struct *); int anon_vma_fork(struct vm_area_struct *, struct vm_area_struct *); static inline void anon_vma_merge(struct vm_area_struct *vma, diff --git a/mm/mmap.c b/mm/mmap.c index 5ac533f88e99..66984aab7915 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2378,8 +2378,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, */ VM_BUG_ON(faulted_in_anon_vma); *vmap = new_vma; - } else - anon_vma_moveto_tail(new_vma); + } } else { new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); if (new_vma) { diff --git a/mm/mremap.c b/mm/mremap.c index cc06d0e48d05..5588bb6e9295 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -74,6 +74,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, unsigned long new_addr) { struct address_space *mapping = NULL; + struct anon_vma *anon_vma = vma->anon_vma; struct mm_struct *mm = vma->vm_mm; pte_t *old_pte, *new_pte, pte; spinlock_t *old_ptl, *new_ptl; @@ -88,6 +89,8 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, mapping = vma->vm_file->f_mapping; mutex_lock(&mapping->i_mmap_mutex); } + if (anon_vma) + anon_vma_lock(anon_vma); /* * We don't have to worry about the ordering of src and dst @@ -114,6 +117,8 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, spin_unlock(new_ptl); pte_unmap(new_pte - 1); pte_unmap_unlock(old_pte - 1, old_ptl); + if (anon_vma) + anon_vma_unlock(anon_vma); if (mapping) mutex_unlock(&mapping->i_mmap_mutex); } @@ -220,15 +225,6 @@ static unsigned long move_vma(struct vm_area_struct *vma, moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len); if (moved_len < old_len) { - /* - * Before moving the page tables from the new vma to - * the old vma, we need to be sure the old vma is - * queued after new vma in the same_anon_vma list to - * prevent SMP races with rmap_walk (that could lead - * rmap_walk to miss some page table). - */ - anon_vma_moveto_tail(vma); - /* * On error, move entries back from new area to old, * which will succeed since page tables still there, diff --git a/mm/rmap.c b/mm/rmap.c index 7b5b51d25fc5..8cbd62fde0f1 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -268,51 +268,6 @@ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src) return -ENOMEM; } -/* - * Some rmap walk that needs to find all ptes/hugepmds without false - * negatives (like migrate and split_huge_page) running concurrent - * with operations that copy or move pagetables (like mremap() and - * fork()) to be safe. They depend on the anon_vma "same_anon_vma" - * list to be in a certain order: the dst_vma must be placed after the - * src_vma in the list. 
This is always guaranteed by fork() but - mremap() needs to call this function to enforce it in case the - dst_vma isn't newly allocated and chained with the anon_vma_clone() - function but just an extension of a pre-existing vma through - vma_merge. - * - * NOTE: the same_anon_vma list can still be changed by other - * processes while mremap runs because mremap doesn't hold the - * anon_vma mutex to prevent modifications to the list while it - * runs. All we need to enforce is that the relative order of this - * process vmas isn't changing (we don't care about other vmas - * order). Each vma corresponds to an anon_vma_chain structure so - * there's no risk that other processes calling anon_vma_moveto_tail() - * and changing the same_anon_vma list under mremap() will screw with - * the relative order of this process vmas in the list, because we - * they can't alter the order of any vma that belongs to this - * process. And there can't be another anon_vma_moveto_tail() running - * concurrently with mremap() coming from this process because we hold - * the mmap_sem for the whole mremap(). fork() ordering dependency - * also shouldn't be affected because fork() only cares that the - * parent vmas are placed in the list before the child vmas and - * anon_vma_moveto_tail() won't reorder vmas from either the fork() - * parent or child. - */ -void anon_vma_moveto_tail(struct vm_area_struct *dst) -{ - struct anon_vma_chain *pavc; - struct anon_vma *root = NULL; - - list_for_each_entry_reverse(pavc, &dst->anon_vma_chain, same_vma) { - struct anon_vma *anon_vma = pavc->anon_vma; - VM_BUG_ON(pavc->vma != dst); - root = lock_anon_vma_root(root, anon_vma); - list_del(&pavc->same_anon_vma); - list_add_tail(&pavc->same_anon_vma, &anon_vma->head); - } - unlock_anon_vma_root(root); -} - /* * Attach vma to its own anon_vma, as well as to the anon_vmas that * the corresponding VMA in the parent process is attached to. -- cgit v1.2.3 From bf181b9f9d8dfbba58b23441ad60d0bc33806d64 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Mon, 8 Oct 2012 16:31:39 -0700 Subject: mm anon rmap: replace same_anon_vma linked list with an interval tree. When a large VMA (anon or private file mapping) is first touched, which will populate its anon_vma field, and then split into many regions through the use of mprotect(), the original anon_vma ends up linking all of the vmas on a linked list. This can cause rmap to become inefficient, as we have to walk potentially thousands of irrelevant vmas before finding the one a given anon page might fall into. By replacing the same_anon_vma linked list with an interval tree (where each avc's interval is determined by its vma's start and last pgoffs), we can make rmap efficient for this use case again. While the change is large, all of its pieces are fairly simple. Most places that were walking the same_anon_vma list were looking for a known pgoff, so they can just use the anon_vma_interval_tree_foreach() interval tree iterator instead. The exception here is ksm, where the page's index is not known. It would probably be possible to rework ksm so that the index would be known, but for now I have decided to keep things simple and just walk the entirety of the interval tree there. When updating vma's that already have an anon_vma assigned, we must take care to re-index the corresponding avc's on their interval tree.
This is done through the use of anon_vma_interval_tree_pre_update_vma() and anon_vma_interval_tree_post_update_vma(), which remove the avc's from their interval tree before the update and re-insert them after the update. The anon_vma stays locked during the update, so there is no chance that rmap would miss the vmas that are being updated. Signed-off-by: Michel Lespinasse Cc: Andrea Arcangeli Cc: Rik van Riel Cc: Peter Zijlstra Cc: Daniel Santos Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 14 ++++++++++ include/linux/rmap.h | 11 ++++---- mm/huge_memory.c | 5 ++-- mm/interval_tree.c | 14 ++++++++++ mm/ksm.c | 9 ++++--- mm/memory-failure.c | 5 +++- mm/mmap.c | 73 +++++++++++++++++++++++++++++++++++++++------------- mm/rmap.c | 24 ++++++++--------- 8 files changed, 114 insertions(+), 41 deletions(-) (limited to 'mm/rmap.c') diff --git a/include/linux/mm.h b/include/linux/mm.h index f1d9aaadb566..0cdab4e0f814 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -20,6 +20,7 @@ struct mempolicy; struct anon_vma; +struct anon_vma_chain; struct file_ra_state; struct user_struct; struct writeback_control; @@ -1377,6 +1378,19 @@ static inline void vma_nonlinear_insert(struct vm_area_struct *vma, list_add_tail(&vma->shared.nonlinear, list); } +void anon_vma_interval_tree_insert(struct anon_vma_chain *node, + struct rb_root *root); +void anon_vma_interval_tree_remove(struct anon_vma_chain *node, + struct rb_root *root); +struct anon_vma_chain *anon_vma_interval_tree_iter_first( + struct rb_root *root, unsigned long start, unsigned long last); +struct anon_vma_chain *anon_vma_interval_tree_iter_next( + struct anon_vma_chain *node, unsigned long start, unsigned long last); + +#define anon_vma_interval_tree_foreach(avc, root, start, last) \ + for (avc = anon_vma_interval_tree_iter_first(root, start, last); \ + avc; avc = anon_vma_interval_tree_iter_next(avc, start, last)) + /* mmap.c */ extern int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin); extern int vma_adjust(struct vm_area_struct *vma, unsigned long start, diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 7f32cec57e67..dce44f7d3ed8 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -37,14 +37,14 @@ struct anon_vma { atomic_t refcount; /* - * NOTE: the LSB of the head.next is set by + * NOTE: the LSB of the rb_root.rb_node is set by * mm_take_all_locks() _after_ taking the above lock. So the - * head must only be read/written after taking the above lock + * rb_root must only be read/written after taking the above lock * to be sure to see a valid next pointer. The LSB bit itself * is serialized by a system wide lock only visible to * mm_take_all_locks() (mm_all_locks_mutex). */ - struct list_head head; /* Chain of private "related" vmas */ + struct rb_root rb_root; /* Interval tree of private "related" vmas */ }; /* @@ -57,14 +57,15 @@ struct anon_vma { * with a VMA, or the VMAs associated with an anon_vma. * The "same_vma" list contains the anon_vma_chains linking * all the anon_vmas associated with this VMA. - * The "same_anon_vma" list contains the anon_vma_chains + * The "rb" field indexes on an interval tree the anon_vma_chains * which link all the VMAs associated with this anon_vma. 
*/ struct anon_vma_chain { struct vm_area_struct *vma; struct anon_vma *anon_vma; struct list_head same_vma; /* locked by mmap_sem & page_table_lock */ - struct list_head same_anon_vma; /* locked by anon_vma->mutex */ + struct rb_node rb; /* locked by anon_vma->mutex */ + unsigned long rb_subtree_last; }; #ifdef CONFIG_MMU diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 010d32944d14..ce59ada09462 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1375,13 +1375,14 @@ static void __split_huge_page(struct page *page, struct anon_vma *anon_vma) { int mapcount, mapcount2; + pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); struct anon_vma_chain *avc; BUG_ON(!PageHead(page)); BUG_ON(PageTail(page)); mapcount = 0; - list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { + anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { struct vm_area_struct *vma = avc->vma; unsigned long addr = vma_address(page, vma); BUG_ON(is_vma_temporary_stack(vma)); @@ -1407,7 +1408,7 @@ static void __split_huge_page(struct page *page, __split_huge_page_refcount(page); mapcount2 = 0; - list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { + anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { struct vm_area_struct *vma = avc->vma; unsigned long addr = vma_address(page, vma); BUG_ON(is_vma_temporary_stack(vma)); diff --git a/mm/interval_tree.c b/mm/interval_tree.c index 4ab7b9ec3a56..f7c72cd35e1d 100644 --- a/mm/interval_tree.c +++ b/mm/interval_tree.c @@ -8,6 +8,7 @@ #include #include +#include #include static inline unsigned long vma_start_pgoff(struct vm_area_struct *v) @@ -57,3 +58,16 @@ void vma_interval_tree_insert_after(struct vm_area_struct *node, rb_insert_augmented(&node->shared.linear.rb, root, &vma_interval_tree_augment); } + +static inline unsigned long avc_start_pgoff(struct anon_vma_chain *avc) +{ + return vma_start_pgoff(avc->vma); +} + +static inline unsigned long avc_last_pgoff(struct anon_vma_chain *avc) +{ + return vma_last_pgoff(avc->vma); +} + +INTERVAL_TREE_DEFINE(struct anon_vma_chain, rb, unsigned long, rb_subtree_last, + avc_start_pgoff, avc_last_pgoff,, anon_vma_interval_tree) diff --git a/mm/ksm.c b/mm/ksm.c index 9638620a7530..14ee5cf8a513 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1618,7 +1618,8 @@ again: struct vm_area_struct *vma; anon_vma_lock(anon_vma); - list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) { + anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, + 0, ULONG_MAX) { vma = vmac->vma; if (rmap_item->address < vma->vm_start || rmap_item->address >= vma->vm_end) @@ -1671,7 +1672,8 @@ again: struct vm_area_struct *vma; anon_vma_lock(anon_vma); - list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) { + anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, + 0, ULONG_MAX) { vma = vmac->vma; if (rmap_item->address < vma->vm_start || rmap_item->address >= vma->vm_end) @@ -1723,7 +1725,8 @@ again: struct vm_area_struct *vma; anon_vma_lock(anon_vma); - list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) { + anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, + 0, ULONG_MAX) { vma = vmac->vma; if (rmap_item->address < vma->vm_start || rmap_item->address >= vma->vm_end) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index c38a6257d082..6c5899b9034a 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -400,18 +400,21 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill, struct vm_area_struct *vma; struct task_struct *tsk; struct anon_vma *av; + 
pgoff_t pgoff; av = page_lock_anon_vma(page); if (av == NULL) /* Not actually mapped anymore */ return; + pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); read_lock(&tasklist_lock); for_each_process (tsk) { struct anon_vma_chain *vmac; if (!task_early_kill(tsk)) continue; - list_for_each_entry(vmac, &av->head, same_anon_vma) { + anon_vma_interval_tree_foreach(vmac, &av->rb_root, + pgoff, pgoff) { vma = vmac->vma; if (!page_mapped_in_vma(page, vma)) continue; diff --git a/mm/mmap.c b/mm/mmap.c index 66984aab7915..2e580ed79211 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -353,6 +353,38 @@ void validate_mm(struct mm_struct *mm) #define validate_mm(mm) do { } while (0) #endif +/* + * vma has some anon_vma assigned, and is already inserted on that + * anon_vma's interval trees. + * + * Before updating the vma's vm_start / vm_end / vm_pgoff fields, the + * vma must be removed from the anon_vma's interval trees using + * anon_vma_interval_tree_pre_update_vma(). + * + * After the update, the vma will be reinserted using + * anon_vma_interval_tree_post_update_vma(). + * + * The entire update must be protected by exclusive mmap_sem and by + * the root anon_vma's mutex. + */ +static inline void +anon_vma_interval_tree_pre_update_vma(struct vm_area_struct *vma) +{ + struct anon_vma_chain *avc; + + list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) + anon_vma_interval_tree_remove(avc, &avc->anon_vma->rb_root); +} + +static inline void +anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma) +{ + struct anon_vma_chain *avc; + + list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) + anon_vma_interval_tree_insert(avc, &avc->anon_vma->rb_root); +} + static int find_vma_links(struct mm_struct *mm, unsigned long addr, unsigned long end, struct vm_area_struct **pprev, struct rb_node ***rb_link, struct rb_node **rb_parent) @@ -565,20 +597,17 @@ again: remove_next = 1 + (end > next->vm_end); vma_adjust_trans_huge(vma, start, end, adjust_next); - /* - * When changing only vma->vm_end, we don't really need anon_vma - * lock. This is a fairly rare case by itself, but the anon_vma - * lock may be shared between many sibling processes. Skipping - * the lock for brk adjustments makes a difference sometimes. 
- */ - if (vma->anon_vma && (importer || start != vma->vm_start)) { - anon_vma = vma->anon_vma; + anon_vma = vma->anon_vma; + if (!anon_vma && adjust_next) + anon_vma = next->anon_vma; + if (anon_vma) { VM_BUG_ON(adjust_next && next->anon_vma && anon_vma != next->anon_vma); - } else if (adjust_next && next->anon_vma) - anon_vma = next->anon_vma; - if (anon_vma) anon_vma_lock(anon_vma); + anon_vma_interval_tree_pre_update_vma(vma); + if (adjust_next) + anon_vma_interval_tree_pre_update_vma(next); + } if (root) { flush_dcache_mmap_lock(mapping); @@ -619,8 +648,12 @@ again: remove_next = 1 + (end > next->vm_end); __insert_vm_struct(mm, insert); } - if (anon_vma) + if (anon_vma) { + anon_vma_interval_tree_post_update_vma(vma); + if (adjust_next) + anon_vma_interval_tree_post_update_vma(next); anon_vma_unlock(anon_vma); + } if (mapping) mutex_unlock(&mapping->i_mmap_mutex); @@ -1748,7 +1781,9 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) if (vma->vm_pgoff + (size >> PAGE_SHIFT) >= vma->vm_pgoff) { error = acct_stack_growth(vma, size, grow); if (!error) { + anon_vma_interval_tree_pre_update_vma(vma); vma->vm_end = address; + anon_vma_interval_tree_post_update_vma(vma); perf_event_mmap(vma); } } @@ -1798,8 +1833,10 @@ int expand_downwards(struct vm_area_struct *vma, if (grow <= vma->vm_pgoff) { error = acct_stack_growth(vma, size, grow); if (!error) { + anon_vma_interval_tree_pre_update_vma(vma); vma->vm_start = address; vma->vm_pgoff -= grow; + anon_vma_interval_tree_post_update_vma(vma); perf_event_mmap(vma); } } @@ -2515,7 +2552,7 @@ static DEFINE_MUTEX(mm_all_locks_mutex); static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma) { - if (!test_bit(0, (unsigned long *) &anon_vma->root->head.next)) { + if (!test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_node)) { /* * The LSB of head.next can't change from under us * because we hold the mm_all_locks_mutex. @@ -2531,7 +2568,7 @@ static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma) * anon_vma->root->mutex. */ if (__test_and_set_bit(0, (unsigned long *) - &anon_vma->root->head.next)) + &anon_vma->root->rb_root.rb_node)) BUG(); } } @@ -2572,7 +2609,7 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping) * A single task can't take more than one mm_take_all_locks() in a row * or it would deadlock. * - * The LSB in anon_vma->head.next and the AS_MM_ALL_LOCKS bitflag in + * The LSB in anon_vma->rb_root.rb_node and the AS_MM_ALL_LOCKS bitflag in * mapping->flags avoid to take the same lock twice, if more than one * vma in this mm is backed by the same anon_vma or address_space. * @@ -2619,13 +2656,13 @@ out_unlock: static void vm_unlock_anon_vma(struct anon_vma *anon_vma) { - if (test_bit(0, (unsigned long *) &anon_vma->root->head.next)) { + if (test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_node)) { /* * The LSB of head.next can't change to 0 from under * us because we hold the mm_all_locks_mutex. * * We must however clear the bitflag before unlocking - * the vma so the users using the anon_vma->head will + * the vma so the users using the anon_vma->rb_root will * never see our bitflag. * * No need of atomic instructions here, head.next @@ -2633,7 +2670,7 @@ static void vm_unlock_anon_vma(struct anon_vma *anon_vma) * anon_vma->root->mutex. 
*/ if (!__test_and_clear_bit(0, (unsigned long *) - &anon_vma->root->head.next)) + &anon_vma->root->rb_root.rb_node)) BUG(); anon_vma_unlock(anon_vma); } diff --git a/mm/rmap.c b/mm/rmap.c index 8cbd62fde0f1..9c61bf387fd1 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -127,12 +127,7 @@ static void anon_vma_chain_link(struct vm_area_struct *vma, avc->vma = vma; avc->anon_vma = anon_vma; list_add(&avc->same_vma, &vma->anon_vma_chain); - - /* - * It's critical to add new vmas to the tail of the anon_vma, - * see comment in huge_memory.c:__split_huge_page(). - */ - list_add_tail(&avc->same_anon_vma, &anon_vma->head); + anon_vma_interval_tree_insert(avc, &anon_vma->rb_root); } /** @@ -336,13 +331,13 @@ void unlink_anon_vmas(struct vm_area_struct *vma) struct anon_vma *anon_vma = avc->anon_vma; root = lock_anon_vma_root(root, anon_vma); - list_del(&avc->same_anon_vma); + anon_vma_interval_tree_remove(avc, &anon_vma->rb_root); /* * Leave empty anon_vmas on the list - we'll need * to free them outside the lock. */ - if (list_empty(&anon_vma->head)) + if (RB_EMPTY_ROOT(&anon_vma->rb_root)) continue; list_del(&avc->same_vma); @@ -371,7 +366,7 @@ static void anon_vma_ctor(void *data) mutex_init(&anon_vma->mutex); atomic_set(&anon_vma->refcount, 0); - INIT_LIST_HEAD(&anon_vma->head); + anon_vma->rb_root = RB_ROOT; } void __init anon_vma_init(void) @@ -724,6 +719,7 @@ static int page_referenced_anon(struct page *page, { unsigned int mapcount; struct anon_vma *anon_vma; + pgoff_t pgoff; struct anon_vma_chain *avc; int referenced = 0; @@ -732,7 +728,8 @@ static int page_referenced_anon(struct page *page, return referenced; mapcount = page_mapcount(page); - list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { + pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); + anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { struct vm_area_struct *vma = avc->vma; unsigned long address = vma_address(page, vma); if (address == -EFAULT) @@ -1445,6 +1442,7 @@ bool is_vma_temporary_stack(struct vm_area_struct *vma) static int try_to_unmap_anon(struct page *page, enum ttu_flags flags) { struct anon_vma *anon_vma; + pgoff_t pgoff; struct anon_vma_chain *avc; int ret = SWAP_AGAIN; @@ -1452,7 +1450,8 @@ static int try_to_unmap_anon(struct page *page, enum ttu_flags flags) if (!anon_vma) return ret; - list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { + pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); + anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { struct vm_area_struct *vma = avc->vma; unsigned long address; @@ -1668,6 +1667,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *, struct vm_area_struct *, unsigned long, void *), void *arg) { struct anon_vma *anon_vma; + pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); struct anon_vma_chain *avc; int ret = SWAP_AGAIN; @@ -1681,7 +1681,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *, if (!anon_vma) return ret; anon_vma_lock(anon_vma); - list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { + anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { struct vm_area_struct *vma = avc->vma; unsigned long address = vma_address(page, vma); if (address == -EFAULT) -- cgit v1.2.3 From 86c2ad19956f84f2191e062fcb979367b6365871 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Mon, 8 Oct 2012 16:31:42 -0700 Subject: mm rmap: remove vma_address check for address inside vma In file and anon rmap, we use interval trees to find 
potentially relevant vmas and then call vma_address() to find the virtual address the given page might be found at in these vmas. vma_address() used to include a check that the returned address falls within the limits of the vma, but this check isn't necessary now that we always use interval trees in rmap: the interval tree just doesn't return any vmas which this check would find to be irrelevant. As a result, we can replace the use of -EFAULT error code (which then needed to be checked in every call site) with a VM_BUG_ON(). Signed-off-by: Michel Lespinasse Cc: Andrea Arcangeli Cc: Rik van Riel Cc: Peter Zijlstra Cc: Daniel Santos Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 4 ---- mm/rmap.c | 48 +++++++++++++++++++++--------------------------- 2 files changed, 21 insertions(+), 31 deletions(-) (limited to 'mm/rmap.c') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index ce59ada09462..7cf8b0ec11ec 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1386,8 +1386,6 @@ static void __split_huge_page(struct page *page, struct vm_area_struct *vma = avc->vma; unsigned long addr = vma_address(page, vma); BUG_ON(is_vma_temporary_stack(vma)); - if (addr == -EFAULT) - continue; mapcount += __split_huge_page_splitting(page, vma, addr); } /* @@ -1412,8 +1410,6 @@ static void __split_huge_page(struct page *page, struct vm_area_struct *vma = avc->vma; unsigned long addr = vma_address(page, vma); BUG_ON(is_vma_temporary_stack(vma)); - if (addr == -EFAULT) - continue; mapcount2 += __split_huge_page_map(page, vma, addr); } if (mapcount != mapcount2) diff --git a/mm/rmap.c b/mm/rmap.c index 9c61bf387fd1..28777412de62 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -510,22 +510,26 @@ void page_unlock_anon_vma(struct anon_vma *anon_vma) /* * At what user virtual address is page expected in @vma? - * Returns virtual address or -EFAULT if page's index/offset is not - * within the range mapped the @vma. 
*/ -inline unsigned long -vma_address(struct page *page, struct vm_area_struct *vma) +static inline unsigned long +__vma_address(struct page *page, struct vm_area_struct *vma) { pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); - unsigned long address; if (unlikely(is_vm_hugetlb_page(vma))) pgoff = page->index << huge_page_order(page_hstate(page)); - address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); - if (unlikely(address < vma->vm_start || address >= vma->vm_end)) { - /* page should be within @vma mapping range */ - return -EFAULT; - } + + return vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); +} + +inline unsigned long +vma_address(struct page *page, struct vm_area_struct *vma) +{ + unsigned long address = __vma_address(page, vma); + + /* page should be within @vma mapping range */ + VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); + return address; } @@ -535,6 +539,7 @@ vma_address(struct page *page, struct vm_area_struct *vma) */ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) { + unsigned long address; if (PageAnon(page)) { struct anon_vma *page__anon_vma = page_anon_vma(page); /* @@ -550,7 +555,10 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) return -EFAULT; } else return -EFAULT; - return vma_address(page, vma); + address = __vma_address(page, vma); + if (unlikely(address < vma->vm_start || address >= vma->vm_end)) + return -EFAULT; + return address; } /* @@ -624,8 +632,8 @@ int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma) pte_t *pte; spinlock_t *ptl; - address = vma_address(page, vma); - if (address == -EFAULT) /* out of vma range */ + address = __vma_address(page, vma); + if (unlikely(address < vma->vm_start || address >= vma->vm_end)) return 0; pte = page_check_address(page, vma->vm_mm, address, &ptl, 1); if (!pte) /* the page is not in this mm */ @@ -732,8 +740,6 @@ static int page_referenced_anon(struct page *page, anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { struct vm_area_struct *vma = avc->vma; unsigned long address = vma_address(page, vma); - if (address == -EFAULT) - continue; /* * If we are reclaiming on behalf of a cgroup, skip * counting on behalf of references from different @@ -799,8 +805,6 @@ static int page_referenced_file(struct page *page, vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { unsigned long address = vma_address(page, vma); - if (address == -EFAULT) - continue; /* * If we are reclaiming on behalf of a cgroup, skip * counting on behalf of references from different @@ -904,8 +908,6 @@ static int page_mkclean_file(struct address_space *mapping, struct page *page) vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { if (vma->vm_flags & VM_SHARED) { unsigned long address = vma_address(page, vma); - if (address == -EFAULT) - continue; ret += page_mkclean_one(page, vma, address); } } @@ -1468,8 +1470,6 @@ static int try_to_unmap_anon(struct page *page, enum ttu_flags flags) continue; address = vma_address(page, vma); - if (address == -EFAULT) - continue; ret = try_to_unmap_one(page, vma, address, flags); if (ret != SWAP_AGAIN || !page_mapped(page)) break; @@ -1508,8 +1508,6 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) mutex_lock(&mapping->i_mmap_mutex); vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { unsigned long address = vma_address(page, vma); - if (address == -EFAULT) - continue; ret = try_to_unmap_one(page, vma, 
address, flags); if (ret != SWAP_AGAIN || !page_mapped(page)) goto out; @@ -1684,8 +1682,6 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *, anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { struct vm_area_struct *vma = avc->vma; unsigned long address = vma_address(page, vma); - if (address == -EFAULT) - continue; ret = rmap_one(page, vma, address, arg); if (ret != SWAP_AGAIN) break; @@ -1707,8 +1703,6 @@ static int rmap_walk_file(struct page *page, int (*rmap_one)(struct page *, mutex_lock(&mapping->i_mmap_mutex); vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { unsigned long address = vma_address(page, vma); - if (address == -EFAULT) - continue; ret = rmap_one(page, vma, address, arg); if (ret != SWAP_AGAIN) break; -- cgit v1.2.3 From 39b5f29ac1f988c1615fbc9c69f6651ab0d0c3c7 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 8 Oct 2012 16:33:18 -0700 Subject: mm: remove vma arg from page_evictable page_evictable(page, vma) is an irritant: almost all its callers pass NULL for vma. Remove the vma arg and use mlocked_vma_newpage(vma, page) explicitly in the couple of places it's needed. But in those places we don't even need page_evictable() itself! They're dealing with a freshly allocated anonymous page, which has no "mapping" and cannot be mlocked yet. Signed-off-by: Hugh Dickins Acked-by: Mel Gorman Cc: Rik van Riel Acked-by: Johannes Weiner Cc: Michel Lespinasse Cc: Ying Han Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/vm/unevictable-lru.txt | 10 +++------- include/linux/swap.h | 2 +- mm/internal.h | 5 ++--- mm/ksm.c | 2 +- mm/rmap.c | 2 +- mm/swap.c | 2 +- mm/vmscan.c | 27 +++++++++------------------ 7 files changed, 18 insertions(+), 32 deletions(-) (limited to 'mm/rmap.c') diff --git a/Documentation/vm/unevictable-lru.txt b/Documentation/vm/unevictable-lru.txt index 323ff5dba1cc..a68db7692ee8 100644 --- a/Documentation/vm/unevictable-lru.txt +++ b/Documentation/vm/unevictable-lru.txt @@ -197,12 +197,8 @@ the pages are also "rescued" from the unevictable list in the process of freeing them. page_evictable() also checks for mlocked pages by testing an additional page -flag, PG_mlocked (as wrapped by PageMlocked()). If the page is NOT mlocked, -and a non-NULL VMA is supplied, page_evictable() will check whether the VMA is -VM_LOCKED via is_mlocked_vma(). is_mlocked_vma() will SetPageMlocked() and -update the appropriate statistics if the vma is VM_LOCKED. This method allows -efficient "culling" of pages in the fault path that are being faulted in to -VM_LOCKED VMAs. +flag, PG_mlocked (as wrapped by PageMlocked()), which is set when a page is +faulted into a VM_LOCKED vma, or found in a vma being VM_LOCKED. VMSCAN'S HANDLING OF UNEVICTABLE PAGES @@ -651,7 +647,7 @@ PAGE RECLAIM IN shrink_*_list() ------------------------------- shrink_active_list() culls any obviously unevictable pages - i.e. -!page_evictable(page, NULL) - diverting these to the unevictable list. +!page_evictable(page) - diverting these to the unevictable list. However, shrink_active_list() only sees unevictable pages that made it onto the active/inactive lru lists. 
Note that these pages do not have PageUnevictable set - otherwise they would be on the unevictable list and shrink_active_list diff --git a/include/linux/swap.h b/include/linux/swap.h index 388e70601413..68df9c17fbbb 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -281,7 +281,7 @@ static inline int zone_reclaim(struct zone *z, gfp_t mask, unsigned int order) } #endif -extern int page_evictable(struct page *page, struct vm_area_struct *vma); +extern int page_evictable(struct page *page); extern void check_move_unevictable_pages(struct page **, int nr_pages); extern unsigned long scan_unevictable_pages; diff --git a/mm/internal.h b/mm/internal.h index 7f72f249bc29..78f25d6cc6a7 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -168,9 +168,8 @@ static inline void munlock_vma_pages_all(struct vm_area_struct *vma) } /* - * Called only in fault path via page_evictable() for a new page - * to determine if it's being mapped into a LOCKED vma. - * If so, mark page as mlocked. + * Called only in fault path, to determine if a new page is being + * mapped into a LOCKED vma. If it is, mark page as mlocked. */ static inline int mlocked_vma_newpage(struct vm_area_struct *vma, struct page *page) diff --git a/mm/ksm.c b/mm/ksm.c index 14ee5cf8a513..ecbc090cdaad 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1586,7 +1586,7 @@ struct page *ksm_does_need_to_copy(struct page *page, SetPageSwapBacked(new_page); __set_page_locked(new_page); - if (page_evictable(new_page, vma)) + if (!mlocked_vma_newpage(vma, new_page)) lru_cache_add_lru(new_page, LRU_ACTIVE_ANON); else add_page_to_unevictable_list(new_page); diff --git a/mm/rmap.c b/mm/rmap.c index 28777412de62..0d86433e42d7 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1080,7 +1080,7 @@ void page_add_new_anon_rmap(struct page *page, else __inc_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); __page_set_anon_rmap(page, vma, address, 1); - if (page_evictable(page, vma)) + if (!mlocked_vma_newpage(vma, page)) lru_cache_add_lru(page, LRU_ACTIVE_ANON); else add_page_to_unevictable_list(page); diff --git a/mm/swap.c b/mm/swap.c index f76c76c7501b..6310dc2008ff 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -751,7 +751,7 @@ void lru_add_page_tail(struct page *page, struct page *page_tail, SetPageLRU(page_tail); - if (page_evictable(page_tail, NULL)) { + if (page_evictable(page_tail)) { if (PageActive(page)) { SetPageActive(page_tail); active = 1; diff --git a/mm/vmscan.c b/mm/vmscan.c index b010efc43891..8b627309dd44 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -553,7 +553,7 @@ void putback_lru_page(struct page *page) redo: ClearPageUnevictable(page); - if (page_evictable(page, NULL)) { + if (page_evictable(page)) { /* * For evictable pages, we can use the cache. * In event of a race, worst case is we end up with an @@ -587,7 +587,7 @@ redo: * page is on unevictable list, it never be freed. To avoid that, * check after we added it to the list, again. 
*/ - if (lru == LRU_UNEVICTABLE && page_evictable(page, NULL)) { + if (lru == LRU_UNEVICTABLE && page_evictable(page)) { if (!isolate_lru_page(page)) { put_page(page); goto redo; @@ -709,7 +709,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, sc->nr_scanned++; - if (unlikely(!page_evictable(page, NULL))) + if (unlikely(!page_evictable(page))) goto cull_mlocked; if (!sc->may_unmap && page_mapped(page)) @@ -1217,7 +1217,7 @@ putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list) VM_BUG_ON(PageLRU(page)); list_del(&page->lru); - if (unlikely(!page_evictable(page, NULL))) { + if (unlikely(!page_evictable(page))) { spin_unlock_irq(&zone->lru_lock); putback_lru_page(page); spin_lock_irq(&zone->lru_lock); @@ -1470,7 +1470,7 @@ static void shrink_active_list(unsigned long nr_to_scan, page = lru_to_page(&l_hold); list_del(&page->lru); - if (unlikely(!page_evictable(page, NULL))) { + if (unlikely(!page_evictable(page))) { putback_lru_page(page); continue; } @@ -3414,27 +3414,18 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) /* * page_evictable - test whether a page is evictable * @page: the page to test - * @vma: the VMA in which the page is or will be mapped, may be NULL * * Test whether page is evictable--i.e., should be placed on active/inactive - * lists vs unevictable list. The vma argument is !NULL when called from the - * fault path to determine how to instantate a new page. + * lists vs unevictable list. * * Reasons page might not be evictable: * (1) page's mapping marked unevictable * (2) page is part of an mlocked VMA * */ -int page_evictable(struct page *page, struct vm_area_struct *vma) +int page_evictable(struct page *page) { - - if (mapping_unevictable(page_mapping(page))) - return 0; - - if (PageMlocked(page) || (vma && mlocked_vma_newpage(vma, page))) - return 0; - - return 1; + return !mapping_unevictable(page_mapping(page)) && !PageMlocked(page); } #ifdef CONFIG_SHMEM @@ -3472,7 +3463,7 @@ void check_move_unevictable_pages(struct page **pages, int nr_pages) if (!PageLRU(page) || !PageUnevictable(page)) continue; - if (page_evictable(page, NULL)) { + if (page_evictable(page)) { enum lru_list lru = page_lru_base_type(page); VM_BUG_ON(PageActive(page)); -- cgit v1.2.3 From e6c509f85455041d3d7c4b863bf80bc294288cc1 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 8 Oct 2012 16:33:19 -0700 Subject: mm: use clear_page_mlock() in page_remove_rmap() We had thought that pages could no longer get freed while still marked as mlocked; but Johannes Weiner posted this program to demonstrate that truncating an mlocked private file mapping containing COWed pages is still mishandled: #include #include #include #include #include #include #include int main(void) { char *map; int fd; system("grep mlockfreed /proc/vmstat"); fd = open("chigurh", O_CREAT|O_EXCL|O_RDWR); unlink("chigurh"); ftruncate(fd, 4096); map = mmap(NULL, 4096, PROT_WRITE, MAP_PRIVATE, fd, 0); map[0] = 11; mlock(map, sizeof(fd)); ftruncate(fd, 0); close(fd); munlock(map, sizeof(fd)); munmap(map, 4096); system("grep mlockfreed /proc/vmstat"); return 0; } The anon COWed pages are not caught by truncation's clear_page_mlock() of the pagecache pages; but unmap_mapping_range() unmaps them, so we ought to look out for them there in page_remove_rmap(). Indeed, why should truncation or invalidation be doing the clear_page_mlock() when removing from pagecache? 
mlock is a property of mapping in userspace, not a property of pagecache: an mlocked unmapped page is nonsensical. Reported-by: Johannes Weiner Signed-off-by: Hugh Dickins Cc: Mel Gorman Cc: Rik van Riel Cc: Michel Lespinasse Cc: Ying Han Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/internal.h | 7 +------ mm/memory.c | 10 +++++----- mm/mlock.c | 16 +++------------- mm/rmap.c | 4 ++++ mm/truncate.c | 4 ---- 5 files changed, 13 insertions(+), 28 deletions(-) (limited to 'mm/rmap.c') diff --git a/mm/internal.h b/mm/internal.h index 78f25d6cc6a7..4dc93e2fe69e 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -201,12 +201,7 @@ extern void munlock_vma_page(struct page *page); * If called for a page that is still mapped by mlocked vmas, all we do * is revert to lazy LRU behaviour -- semantics are not broken. */ -extern void __clear_page_mlock(struct page *page); -static inline void clear_page_mlock(struct page *page) -{ - if (unlikely(TestClearPageMlocked(page))) - __clear_page_mlock(page); -} +extern void clear_page_mlock(struct page *page); /* * mlock_migrate_page - called only from migrate_page_copy() to diff --git a/mm/memory.c b/mm/memory.c index d205e4381a34..5f5d1f039bf4 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1577,12 +1577,12 @@ split_fallthrough: if (page->mapping && trylock_page(page)) { lru_add_drain(); /* push cached pages to LRU */ /* - * Because we lock page here and migration is - * blocked by the pte's page reference, we need - * only check for file-cache page truncation. + * Because we lock page here, and migration is + * blocked by the pte's page reference, and we + * know the page is still mapped, we don't even + * need to check for file-cache page truncation. */ - if (page->mapping) - mlock_vma_page(page); + mlock_vma_page(page); unlock_page(page); } } diff --git a/mm/mlock.c b/mm/mlock.c index a948be4b7ba7..de7321592897 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -51,13 +51,10 @@ EXPORT_SYMBOL(can_do_mlock); /* * LRU accounting for clear_page_mlock() */ -void __clear_page_mlock(struct page *page) +void clear_page_mlock(struct page *page) { - VM_BUG_ON(!PageLocked(page)); - - if (!page->mapping) { /* truncated ? */ + if (!TestClearPageMlocked(page)) return; - } dec_zone_page_state(page, NR_MLOCK); count_vm_event(UNEVICTABLE_PGCLEARED); @@ -290,14 +287,7 @@ void munlock_vma_pages_range(struct vm_area_struct *vma, page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP); if (page && !IS_ERR(page)) { lock_page(page); - /* - * Like in __mlock_vma_pages_range(), - * because we lock page here and migration is - * blocked by the elevated reference, we need - * only check for file-cache page truncation. - */ - if (page->mapping) - munlock_vma_page(page); + munlock_vma_page(page); unlock_page(page); put_page(page); } diff --git a/mm/rmap.c b/mm/rmap.c index 0d86433e42d7..bf03149f495c 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1155,7 +1155,10 @@ void page_remove_rmap(struct page *page) } else { __dec_zone_page_state(page, NR_FILE_MAPPED); mem_cgroup_dec_page_stat(page, MEMCG_NR_FILE_MAPPED); + mem_cgroup_end_update_page_stat(page, &locked, &flags); } + if (unlikely(PageMlocked(page))) + clear_page_mlock(page); /* * It would be tidy to reset the PageAnon mapping here, * but that might overwrite a racing page_add_anon_rmap @@ -1165,6 +1168,7 @@ void page_remove_rmap(struct page *page) * Leaving it set also helps swapoff to reinstate ptes * faster for those pages still in swapcache. 
*/ + return; out: if (!anon) mem_cgroup_end_update_page_stat(page, &locked, &flags); diff --git a/mm/truncate.c b/mm/truncate.c index f38055cb8af6..d51ce92d6e83 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -107,7 +107,6 @@ truncate_complete_page(struct address_space *mapping, struct page *page) cancel_dirty_page(page, PAGE_CACHE_SIZE); - clear_page_mlock(page); ClearPageMappedToDisk(page); delete_from_page_cache(page); return 0; @@ -132,7 +131,6 @@ invalidate_complete_page(struct address_space *mapping, struct page *page) if (page_has_private(page) && !try_to_release_page(page, 0)) return 0; - clear_page_mlock(page); ret = remove_mapping(mapping, page); return ret; @@ -394,8 +392,6 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page) if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL)) return 0; - clear_page_mlock(page); - spin_lock_irq(&mapping->tree_lock); if (PageDirty(page)) goto failed; -- cgit v1.2.3 From 2ec74c3ef2d8c58d71e0e00336fb6b891192155a Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Mon, 8 Oct 2012 16:33:33 -0700 Subject: mm: move all mmu notifier invocations to be done outside the PT lock In order to allow sleeping during mmu notifier calls, we need to avoid invoking them under the page table spinlock. This patch solves the problem by calling invalidate_page notification after releasing the lock (but before freeing the page itself), or by wrapping the page invalidation with calls to invalidate_range_begin and invalidate_range_end. To prevent accidental changes to the invalidate_range_end arguments after the call to invalidate_range_begin, the patch introduces a convention of saving the arguments in consistently named locals: unsigned long mmun_start; /* For mmu_notifiers */ unsigned long mmun_end; /* For mmu_notifiers */ ... mmun_start = ... mmun_end = ... mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); ... mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); The patch changes code to use this convention for all calls to mmu_notifier_invalidate_range_start/end, except those where the calls are close enough so that anyone who glances at the code can see the values aren't changing. This patchset is a preliminary step towards on-demand paging design to be added to the RDMA stack. Why do we want on-demand paging for Infiniband? Applications register memory with an RDMA adapter using system calls, and subsequently post IO operations that refer to the corresponding virtual addresses directly to HW. Until now, this was achieved by pinning the memory during the registration calls. The goal of on demand paging is to avoid pinning the pages of registered memory regions (MRs). This will allow users the same flexibility they get when swapping any other part of their processes address spaces. Instead of requiring the entire MR to fit in physical memory, we can allow the MR to be larger, and only fit the current working set in physical memory. Why should anyone care? What problems are users currently experiencing? This can make programming with RDMA much simpler. Today, developers that are working with more data than their RAM can hold need either to deregister and reregister memory regions throughout their process's life, or keep a single memory region and copy the data to it. On demand paging will allow these developers to register a single MR at the beginning of their process's life, and let the operating system manage which pages needs to be fetched at a given time. 
In the future, we might be able to provide a single memory access key for each process that would provide the entire process's address space as one large memory region, and the developers wouldn't need to register memory regions at all. Is there any prospect that any other subsystems will utilise these infrastructural changes? If so, which and how, etc? As for other subsystems, I understand that XPMEM wanted to sleep in MMU notifiers, as Christoph Lameter wrote at http://lkml.indiana.edu/hypermail/linux/kernel/0802.1/0460.html and perhaps Andrea knows about other use cases. Scheduling in mmu notifications is required since we need to sync the hardware with the secondary page tables change. A TLB flush of an IO device is inherently slower than a CPU TLB flush, so our design works by sending the invalidation request to the device, and waiting for an interrupt before exiting the mmu notifier handler. Avi said: kvm may be a buyer. kvm::mmu_lock, which serializes guest page faults, also protects long operations such as destroying large ranges. It would be good to convert it into a spinlock, but as it is used inside mmu notifiers, this cannot be done. (there are alternatives, such as keeping the spinlock and using a generation counter to do the teardown in O(1), which is what the "may" is doing up there). [akpm@linux-foundation.org: possible speed tweak in hugetlb_cow(), cleanups] Signed-off-by: Andrea Arcangeli Signed-off-by: Sagi Grimberg Signed-off-by: Haggai Eran Cc: Peter Zijlstra Cc: Xiao Guangrong Cc: Or Gerlitz Cc: Haggai Eran Cc: Shachar Raindel Cc: Liran Liss Cc: Christoph Lameter Cc: Avi Kivity Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmu_notifier.h | 47 -------------------------------------------- mm/filemap_xip.c | 4 +++- mm/huge_memory.c | 42 +++++++++++++++++++++++++++++++++------ mm/hugetlb.c | 21 ++++++++++++-------- mm/memory.c | 28 +++++++++++++++++--------- mm/mremap.c | 8 ++++++-- mm/rmap.c | 18 ++++++++++++++--- 7 files changed, 92 insertions(+), 76 deletions(-) (limited to 'mm/rmap.c') diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index 4b7183e98061..bc823c4c028b 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -246,50 +246,6 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm) __mmu_notifier_mm_destroy(mm); } -/* - * These two macros will sometime replace ptep_clear_flush. - * ptep_clear_flush is implemented as macro itself, so this also is - * implemented as a macro until ptep_clear_flush will converted to an - * inline function, to diminish the risk of compilation failure. The - * invalidate_page method over time can be moved outside the PT lock - * and these two macros can be later removed.
- */ -#define ptep_clear_flush_notify(__vma, __address, __ptep) \ -({ \ - pte_t __pte; \ - struct vm_area_struct *___vma = __vma; \ - unsigned long ___address = __address; \ - __pte = ptep_clear_flush(___vma, ___address, __ptep); \ - mmu_notifier_invalidate_page(___vma->vm_mm, ___address); \ - __pte; \ -}) - -#define pmdp_clear_flush_notify(__vma, __address, __pmdp) \ -({ \ - pmd_t __pmd; \ - struct vm_area_struct *___vma = __vma; \ - unsigned long ___address = __address; \ - VM_BUG_ON(__address & ~HPAGE_PMD_MASK); \ - mmu_notifier_invalidate_range_start(___vma->vm_mm, ___address, \ - (__address)+HPAGE_PMD_SIZE);\ - __pmd = pmdp_clear_flush(___vma, ___address, __pmdp); \ - mmu_notifier_invalidate_range_end(___vma->vm_mm, ___address, \ - (__address)+HPAGE_PMD_SIZE); \ - __pmd; \ -}) - -#define pmdp_splitting_flush_notify(__vma, __address, __pmdp) \ -({ \ - struct vm_area_struct *___vma = __vma; \ - unsigned long ___address = __address; \ - VM_BUG_ON(__address & ~HPAGE_PMD_MASK); \ - mmu_notifier_invalidate_range_start(___vma->vm_mm, ___address, \ - (__address)+HPAGE_PMD_SIZE);\ - pmdp_splitting_flush(___vma, ___address, __pmdp); \ - mmu_notifier_invalidate_range_end(___vma->vm_mm, ___address, \ - (__address)+HPAGE_PMD_SIZE); \ -}) - #define ptep_clear_flush_young_notify(__vma, __address, __ptep) \ ({ \ int __young; \ @@ -380,9 +336,6 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm) #define ptep_clear_flush_young_notify ptep_clear_flush_young #define pmdp_clear_flush_young_notify pmdp_clear_flush_young -#define ptep_clear_flush_notify ptep_clear_flush -#define pmdp_clear_flush_notify pmdp_clear_flush -#define pmdp_splitting_flush_notify pmdp_splitting_flush #define set_pte_at_notify set_pte_at #endif /* CONFIG_MMU_NOTIFIER */ diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c index a52daee11d3f..a912da6ddfd4 100644 --- a/mm/filemap_xip.c +++ b/mm/filemap_xip.c @@ -192,11 +192,13 @@ retry: if (pte) { /* Nuke the page table entry. 
*/ flush_cache_page(vma, address, pte_pfn(*pte)); - pteval = ptep_clear_flush_notify(vma, address, pte); + pteval = ptep_clear_flush(vma, address, pte); page_remove_rmap(page); dec_mm_counter(mm, MM_FILEPAGES); BUG_ON(pte_dirty(pteval)); pte_unmap_unlock(pte, ptl); + /* must invalidate_page _before_ freeing the page */ + mmu_notifier_invalidate_page(mm, address); page_cache_release(page); } } diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 0e7740923fb9..08a943b9cf95 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -787,6 +787,8 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, pmd_t _pmd; int ret = 0, i; struct page **pages; + unsigned long mmun_start; /* For mmu_notifiers */ + unsigned long mmun_end; /* For mmu_notifiers */ pages = kmalloc(sizeof(struct page *) * HPAGE_PMD_NR, GFP_KERNEL); @@ -823,12 +825,16 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, cond_resched(); } + mmun_start = haddr; + mmun_end = haddr + HPAGE_PMD_SIZE; + mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); + spin_lock(&mm->page_table_lock); if (unlikely(!pmd_same(*pmd, orig_pmd))) goto out_free_pages; VM_BUG_ON(!PageHead(page)); - pmdp_clear_flush_notify(vma, haddr, pmd); + pmdp_clear_flush(vma, haddr, pmd); /* leave pmd empty until pte is filled */ pgtable = pgtable_trans_huge_withdraw(mm); @@ -851,6 +857,8 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, page_remove_rmap(page); spin_unlock(&mm->page_table_lock); + mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); + ret |= VM_FAULT_WRITE; put_page(page); @@ -859,6 +867,7 @@ out: out_free_pages: spin_unlock(&mm->page_table_lock); + mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); mem_cgroup_uncharge_start(); for (i = 0; i < HPAGE_PMD_NR; i++) { mem_cgroup_uncharge_page(pages[i]); @@ -875,6 +884,8 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, int ret = 0; struct page *page, *new_page; unsigned long haddr; + unsigned long mmun_start; /* For mmu_notifiers */ + unsigned long mmun_end; /* For mmu_notifiers */ VM_BUG_ON(!vma->anon_vma); spin_lock(&mm->page_table_lock); @@ -925,20 +936,24 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR); __SetPageUptodate(new_page); + mmun_start = haddr; + mmun_end = haddr + HPAGE_PMD_SIZE; + mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); + spin_lock(&mm->page_table_lock); put_page(page); if (unlikely(!pmd_same(*pmd, orig_pmd))) { spin_unlock(&mm->page_table_lock); mem_cgroup_uncharge_page(new_page); put_page(new_page); - goto out; + goto out_mn; } else { pmd_t entry; VM_BUG_ON(!PageHead(page)); entry = mk_pmd(new_page, vma->vm_page_prot); entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); entry = pmd_mkhuge(entry); - pmdp_clear_flush_notify(vma, haddr, pmd); + pmdp_clear_flush(vma, haddr, pmd); page_add_new_anon_rmap(new_page, vma, haddr); set_pmd_at(mm, haddr, pmd, entry); update_mmu_cache(vma, address, pmd); @@ -946,10 +961,14 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, put_page(page); ret |= VM_FAULT_WRITE; } -out_unlock: spin_unlock(&mm->page_table_lock); +out_mn: + mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); out: return ret; +out_unlock: + spin_unlock(&mm->page_table_lock); + return ret; } struct page *follow_trans_huge_pmd(struct mm_struct *mm, @@ -1162,7 +1181,11 @@ static int __split_huge_page_splitting(struct page *page, struct mm_struct *mm = 
vma->vm_mm; pmd_t *pmd; int ret = 0; + /* For mmu_notifiers */ + const unsigned long mmun_start = address; + const unsigned long mmun_end = address + HPAGE_PMD_SIZE; + mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); spin_lock(&mm->page_table_lock); pmd = page_check_address_pmd(page, mm, address, PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG); @@ -1174,10 +1197,11 @@ static int __split_huge_page_splitting(struct page *page, * and it won't wait on the anon_vma->root->mutex to * serialize against split_huge_page*. */ - pmdp_splitting_flush_notify(vma, address, pmd); + pmdp_splitting_flush(vma, address, pmd); ret = 1; } spin_unlock(&mm->page_table_lock); + mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); return ret; } @@ -1898,6 +1922,8 @@ static void collapse_huge_page(struct mm_struct *mm, spinlock_t *ptl; int isolated; unsigned long hstart, hend; + unsigned long mmun_start; /* For mmu_notifiers */ + unsigned long mmun_end; /* For mmu_notifiers */ VM_BUG_ON(address & ~HPAGE_PMD_MASK); @@ -1952,6 +1978,9 @@ static void collapse_huge_page(struct mm_struct *mm, pte = pte_offset_map(pmd, address); ptl = pte_lockptr(mm, pmd); + mmun_start = address; + mmun_end = address + HPAGE_PMD_SIZE; + mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); spin_lock(&mm->page_table_lock); /* probably unnecessary */ /* * After this gup_fast can't run anymore. This also removes @@ -1959,8 +1988,9 @@ static void collapse_huge_page(struct mm_struct *mm, * huge and small TLB entries for the same virtual address * to avoid the risk of CPU bugs in that area. */ - _pmd = pmdp_clear_flush_notify(vma, address, pmd); + _pmd = pmdp_clear_flush(vma, address, pmd); spin_unlock(&mm->page_table_lock); + mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); spin_lock(ptl); isolated = __collapse_huge_page_isolate(vma, address, pte); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index de5d1dcf34fe..993f7c1820a8 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2355,13 +2355,15 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, struct page *page; struct hstate *h = hstate_vma(vma); unsigned long sz = huge_page_size(h); + const unsigned long mmun_start = start; /* For mmu_notifiers */ + const unsigned long mmun_end = end; /* For mmu_notifiers */ WARN_ON(!is_vm_hugetlb_page(vma)); BUG_ON(start & ~huge_page_mask(h)); BUG_ON(end & ~huge_page_mask(h)); tlb_start_vma(tlb, vma); - mmu_notifier_invalidate_range_start(mm, start, end); + mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); again: spin_lock(&mm->page_table_lock); for (address = start; address < end; address += sz) { @@ -2425,7 +2427,7 @@ again: if (address < end && !ref_page) goto again; } - mmu_notifier_invalidate_range_end(mm, start, end); + mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); tlb_end_vma(tlb, vma); } @@ -2525,6 +2527,8 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, struct page *old_page, *new_page; int avoidcopy; int outside_reserve = 0; + unsigned long mmun_start; /* For mmu_notifiers */ + unsigned long mmun_end; /* For mmu_notifiers */ old_page = pte_page(pte); @@ -2611,6 +2615,9 @@ retry_avoidcopy: pages_per_huge_page(h)); __SetPageUptodate(new_page); + mmun_start = address & huge_page_mask(h); + mmun_end = mmun_start + huge_page_size(h); + mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); /* * Retake the page_table_lock to check for racing updates * before the page tables are altered @@ -2619,9 +2626,6 @@ retry_avoidcopy: ptep = 
huge_pte_offset(mm, address & huge_page_mask(h)); if (likely(pte_same(huge_ptep_get(ptep), pte))) { /* Break COW */ - mmu_notifier_invalidate_range_start(mm, - address & huge_page_mask(h), - (address & huge_page_mask(h)) + huge_page_size(h)); huge_ptep_clear_flush(vma, address, ptep); set_huge_pte_at(mm, address, ptep, make_huge_pte(vma, new_page, 1)); @@ -2629,10 +2633,11 @@ retry_avoidcopy: hugepage_add_new_anon_rmap(new_page, vma, address); /* Make the old page be freed below */ new_page = old_page; - mmu_notifier_invalidate_range_end(mm, - address & huge_page_mask(h), - (address & huge_page_mask(h)) + huge_page_size(h)); } + spin_unlock(&mm->page_table_lock); + mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); + /* Caller expects lock to be held */ + spin_lock(&mm->page_table_lock); page_cache_release(new_page); page_cache_release(old_page); return 0; diff --git a/mm/memory.c b/mm/memory.c index 5f5d1f039bf4..b03a4a21c1d0 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -712,7 +712,7 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, add_taint(TAINT_BAD_PAGE); } -static inline int is_cow_mapping(vm_flags_t flags) +static inline bool is_cow_mapping(vm_flags_t flags) { return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; } @@ -1039,6 +1039,9 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, unsigned long next; unsigned long addr = vma->vm_start; unsigned long end = vma->vm_end; + unsigned long mmun_start; /* For mmu_notifiers */ + unsigned long mmun_end; /* For mmu_notifiers */ + bool is_cow; int ret; /* @@ -1072,8 +1075,12 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, * parent mm. And a permission downgrade will only happen if * is_cow_mapping() returns true. 
*/ - if (is_cow_mapping(vma->vm_flags)) - mmu_notifier_invalidate_range_start(src_mm, addr, end); + is_cow = is_cow_mapping(vma->vm_flags); + mmun_start = addr; + mmun_end = end; + if (is_cow) + mmu_notifier_invalidate_range_start(src_mm, mmun_start, + mmun_end); ret = 0; dst_pgd = pgd_offset(dst_mm, addr); @@ -1089,9 +1096,8 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, } } while (dst_pgd++, src_pgd++, addr = next, addr != end); - if (is_cow_mapping(vma->vm_flags)) - mmu_notifier_invalidate_range_end(src_mm, - vma->vm_start, end); + if (is_cow) + mmu_notifier_invalidate_range_end(src_mm, mmun_start, mmun_end); return ret; } @@ -2516,7 +2522,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, spinlock_t *ptl, pte_t orig_pte) __releases(ptl) { - struct page *old_page, *new_page; + struct page *old_page, *new_page = NULL; pte_t entry; int ret = 0; int page_mkwrite = 0; @@ -2760,10 +2766,14 @@ gotten: } else mem_cgroup_uncharge_page(new_page); - if (new_page) - page_cache_release(new_page); unlock: pte_unmap_unlock(page_table, ptl); + if (new_page) { + if (new_page == old_page) + /* cow happened, notify before releasing old_page */ + mmu_notifier_invalidate_page(mm, address); + page_cache_release(new_page); + } if (old_page) { /* * Don't let another task, with possibly unlocked vma, diff --git a/mm/mremap.c b/mm/mremap.c index 3b639a4b26bd..1b61c2d3307a 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -149,11 +149,15 @@ unsigned long move_page_tables(struct vm_area_struct *vma, unsigned long extent, next, old_end; pmd_t *old_pmd, *new_pmd; bool need_flush = false; + unsigned long mmun_start; /* For mmu_notifiers */ + unsigned long mmun_end; /* For mmu_notifiers */ old_end = old_addr + len; flush_cache_range(vma, old_addr, old_end); - mmu_notifier_invalidate_range_start(vma->vm_mm, old_addr, old_end); + mmun_start = old_addr; + mmun_end = old_end; + mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end); for (; old_addr < old_end; old_addr += extent, new_addr += extent) { cond_resched(); @@ -197,7 +201,7 @@ unsigned long move_page_tables(struct vm_area_struct *vma, if (likely(need_flush)) flush_tlb_range(vma, old_end-len, old_addr); - mmu_notifier_invalidate_range_end(vma->vm_mm, old_end-len, old_end); + mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end); return len + old_addr - old_end; /* how much done */ } diff --git a/mm/rmap.c b/mm/rmap.c index bf03149f495c..7df7984d476c 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -884,7 +884,7 @@ static int page_mkclean_one(struct page *page, struct vm_area_struct *vma, pte_t entry; flush_cache_page(vma, address, pte_pfn(*pte)); - entry = ptep_clear_flush_notify(vma, address, pte); + entry = ptep_clear_flush(vma, address, pte); entry = pte_wrprotect(entry); entry = pte_mkclean(entry); set_pte_at(mm, address, pte, entry); @@ -892,6 +892,9 @@ static int page_mkclean_one(struct page *page, struct vm_area_struct *vma, } pte_unmap_unlock(pte, ptl); + + if (ret) + mmu_notifier_invalidate_page(mm, address); out: return ret; } @@ -1212,7 +1215,7 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, /* Nuke the page table entry. */ flush_cache_page(vma, address, page_to_pfn(page)); - pteval = ptep_clear_flush_notify(vma, address, pte); + pteval = ptep_clear_flush(vma, address, pte); /* Move the dirty bit to the physical page now the pte is gone. 
*/ if (pte_dirty(pteval)) @@ -1274,6 +1277,8 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, out_unmap: pte_unmap_unlock(pte, ptl); + if (ret != SWAP_FAIL) + mmu_notifier_invalidate_page(mm, address); out: return ret; @@ -1338,6 +1343,8 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, spinlock_t *ptl; struct page *page; unsigned long address; + unsigned long mmun_start; /* For mmu_notifiers */ + unsigned long mmun_end; /* For mmu_notifiers */ unsigned long end; int ret = SWAP_AGAIN; int locked_vma = 0; @@ -1361,6 +1368,10 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, if (!pmd_present(*pmd)) return ret; + mmun_start = address; + mmun_end = end; + mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); + /* * If we can acquire the mmap_sem for read, and vma is VM_LOCKED, * keep the sem while scanning the cluster for mlocking pages. @@ -1394,7 +1405,7 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, /* Nuke the page table entry. */ flush_cache_page(vma, address, pte_pfn(*pte)); - pteval = ptep_clear_flush_notify(vma, address, pte); + pteval = ptep_clear_flush(vma, address, pte); /* If nonlinear, store the file page offset in the pte. */ if (page->index != linear_page_index(vma, address)) @@ -1410,6 +1421,7 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, (*mapcount)--; } pte_unmap_unlock(pte - 1, ptl); + mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); if (locked_vma) up_read(&vma->vm_mm->mmap_sem); return ret; -- cgit v1.2.3 From ef5d437f71afdf4afdbab99213add99f4b1318fd Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 25 Oct 2012 13:37:31 -0700 Subject: mm: fix XFS oops due to dirty pages without buffers on s390 On s390, any write to a page (even from the kernel itself) sets the architecture-specific page dirty bit. Thus when a page is written to via a buffered write, the HW dirty bit gets set, and when we later map and unmap the page, page_remove_rmap() finds the dirty bit and calls set_page_dirty(). Dirtying a page which shouldn't be dirty can cause all sorts of problems for filesystems. The bug we observed in practice is that buffers from the page get freed, so when the page later gets marked dirty and writeback writes it, XFS crashes due to an assertion BUG_ON(!PagePrivate(page)) in page_buffers() called from xfs_count_page_state(). A similar problem can also happen when a zero_user_segment() call from xfs_vm_writepage() (or block_write_full_page() for that matter) sets the hardware dirty bit during writeback; the buffers later get freed and the page is then unmapped. Fix the issue by ignoring the s390 HW dirty bit for page cache pages of mappings with mapping_cap_account_dirty(). This is safe because for such mappings, when a page gets marked writeable in the PTE it is also marked dirty in do_wp_page() or do_page_fault(). When the dirty bit is cleared by clear_page_dirty_for_io(), the page gets write-protected in page_mkclean(). So a pagecache page is writeable if and only if it is dirty. Thanks to Hugh Dickins for pointing out that the mapping has to have mapping_cap_account_dirty() for things to work, and for proposing a cleaned-up variant of the patch. The patch has survived about two hours of running fsx-linux on tmpfs while heavily swapping, and several days of running on our build machines where the original problem was triggered.
Signed-off-by: Jan Kara Cc: Martin Schwidefsky Cc: Mel Gorman Cc: Hugh Dickins Cc: Heiko Carstens Cc: [3.0+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/rmap.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) (limited to 'mm/rmap.c') diff --git a/mm/rmap.c b/mm/rmap.c index 7df7984d476c..2ee1ef0f317b 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -56,6 +56,7 @@ #include #include #include +#include #include @@ -926,11 +927,8 @@ int page_mkclean(struct page *page) if (page_mapped(page)) { struct address_space *mapping = page_mapping(page); - if (mapping) { + if (mapping) ret = page_mkclean_file(mapping, page); - if (page_test_and_clear_dirty(page_to_pfn(page), 1)) - ret = 1; - } } return ret; @@ -1116,6 +1114,7 @@ void page_add_file_rmap(struct page *page) */ void page_remove_rmap(struct page *page) { + struct address_space *mapping = page_mapping(page); bool anon = PageAnon(page); bool locked; unsigned long flags; @@ -1138,8 +1137,19 @@ void page_remove_rmap(struct page *page) * this if the page is anon, so about to be freed; but perhaps * not if it's in swapcache - there might be another pte slot * containing the swap entry, but page not yet written to swap. + * + * And we can skip it on file pages, so long as the filesystem + * participates in dirty tracking; but need to catch shm and tmpfs + * and ramfs pages which have been modified since creation by read + * fault. + * + * Note that mapping must be decided above, before decrementing + * mapcount (which luckily provides a barrier): once page is unmapped, + * it could be truncated and page->mapping reset to NULL at any moment. + * Note also that we are relying on page_mapping(page) to set mapping + * to &swapper_space when PageSwapCache(page). */ - if ((!anon || PageSwapCache(page)) && + if (mapping && !mapping_cap_account_dirty(mapping) && page_test_and_clear_dirty(page_to_pfn(page), 1)) set_page_dirty(page); /* -- cgit v1.2.3
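As a reading aid, the check that the s390 patch above adds to page_remove_rmap() can be restated as a small helper. The sketch below is not kernel source and the helper name should_transfer_hw_dirty() is invented for illustration; in the actual patch the same condition is open-coded in page_remove_rmap() and, when it holds, set_page_dirty() is called.

/*
 * Illustrative sketch of the dirty-bit decision introduced above.
 * should_transfer_hw_dirty() is an invented name; mm/rmap.c keeps the
 * test inline in page_remove_rmap().
 */
#include <linux/mm.h>		/* page_to_pfn(), page_test_and_clear_dirty() */
#include <linux/pagemap.h>	/* struct address_space */
#include <linux/backing-dev.h>	/* mapping_cap_account_dirty() */

static bool should_transfer_hw_dirty(struct page *page,
				     struct address_space *mapping)
{
	/* No mapping: anon page not in swapcache, or already truncated. */
	if (!mapping)
		return false;
	/*
	 * Filesystems that account dirty pages maintain "PTE writeable <=>
	 * page dirty" themselves (do_wp_page()/page_mkclean()), so the
	 * storage-key dirty bit would only report spurious kernel writes.
	 */
	if (mapping_cap_account_dirty(mapping))
		return false;
	/* shm/tmpfs/ramfs: the hardware dirty bit is the only record. */
	return page_test_and_clear_dirty(page_to_pfn(page), 1);
}

The earlier patch in this section (the one dropping the ptep_clear_flush_notify, pmdp_clear_flush_notify and pmdp_splitting_flush_notify wrappers) applies a single pattern at every call site: bracket the flush with an explicit mmu_notifier invalidation range taken outside the page table lock, instead of notifying from inside the wrapper. A minimal sketch of that pattern for the huge-pmd case follows; the wrapper function is again invented here, since the patch deliberately open-codes this at each call site.

/*
 * Sketch only -- not part of the patch. Shows the start/end bracketing
 * used throughout mm/huge_memory.c after the conversion.
 */
#include <linux/mm.h>
#include <linux/huge_mm.h>	/* HPAGE_PMD_SIZE */
#include <linux/mmu_notifier.h>
#include <asm/pgtable.h>	/* pmdp_clear_flush() */

static pmd_t clear_flush_huge_pmd_notify(struct mm_struct *mm,
					 struct vm_area_struct *vma,
					 unsigned long haddr, pmd_t *pmd)
{
	const unsigned long mmun_start = haddr;			/* For mmu_notifiers */
	const unsigned long mmun_end = haddr + HPAGE_PMD_SIZE;	/* For mmu_notifiers */
	pmd_t entry;

	/* Notify before taking the PT lock, flush under it, finish after. */
	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
	spin_lock(&mm->page_table_lock);
	entry = pmdp_clear_flush(vma, haddr, pmd);
	spin_unlock(&mm->page_table_lock);
	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
	return entry;
}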