From abf09bed3cceadd809f0356065c2ada6cee90d4a Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Wed, 7 Nov 2012 13:17:37 +0100 Subject: s390/mm: implement software dirty bits The s390 architecture is unique with respect to dirty page detection: it uses the change bit in the per-page storage key to track page modifications. All other architectures track dirty bits by means of page table entries. This property of s390 has caused numerous problems in the past, e.g. see git commit ef5d437f71afdf4a "mm: fix XFS oops due to dirty pages without buffers on s390". To avoid future issues with regard to per-page dirty bits, convert s390 to a fault-based software dirty bit detection mechanism. All user page table entries which are marked as clean will be hardware read-only, even if the pte is supposed to be writable. A write by the user process will trigger a protection fault, which will cause the user pte to be marked as dirty and the hardware read-only bit to be removed. With this change, the dirty bit in the storage key is irrelevant for Linux as a host, but the storage key is still required for KVM guests. The effect is that page_test_and_clear_dirty and the related code can be removed. The referenced bit in the storage key is still used by the page_test_and_clear_young primitive to provide page age information. For page cache pages of mappings with mapping_cap_account_dirty, there will not be any change in behavior, as the dirty bit tracking already uses read-only ptes to control the amount of dirty pages. Additional protection faults can occur only for swap cache pages and pages of mappings without mapping_cap_account_dirty. To avoid an excessive number of additional faults, the mk_pte primitive checks for PageDirty if the pgprot value allows for writes and pre-dirties the pte. That avoids all additional faults for tmpfs and shmem pages until these pages are added to the swap cache. 
Signed-off-by: Martin Schwidefsky --- mm/rmap.c | 24 ------------------------ 1 file changed, 24 deletions(-) (limited to 'mm/rmap.c') diff --git a/mm/rmap.c b/mm/rmap.c index 2c78f8cadc95..3d38edffda41 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1126,7 +1126,6 @@ void page_add_file_rmap(struct page *page) */ void page_remove_rmap(struct page *page) { - struct address_space *mapping = page_mapping(page); bool anon = PageAnon(page); bool locked; unsigned long flags; @@ -1143,29 +1142,6 @@ void page_remove_rmap(struct page *page) if (!atomic_add_negative(-1, &page->_mapcount)) goto out; - /* - * Now that the last pte has gone, s390 must transfer dirty - * flag from storage key to struct page. We can usually skip - * this if the page is anon, so about to be freed; but perhaps - * not if it's in swapcache - there might be another pte slot - * containing the swap entry, but page not yet written to swap. - * - * And we can skip it on file pages, so long as the filesystem - * participates in dirty tracking (note that this is not only an - * optimization but also solves problems caused by dirty flag in - * storage key getting set by a write from inside kernel); but need to - * catch shm and tmpfs and ramfs pages which have been modified since - * creation by read fault. - * - * Note that mapping must be decided above, before decrementing - * mapcount (which luckily provides a barrier): once page is unmapped, - * it could be truncated and page->mapping reset to NULL at any moment. - * Note also that we are relying on page_mapping(page) to set mapping - * to &swapper_space when PageSwapCache(page). - */ - if (mapping && !mapping_cap_account_dirty(mapping) && - page_test_and_clear_dirty(page_to_pfn(page), 1)) - set_page_dirty(page); /* * Hugepages are not counted in NR_ANON_PAGES nor NR_FILE_MAPPED * and not charged by memcg for now. 
-- cgit v1.2.3 From 08b52706d505658eac0962d215ff697f898bbc13 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Fri, 22 Feb 2013 16:34:40 -0800 Subject: mm/rmap: rename anon_vma_unlock() => anon_vma_unlock_write() The comment in commit 4fc3f1d66b1e ("mm/rmap, migration: Make rmap_walk_anon() and try_to_unmap_anon() more scalable") says: | Rename anon_vma_[un]lock() => anon_vma_[un]lock_write(), | to make it clearer that it's an exclusive write-lock in | that case - suggested by Rik van Riel. But that commit renames only anon_vma_lock() Signed-off-by: Konstantin Khlebnikov Cc: Ingo Molnar Reviewed-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/huge_mm.h | 2 +- include/linux/rmap.h | 2 +- mm/huge_memory.c | 6 +++--- mm/mmap.c | 4 ++-- mm/mremap.c | 2 +- mm/rmap.c | 6 +++--- 6 files changed, 11 insertions(+), 11 deletions(-) (limited to 'mm/rmap.c') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 1d76f8ca90f0..ee1c244a62a1 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -113,7 +113,7 @@ extern void __split_huge_page_pmd(struct vm_area_struct *vma, do { \ pmd_t *____pmd = (__pmd); \ anon_vma_lock_write(__anon_vma); \ - anon_vma_unlock(__anon_vma); \ + anon_vma_unlock_write(__anon_vma); \ BUG_ON(pmd_trans_splitting(*____pmd) || \ pmd_trans_huge(*____pmd)); \ } while (0) diff --git a/include/linux/rmap.h b/include/linux/rmap.h index c20635c527a9..6dacb93a6d94 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -123,7 +123,7 @@ static inline void anon_vma_lock_write(struct anon_vma *anon_vma) down_write(&anon_vma->root->rwsem); } -static inline void anon_vma_unlock(struct anon_vma *anon_vma) +static inline void anon_vma_unlock_write(struct anon_vma *anon_vma) { up_write(&anon_vma->root->rwsem); } diff --git a/mm/huge_memory.c b/mm/huge_memory.c index f40b2ce23d60..b1cc6591ed83 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1830,7 +1830,7 @@ int 
split_huge_page(struct page *page) BUG_ON(PageCompound(page)); out_unlock: - anon_vma_unlock(anon_vma); + anon_vma_unlock_write(anon_vma); put_anon_vma(anon_vma); out: return ret; @@ -2322,7 +2322,7 @@ static void collapse_huge_page(struct mm_struct *mm, BUG_ON(!pmd_none(*pmd)); set_pmd_at(mm, address, pmd, _pmd); spin_unlock(&mm->page_table_lock); - anon_vma_unlock(vma->anon_vma); + anon_vma_unlock_write(vma->anon_vma); goto out; } @@ -2330,7 +2330,7 @@ static void collapse_huge_page(struct mm_struct *mm, * All pages are isolated and locked so anon_vma rmap * can't run anymore. */ - anon_vma_unlock(vma->anon_vma); + anon_vma_unlock_write(vma->anon_vma); __collapse_huge_page_copy(pte, new_page, vma, address, ptl); pte_unmap(pte); diff --git a/mm/mmap.c b/mm/mmap.c index 28416f6b8dd5..318e121affda 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -809,7 +809,7 @@ again: remove_next = 1 + (end > next->vm_end); anon_vma_interval_tree_post_update_vma(vma); if (adjust_next) anon_vma_interval_tree_post_update_vma(next); - anon_vma_unlock(anon_vma); + anon_vma_unlock_write(anon_vma); } if (mapping) mutex_unlock(&mapping->i_mmap_mutex); @@ -3017,7 +3017,7 @@ static void vm_unlock_anon_vma(struct anon_vma *anon_vma) if (!__test_and_clear_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_node)) BUG(); - anon_vma_unlock(anon_vma); + anon_vma_unlock_write(anon_vma); } } diff --git a/mm/mremap.c b/mm/mremap.c index ebe27a8efa62..463a25705ac6 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -135,7 +135,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, pte_unmap(new_pte - 1); pte_unmap_unlock(old_pte - 1, old_ptl); if (anon_vma) - anon_vma_unlock(anon_vma); + anon_vma_unlock_write(anon_vma); if (mapping) mutex_unlock(&mapping->i_mmap_mutex); } diff --git a/mm/rmap.c b/mm/rmap.c index 3d38edffda41..807c96bf0dc6 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -105,7 +105,7 @@ static inline void anon_vma_free(struct anon_vma *anon_vma) */ if 
(rwsem_is_locked(&anon_vma->root->rwsem)) { anon_vma_lock_write(anon_vma); - anon_vma_unlock(anon_vma); + anon_vma_unlock_write(anon_vma); } kmem_cache_free(anon_vma_cachep, anon_vma); @@ -191,7 +191,7 @@ int anon_vma_prepare(struct vm_area_struct *vma) avc = NULL; } spin_unlock(&mm->page_table_lock); - anon_vma_unlock(anon_vma); + anon_vma_unlock_write(anon_vma); if (unlikely(allocated)) put_anon_vma(allocated); @@ -308,7 +308,7 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma) vma->anon_vma = anon_vma; anon_vma_lock_write(anon_vma); anon_vma_chain_link(vma, avc, anon_vma); - anon_vma_unlock(anon_vma); + anon_vma_unlock_write(anon_vma); return 0; -- cgit v1.2.3