From cbc91f71b51b8335f1fc7ccfca8011f31a717367 Mon Sep 17 00:00:00 2001
From: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Date: Wed, 11 Apr 2012 16:05:27 +0530
Subject: uprobes/core: Decrement uprobe count before the pages are unmapped

Uprobes has a callback (uprobe_munmap()) in the unmap path to
maintain the uprobes count.

In the exit path this callback gets called in unlink_file_vma().
However by the time unlink_file_vma() is called, the pages would
have been unmapped (in unmap_vmas()) and the task->rss_stat counts
accounted (in zap_pte_range()).

If the exiting process has probepoints, uprobe_munmap() checks if
the breakpoint instruction was around before decrementing the probe
count.

This results in a file backed page being reread by uprobe_munmap()
and hence it does not find the breakpoint.

This patch fixes this problem by moving the callback to
unmap_single_vma(). Since unmap_single_vma() may not unmap the
complete vma, add start and end parameters to uprobe_munmap().

This bug became apparent courtesy of commit c3f0327f8e9d
("mm: add rss counters consistency check").

Signed-off-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Cc: Jim Keniston <jkenisto@linux.vnet.ibm.com>
Cc: Linux-mm <linux-mm@kvack.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Cc: Anton Arapov <anton@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/20120411103527.23245.9835.sendpatchset@srdronam.in.ibm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 mm/memory.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'mm/memory.c')

diff --git a/mm/memory.c b/mm/memory.c
index 6105f475fa86..bf8b4035277d 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1307,6 +1307,9 @@ static void unmap_single_vma(struct mmu_gather *tlb,
 	if (end <= vma->vm_start)
 		return;
 
+	if (vma->vm_file)
+		uprobe_munmap(vma, start, end);
+
 	if (vma->vm_flags & VM_ACCOUNT)
 		*nr_accounted += (end - start) >> PAGE_SHIFT;
 
-- 
cgit v1.2.3


From 7e027b14d53e9729f823ba8652095d1e309aa8e9 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Sun, 6 May 2012 13:43:15 -0700
Subject: vm: simplify unmap_vmas() calling convention

None of the callers want to pass in 'zap_details', and it doesn't even
make sense for the case of actually unmapping vma's.  So remove the
argument, and clean up the interface.

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm.h |  3 +--
 mm/memory.c        | 18 +++++++++---------
 mm/mmap.c          |  4 ++--
 3 files changed, 12 insertions(+), 13 deletions(-)

(limited to 'mm/memory.c')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 74aa71bea1e4..0aeded3a2f7c 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -898,8 +898,7 @@ void zap_page_range(struct vm_area_struct *vma, unsigned long address,
 		unsigned long size, struct zap_details *);
 void unmap_vmas(struct mmu_gather *tlb,
 		struct vm_area_struct *start_vma, unsigned long start_addr,
-		unsigned long end_addr, unsigned long *nr_accounted,
-		struct zap_details *);
+		unsigned long end_addr, unsigned long *nr_accounted);
 
 /**
  * mm_walk - callbacks for walk_page_range
diff --git a/mm/memory.c b/mm/memory.c
index 6105f475fa86..f7b6c9859796 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1340,7 +1340,6 @@ static void unmap_single_vma(struct mmu_gather *tlb,
  * @start_addr: virtual address at which to start unmapping
  * @end_addr: virtual address at which to end unmapping
  * @nr_accounted: Place number of unmapped pages in vm-accountable vma's here
- * @details: details of nonlinear truncation or shared cache invalidation
  *
  * Unmap all pages in the vma list.
  *
@@ -1355,15 +1354,13 @@ static void unmap_single_vma(struct mmu_gather *tlb,
  */
 void unmap_vmas(struct mmu_gather *tlb,
 		struct vm_area_struct *vma, unsigned long start_addr,
-		unsigned long end_addr, unsigned long *nr_accounted,
-		struct zap_details *details)
+		unsigned long end_addr, unsigned long *nr_accounted)
 {
 	struct mm_struct *mm = vma->vm_mm;
 
 	mmu_notifier_invalidate_range_start(mm, start_addr, end_addr);
 	for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next)
-		unmap_single_vma(tlb, vma, start_addr, end_addr, nr_accounted,
-				 details);
+		unmap_single_vma(tlb, vma, start_addr, end_addr, nr_accounted, NULL);
 	mmu_notifier_invalidate_range_end(mm, start_addr, end_addr);
 }
 
@@ -1376,19 +1373,22 @@ void unmap_vmas(struct mmu_gather *tlb,
  *
  * Caller must protect the VMA list
  */
-void zap_page_range(struct vm_area_struct *vma, unsigned long address,
+void zap_page_range(struct vm_area_struct *vma, unsigned long start,
 		unsigned long size, struct zap_details *details)
 {
 	struct mm_struct *mm = vma->vm_mm;
 	struct mmu_gather tlb;
-	unsigned long end = address + size;
+	unsigned long end = start + size;
 	unsigned long nr_accounted = 0;
 
 	lru_add_drain();
 	tlb_gather_mmu(&tlb, mm, 0);
 	update_hiwater_rss(mm);
-	unmap_vmas(&tlb, vma, address, end, &nr_accounted, details);
-	tlb_finish_mmu(&tlb, address, end);
+	mmu_notifier_invalidate_range_start(mm, start, end);
+	for ( ; vma && vma->vm_start < end; vma = vma->vm_next)
+		unmap_single_vma(&tlb, vma, start, end, &nr_accounted, details);
+	mmu_notifier_invalidate_range_end(mm, start, end);
+	tlb_finish_mmu(&tlb, start, end);
 }
 
 /**
diff --git a/mm/mmap.c b/mm/mmap.c
index 848ef52d9603..58806106fab6 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1917,7 +1917,7 @@ static void unmap_region(struct mm_struct *mm,
 	lru_add_drain();
 	tlb_gather_mmu(&tlb, mm, 0);
 	update_hiwater_rss(mm);
-	unmap_vmas(&tlb, vma, start, end, &nr_accounted, NULL);
+	unmap_vmas(&tlb, vma, start, end, &nr_accounted);
 	vm_unacct_memory(nr_accounted);
 	free_pgtables(&tlb, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS,
 				 next ? next->vm_start : 0);
@@ -2305,7 +2305,7 @@ void exit_mmap(struct mm_struct *mm)
 	tlb_gather_mmu(&tlb, mm, 1);
 	/* update_hiwater_rss(mm) here? but nobody should be looking */
 	/* Use -1 here to ensure all VMAs in the mm are unmapped */
-	unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL);
+	unmap_vmas(&tlb, vma, 0, -1, &nr_accounted);
 	vm_unacct_memory(nr_accounted);
 
 	free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, 0);
-- 
cgit v1.2.3


From 4f74d2c8e827af12596f153a564c868bf6dbe3dd Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Sun, 6 May 2012 13:54:06 -0700
Subject: vm: remove 'nr_accounted' calculations from the unmap_vmas()
 interfaces

The VM accounting makes no sense at this level, and half of the callers
didn't ever actually use the end result.  The only time we want to
unaccount the memory is when we actually remove the vma, so do the
accounting at that point instead.

This simplifies the interfaces (no need to pass down that silly page
counter to functions that really don't care), and also makes it much
more obvious what is actually going on: we do vm_[un]acct_memory() when
adding or removing the vma, not on random page walking.

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm.h |  5 ++---
 mm/memory.c        | 16 +++++-----------
 mm/mmap.c          | 18 ++++++++++++------
 3 files changed, 19 insertions(+), 20 deletions(-)

(limited to 'mm/memory.c')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 0aeded3a2f7c..7d5c37f24c63 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -896,9 +896,8 @@ int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
 		unsigned long size);
 void zap_page_range(struct vm_area_struct *vma, unsigned long address,
 		unsigned long size, struct zap_details *);
-void unmap_vmas(struct mmu_gather *tlb,
-		struct vm_area_struct *start_vma, unsigned long start_addr,
-		unsigned long end_addr, unsigned long *nr_accounted);
+void unmap_vmas(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
+		unsigned long start, unsigned long end);
 
 /**
  * mm_walk - callbacks for walk_page_range
diff --git a/mm/memory.c b/mm/memory.c
index f7b6c9859796..1e77da6d82c1 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1295,7 +1295,7 @@ static void unmap_page_range(struct mmu_gather *tlb,
 
 static void unmap_single_vma(struct mmu_gather *tlb,
 		struct vm_area_struct *vma, unsigned long start_addr,
-		unsigned long end_addr, unsigned long *nr_accounted,
+		unsigned long end_addr,
 		struct zap_details *details)
 {
 	unsigned long start = max(vma->vm_start, start_addr);
@@ -1307,9 +1307,6 @@ static void unmap_single_vma(struct mmu_gather *tlb,
 	if (end <= vma->vm_start)
 		return;
 
-	if (vma->vm_flags & VM_ACCOUNT)
-		*nr_accounted += (end - start) >> PAGE_SHIFT;
-
 	if (unlikely(is_pfn_mapping(vma)))
 		untrack_pfn_vma(vma, 0, 0);
 
@@ -1339,7 +1336,6 @@ static void unmap_single_vma(struct mmu_gather *tlb,
  * @vma: the starting vma
  * @start_addr: virtual address at which to start unmapping
  * @end_addr: virtual address at which to end unmapping
- * @nr_accounted: Place number of unmapped pages in vm-accountable vma's here
  *
  * Unmap all pages in the vma list.
  *
@@ -1354,13 +1350,13 @@ static void unmap_single_vma(struct mmu_gather *tlb,
  */
 void unmap_vmas(struct mmu_gather *tlb,
 		struct vm_area_struct *vma, unsigned long start_addr,
-		unsigned long end_addr, unsigned long *nr_accounted)
+		unsigned long end_addr)
 {
 	struct mm_struct *mm = vma->vm_mm;
 
 	mmu_notifier_invalidate_range_start(mm, start_addr, end_addr);
 	for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next)
-		unmap_single_vma(tlb, vma, start_addr, end_addr, nr_accounted, NULL);
+		unmap_single_vma(tlb, vma, start_addr, end_addr, NULL);
 	mmu_notifier_invalidate_range_end(mm, start_addr, end_addr);
 }
 
@@ -1379,14 +1375,13 @@ void zap_page_range(struct vm_area_struct *vma, unsigned long start,
 	struct mm_struct *mm = vma->vm_mm;
 	struct mmu_gather tlb;
 	unsigned long end = start + size;
-	unsigned long nr_accounted = 0;
 
 	lru_add_drain();
 	tlb_gather_mmu(&tlb, mm, 0);
 	update_hiwater_rss(mm);
 	mmu_notifier_invalidate_range_start(mm, start, end);
 	for ( ; vma && vma->vm_start < end; vma = vma->vm_next)
-		unmap_single_vma(&tlb, vma, start, end, &nr_accounted, details);
+		unmap_single_vma(&tlb, vma, start, end, details);
 	mmu_notifier_invalidate_range_end(mm, start, end);
 	tlb_finish_mmu(&tlb, start, end);
 }
@@ -1406,13 +1401,12 @@ static void zap_page_range_single(struct vm_area_struct *vma, unsigned long addr
 	struct mm_struct *mm = vma->vm_mm;
 	struct mmu_gather tlb;
 	unsigned long end = address + size;
-	unsigned long nr_accounted = 0;
 
 	lru_add_drain();
 	tlb_gather_mmu(&tlb, mm, 0);
 	update_hiwater_rss(mm);
 	mmu_notifier_invalidate_range_start(mm, address, end);
-	unmap_single_vma(&tlb, vma, address, end, &nr_accounted, details);
+	unmap_single_vma(&tlb, vma, address, end, details);
 	mmu_notifier_invalidate_range_end(mm, address, end);
 	tlb_finish_mmu(&tlb, address, end);
 }
diff --git a/mm/mmap.c b/mm/mmap.c
index 58806106fab6..69a1889f3790 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1889,15 +1889,20 @@ find_extend_vma(struct mm_struct * mm, unsigned long addr)
  */
 static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma)
 {
+	unsigned long nr_accounted = 0;
+
 	/* Update high watermark before we lower total_vm */
 	update_hiwater_vm(mm);
 	do {
 		long nrpages = vma_pages(vma);
 
+		if (vma->vm_flags & VM_ACCOUNT)
+			nr_accounted += nrpages;
 		mm->total_vm -= nrpages;
 		vm_stat_account(mm, vma->vm_flags, vma->vm_file, -nrpages);
 		vma = remove_vma(vma);
 	} while (vma);
+	vm_unacct_memory(nr_accounted);
 	validate_mm(mm);
 }
 
@@ -1912,13 +1917,11 @@ static void unmap_region(struct mm_struct *mm,
 {
 	struct vm_area_struct *next = prev? prev->vm_next: mm->mmap;
 	struct mmu_gather tlb;
-	unsigned long nr_accounted = 0;
 
 	lru_add_drain();
 	tlb_gather_mmu(&tlb, mm, 0);
 	update_hiwater_rss(mm);
-	unmap_vmas(&tlb, vma, start, end, &nr_accounted);
-	vm_unacct_memory(nr_accounted);
+	unmap_vmas(&tlb, vma, start, end);
 	free_pgtables(&tlb, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS,
 				 next ? next->vm_start : 0);
 	tlb_finish_mmu(&tlb, start, end);
@@ -2305,8 +2308,7 @@ void exit_mmap(struct mm_struct *mm)
 	tlb_gather_mmu(&tlb, mm, 1);
 	/* update_hiwater_rss(mm) here? but nobody should be looking */
 	/* Use -1 here to ensure all VMAs in the mm are unmapped */
-	unmap_vmas(&tlb, vma, 0, -1, &nr_accounted);
-	vm_unacct_memory(nr_accounted);
+	unmap_vmas(&tlb, vma, 0, -1);
 
 	free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, 0);
 	tlb_finish_mmu(&tlb, 0, -1);
@@ -2315,8 +2317,12 @@ void exit_mmap(struct mm_struct *mm)
 	 * Walk the list again, actually closing and freeing it,
 	 * with preemption enabled, without holding any MM locks.
 	 */
-	while (vma)
+	while (vma) {
+		if (vma->vm_flags & VM_ACCOUNT)
+			nr_accounted += vma_pages(vma);
 		vma = remove_vma(vma);
+	}
+	vm_unacct_memory(nr_accounted);
 
 	BUG_ON(mm->nr_ptes > (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT);
 }
-- 
cgit v1.2.3


From e709ffd6169ccd259eb5874e853303e91e94e829 Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@redhat.com>
Date: Tue, 29 May 2012 15:06:18 -0700
Subject: mm: remove swap token code

The swap token code no longer fits in with the current VM model.  It
does not play well with cgroups or the better NUMA placement code in
development, since we have only one swap token globally.

It also has the potential to mess with scalability of the system, by
increasing the number of non-reclaimable pages on the active and
inactive anon LRU lists.

Last but not least, the swap token code has been broken for a year
without complaints, as reported by Konstantin Khlebnikov.  This suggests
we no longer have much use for it.

The days of sub-1G memory systems with heavy use of swap are over.  If
we ever need thrashing reducing code in the future, we will have to
implement something that does scale.

Signed-off-by: Rik van Riel <riel@redhat.com>
Cc: Konstantin Khlebnikov <khlebnikov@openvz.org>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: Hugh Dickins <hughd@google.com>
Acked-by: Bob Picco <bpicco@meloft.net>
Acked-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm_types.h      |  11 ---
 include/linux/swap.h          |  35 ----------
 include/trace/events/vmscan.h |  82 ----------------------
 kernel/fork.c                 |   9 ---
 mm/Makefile                   |   2 +-
 mm/memcontrol.c               |   1 -
 mm/memory.c                   |   2 +-
 mm/rmap.c                     |   6 --
 mm/thrash.c                   | 155 ------------------------------------------
 mm/vmscan.c                   |   6 --
 10 files changed, 2 insertions(+), 307 deletions(-)
 delete mode 100644 mm/thrash.c

(limited to 'mm/memory.c')

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 26574c726121..dad95bdd06d7 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -345,17 +345,6 @@ struct mm_struct {
 	/* Architecture-specific MM context */
 	mm_context_t context;
 
-	/* Swap token stuff */
-	/*
-	 * Last value of global fault stamp as seen by this process.
-	 * In other words, this value gives an indication of how long
-	 * it has been since this task got the token.
-	 * Look at mm/thrash.c
-	 */
-	unsigned int faultstamp;
-	unsigned int token_priority;
-	unsigned int last_interval;
-
 	unsigned long flags; /* Must use atomic bitops to access the bits */
 
 	struct core_state *core_state; /* coredumping support */
diff --git a/include/linux/swap.h b/include/linux/swap.h
index b1fd5c7925fe..bc3073ce95cc 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -355,23 +355,6 @@ extern int reuse_swap_page(struct page *);
 extern int try_to_free_swap(struct page *);
 struct backing_dev_info;
 
-/* linux/mm/thrash.c */
-extern struct mm_struct *swap_token_mm;
-extern void grab_swap_token(struct mm_struct *);
-extern void __put_swap_token(struct mm_struct *);
-extern void disable_swap_token(struct mem_cgroup *memcg);
-
-static inline int has_swap_token(struct mm_struct *mm)
-{
-	return (mm == swap_token_mm);
-}
-
-static inline void put_swap_token(struct mm_struct *mm)
-{
-	if (has_swap_token(mm))
-		__put_swap_token(mm);
-}
-
 #ifdef CONFIG_CGROUP_MEM_RES_CTLR
 extern void
 mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout);
@@ -476,24 +459,6 @@ static inline swp_entry_t get_swap_page(void)
 	return entry;
 }
 
-/* linux/mm/thrash.c */
-static inline void put_swap_token(struct mm_struct *mm)
-{
-}
-
-static inline void grab_swap_token(struct mm_struct *mm)
-{
-}
-
-static inline int has_swap_token(struct mm_struct *mm)
-{
-	return 0;
-}
-
-static inline void disable_swap_token(struct mem_cgroup *memcg)
-{
-}
-
 static inline void
 mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent)
 {
diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h
index f64560e204bc..572195459d58 100644
--- a/include/trace/events/vmscan.h
+++ b/include/trace/events/vmscan.h
@@ -395,88 +395,6 @@ TRACE_EVENT(mm_vmscan_lru_shrink_inactive,
 		show_reclaim_flags(__entry->reclaim_flags))
 );
 
-TRACE_EVENT(replace_swap_token,
-	TP_PROTO(struct mm_struct *old_mm,
-		 struct mm_struct *new_mm),
-
-	TP_ARGS(old_mm, new_mm),
-
-	TP_STRUCT__entry(
-		__field(struct mm_struct*,	old_mm)
-		__field(unsigned int,		old_prio)
-		__field(struct mm_struct*,	new_mm)
-		__field(unsigned int,		new_prio)
-	),
-
-	TP_fast_assign(
-		__entry->old_mm   = old_mm;
-		__entry->old_prio = old_mm ? old_mm->token_priority : 0;
-		__entry->new_mm   = new_mm;
-		__entry->new_prio = new_mm->token_priority;
-	),
-
-	TP_printk("old_token_mm=%p old_prio=%u new_token_mm=%p new_prio=%u",
-		  __entry->old_mm, __entry->old_prio,
-		  __entry->new_mm, __entry->new_prio)
-);
-
-DECLARE_EVENT_CLASS(put_swap_token_template,
-	TP_PROTO(struct mm_struct *swap_token_mm),
-
-	TP_ARGS(swap_token_mm),
-
-	TP_STRUCT__entry(
-		__field(struct mm_struct*, swap_token_mm)
-	),
-
-	TP_fast_assign(
-		__entry->swap_token_mm = swap_token_mm;
-	),
-
-	TP_printk("token_mm=%p", __entry->swap_token_mm)
-);
-
-DEFINE_EVENT(put_swap_token_template, put_swap_token,
-	TP_PROTO(struct mm_struct *swap_token_mm),
-	TP_ARGS(swap_token_mm)
-);
-
-DEFINE_EVENT_CONDITION(put_swap_token_template, disable_swap_token,
-	TP_PROTO(struct mm_struct *swap_token_mm),
-	TP_ARGS(swap_token_mm),
-	TP_CONDITION(swap_token_mm != NULL)
-);
-
-TRACE_EVENT_CONDITION(update_swap_token_priority,
-	TP_PROTO(struct mm_struct *mm,
-		 unsigned int old_prio,
-		 struct mm_struct *swap_token_mm),
-
-	TP_ARGS(mm, old_prio, swap_token_mm),
-
-	TP_CONDITION(mm->token_priority != old_prio),
-
-	TP_STRUCT__entry(
-		__field(struct mm_struct*, mm)
-		__field(unsigned int, old_prio)
-		__field(unsigned int, new_prio)
-		__field(struct mm_struct*, swap_token_mm)
-		__field(unsigned int, swap_token_prio)
-	),
-
-	TP_fast_assign(
-		__entry->mm		= mm;
-		__entry->old_prio	= old_prio;
-		__entry->new_prio	= mm->token_priority;
-		__entry->swap_token_mm	= swap_token_mm;
-		__entry->swap_token_prio = swap_token_mm ? swap_token_mm->token_priority : 0;
-	),
-
-	TP_printk("mm=%p old_prio=%u new_prio=%u swap_token_mm=%p token_prio=%u",
-		  __entry->mm, __entry->old_prio, __entry->new_prio,
-		  __entry->swap_token_mm, __entry->swap_token_prio)
-);
-
 #endif /* _TRACE_VMSCAN_H */
 
 /* This part must be outside protection */
diff --git a/kernel/fork.c b/kernel/fork.c
index 47b4e4f379f9..5b13eea2e757 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -614,7 +614,6 @@ void mmput(struct mm_struct *mm)
 			list_del(&mm->mmlist);
 			spin_unlock(&mmlist_lock);
 		}
-		put_swap_token(mm);
 		if (mm->binfmt)
 			module_put(mm->binfmt->module);
 		mmdrop(mm);
@@ -831,10 +830,6 @@ struct mm_struct *dup_mm(struct task_struct *tsk)
 	memcpy(mm, oldmm, sizeof(*mm));
 	mm_init_cpumask(mm);
 
-	/* Initializing for Swap token stuff */
-	mm->token_priority = 0;
-	mm->last_interval = 0;
-
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 	mm->pmd_huge_pte = NULL;
 #endif
@@ -913,10 +908,6 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
 		goto fail_nomem;
 
 good_mm:
-	/* Initializing for Swap token stuff */
-	mm->token_priority = 0;
-	mm->last_interval = 0;
-
 	tsk->mm = mm;
 	tsk->active_mm = mm;
 	return 0;
diff --git a/mm/Makefile b/mm/Makefile
index 8aada89efbbb..ccecbf9818f5 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -25,7 +25,7 @@ endif
 obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o
 
 obj-$(CONFIG_BOUNCE)	+= bounce.o
-obj-$(CONFIG_SWAP)	+= page_io.o swap_state.o swapfile.o thrash.o
+obj-$(CONFIG_SWAP)	+= page_io.o swap_state.o swapfile.o
 obj-$(CONFIG_HAS_DMA)	+= dmapool.o
 obj-$(CONFIG_HUGETLBFS)	+= hugetlb.o
 obj-$(CONFIG_NUMA) 	+= mempolicy.o
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index f342778a0c0a..92675fe8a2ef 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5598,7 +5598,6 @@ static void mem_cgroup_move_task(struct cgroup *cont,
 	if (mm) {
 		if (mc.to)
 			mem_cgroup_move_charge(mm);
-		put_swap_token(mm);
 		mmput(mm);
 	}
 	if (mc.to)
diff --git a/mm/memory.c b/mm/memory.c
index e40f6759ba98..2bf9e110437c 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2908,7 +2908,6 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	delayacct_set_flag(DELAYACCT_PF_SWAPIN);
 	page = lookup_swap_cache(entry);
 	if (!page) {
-		grab_swap_token(mm); /* Contend for token _before_ read-in */
 		page = swapin_readahead(entry,
 					GFP_HIGHUSER_MOVABLE, vma, address);
 		if (!page) {
@@ -2938,6 +2937,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	}
 
 	locked = lock_page_or_retry(page, mm, flags);
+
 	delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
 	if (!locked) {
 		ret |= VM_FAULT_RETRY;
diff --git a/mm/rmap.c b/mm/rmap.c
index 5b5ad584ffb7..0f3b7cda2a24 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -755,12 +755,6 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma,
 		pte_unmap_unlock(pte, ptl);
 	}
 
-	/* Pretend the page is referenced if the task has the
-	   swap token and is in the middle of a page fault. */
-	if (mm != current->mm && has_swap_token(mm) &&
-			rwsem_is_locked(&mm->mmap_sem))
-		referenced++;
-
 	(*mapcount)--;
 
 	if (referenced)
diff --git a/mm/thrash.c b/mm/thrash.c
deleted file mode 100644
index 57ad495dbd54..000000000000
--- a/mm/thrash.c
+++ /dev/null
@@ -1,155 +0,0 @@
-/*
- * mm/thrash.c
- *
- * Copyright (C) 2004, Red Hat, Inc.
- * Copyright (C) 2004, Rik van Riel <riel@redhat.com>
- * Released under the GPL, see the file COPYING for details.
- *
- * Simple token based thrashing protection, using the algorithm
- * described in: http://www.cse.ohio-state.edu/hpcs/WWW/HTML/publications/abs05-1.html
- *
- * Sep 2006, Ashwin Chaugule <ashwin.chaugule@celunite.com>
- * Improved algorithm to pass token:
- * Each task has a priority which is incremented if it contended
- * for the token in an interval less than its previous attempt.
- * If the token is acquired, that task's priority is boosted to prevent
- * the token from bouncing around too often and to let the task make
- * some progress in its execution.
- */
-
-#include <linux/jiffies.h>
-#include <linux/mm.h>
-#include <linux/sched.h>
-#include <linux/swap.h>
-#include <linux/memcontrol.h>
-
-#include <trace/events/vmscan.h>
-
-#define TOKEN_AGING_INTERVAL	(0xFF)
-
-static DEFINE_SPINLOCK(swap_token_lock);
-struct mm_struct *swap_token_mm;
-static struct mem_cgroup *swap_token_memcg;
-
-#ifdef CONFIG_CGROUP_MEM_RES_CTLR
-static struct mem_cgroup *swap_token_memcg_from_mm(struct mm_struct *mm)
-{
-	struct mem_cgroup *memcg;
-
-	memcg = try_get_mem_cgroup_from_mm(mm);
-	if (memcg)
-		css_put(mem_cgroup_css(memcg));
-
-	return memcg;
-}
-#else
-static struct mem_cgroup *swap_token_memcg_from_mm(struct mm_struct *mm)
-{
-	return NULL;
-}
-#endif
-
-void grab_swap_token(struct mm_struct *mm)
-{
-	int current_interval;
-	unsigned int old_prio = mm->token_priority;
-	static unsigned int global_faults;
-	static unsigned int last_aging;
-
-	global_faults++;
-
-	current_interval = global_faults - mm->faultstamp;
-
-	if (!spin_trylock(&swap_token_lock))
-		return;
-
-	/* First come first served */
-	if (!swap_token_mm)
-		goto replace_token;
-
-	/*
-	 * Usually, we don't need priority aging because long interval faults
-	 * makes priority decrease quickly. But there is one exception. If the
-	 * token owner task is sleeping, it never make long interval faults.
-	 * Thus, we need a priority aging mechanism instead. The requirements
-	 * of priority aging are
-	 *  1) An aging interval is reasonable enough long. Too short aging
-	 *     interval makes quick swap token lost and decrease performance.
-	 *  2) The swap token owner task have to get priority aging even if
-	 *     it's under sleep.
-	 */
-	if ((global_faults - last_aging) > TOKEN_AGING_INTERVAL) {
-		swap_token_mm->token_priority /= 2;
-		last_aging = global_faults;
-	}
-
-	if (mm == swap_token_mm) {
-		mm->token_priority += 2;
-		goto update_priority;
-	}
-
-	if (current_interval < mm->last_interval)
-		mm->token_priority++;
-	else {
-		if (likely(mm->token_priority > 0))
-			mm->token_priority--;
-	}
-
-	/* Check if we deserve the token */
-	if (mm->token_priority > swap_token_mm->token_priority)
-		goto replace_token;
-
-update_priority:
-	trace_update_swap_token_priority(mm, old_prio, swap_token_mm);
-
-out:
-	mm->faultstamp = global_faults;
-	mm->last_interval = current_interval;
-	spin_unlock(&swap_token_lock);
-	return;
-
-replace_token:
-	mm->token_priority += 2;
-	trace_replace_swap_token(swap_token_mm, mm);
-	swap_token_mm = mm;
-	swap_token_memcg = swap_token_memcg_from_mm(mm);
-	last_aging = global_faults;
-	goto out;
-}
-
-/* Called on process exit. */
-void __put_swap_token(struct mm_struct *mm)
-{
-	spin_lock(&swap_token_lock);
-	if (likely(mm == swap_token_mm)) {
-		trace_put_swap_token(swap_token_mm);
-		swap_token_mm = NULL;
-		swap_token_memcg = NULL;
-	}
-	spin_unlock(&swap_token_lock);
-}
-
-static bool match_memcg(struct mem_cgroup *a, struct mem_cgroup *b)
-{
-	if (!a)
-		return true;
-	if (!b)
-		return true;
-	if (a == b)
-		return true;
-	return false;
-}
-
-void disable_swap_token(struct mem_cgroup *memcg)
-{
-	/* memcg reclaim don't disable unrelated mm token. */
-	if (match_memcg(memcg, swap_token_memcg)) {
-		spin_lock(&swap_token_lock);
-		if (match_memcg(memcg, swap_token_memcg)) {
-			trace_disable_swap_token(swap_token_mm);
-			swap_token_mm = NULL;
-			swap_token_memcg = NULL;
-		}
-		spin_unlock(&swap_token_lock);
-	}
-}
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 33dc256033b5..ca46080bb074 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2352,8 +2352,6 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
 
 	for (priority = DEF_PRIORITY; priority >= 0; priority--) {
 		sc->nr_scanned = 0;
-		if (!priority)
-			disable_swap_token(sc->target_mem_cgroup);
 		aborted_reclaim = shrink_zones(priority, zonelist, sc);
 
 		/*
@@ -2704,10 +2702,6 @@ loop_again:
 		unsigned long lru_pages = 0;
 		int has_under_min_watermark_zone = 0;
 
-		/* The swap token gets in the way of swapout... */
-		if (!priority)
-			disable_swap_token(NULL);
-
 		all_zones_ok = 1;
 		balanced = 0;
 
-- 
cgit v1.2.3


From 1f1d06c34f7675026326cd9f39ff91e4555cf355 Mon Sep 17 00:00:00 2001
From: David Rientjes <rientjes@google.com>
Date: Tue, 29 May 2012 15:06:23 -0700
Subject: thp, memcg: split hugepage for memcg oom on cow

On COW, a new hugepage is allocated and charged to the memcg.  If the
system is oom or the charge to the memcg fails, however, the fault
handler will return VM_FAULT_OOM which results in an oom kill.

Instead, it's possible to fallback to splitting the hugepage so that the
COW results only in an order-0 page being allocated and charged to the
memcg which has a higher liklihood to succeed.  This is expensive
because the hugepage must be split in the page fault handler, but it is
much better than unnecessarily oom killing a process.

Signed-off-by: David Rientjes <rientjes@google.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Johannes Weiner <jweiner@redhat.com>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Michal Hocko <mhocko@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/huge_memory.c |  3 +++
 mm/memory.c      | 18 +++++++++++++++---
 2 files changed, 18 insertions(+), 3 deletions(-)

(limited to 'mm/memory.c')

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index d7d7165156ca..edfeb8cb23df 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -952,6 +952,8 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		count_vm_event(THP_FAULT_FALLBACK);
 		ret = do_huge_pmd_wp_page_fallback(mm, vma, address,
 						   pmd, orig_pmd, page, haddr);
+		if (ret & VM_FAULT_OOM)
+			split_huge_page(page);
 		put_page(page);
 		goto out;
 	}
@@ -959,6 +961,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 
 	if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) {
 		put_page(new_page);
+		split_huge_page(page);
 		put_page(page);
 		ret |= VM_FAULT_OOM;
 		goto out;
diff --git a/mm/memory.c b/mm/memory.c
index 2bf9e110437c..1b7dc662bf9f 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3486,6 +3486,7 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	if (unlikely(is_vm_hugetlb_page(vma)))
 		return hugetlb_fault(mm, vma, address, flags);
 
+retry:
 	pgd = pgd_offset(mm, address);
 	pud = pud_alloc(mm, pgd, address);
 	if (!pud)
@@ -3499,13 +3500,24 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 							  pmd, flags);
 	} else {
 		pmd_t orig_pmd = *pmd;
+		int ret;
+
 		barrier();
 		if (pmd_trans_huge(orig_pmd)) {
 			if (flags & FAULT_FLAG_WRITE &&
 			    !pmd_write(orig_pmd) &&
-			    !pmd_trans_splitting(orig_pmd))
-				return do_huge_pmd_wp_page(mm, vma, address,
-							   pmd, orig_pmd);
+			    !pmd_trans_splitting(orig_pmd)) {
+				ret = do_huge_pmd_wp_page(mm, vma, address, pmd,
+							  orig_pmd);
+				/*
+				 * If COW results in an oom, the huge pmd will
+				 * have been split, so retry the fault on the
+				 * pte for a smaller charge.
+				 */
+				if (unlikely(ret & VM_FAULT_OOM))
+					goto retry;
+				return ret;
+			}
 			return 0;
 		}
 	}
-- 
cgit v1.2.3