authorChristoph Lameter <cl@linux-foundation.org>2008-08-11 08:06:25 -0700
committerPekka Enberg <penberg@cs.helsinki.fi>2008-08-13 10:16:36 +0300
commitf85619df6deb5cb2c8c23eaeea9489b32055c4a2 (patch)
treed353a27894793258a533b03eaa9b17080d580215 /mm
parenta0a66769f56f4e1e0ff9040b030534ac5b85ae24 (diff)
slub: Trigger defragmentation from memory reclaim
This patch triggers slab defragmentation from memory reclaim. The logical point for this is after slab shrinking has been performed in vmscan.c: at that point the fragmentation ratio of a slab has increased, because objects were freed via the LRU lists maintained for the various slab caches. So we call kmem_cache_defrag() from there.

shrink_slab() is called in some contexts to do global shrinking of slabs and in others to do shrinking for a particular zone. Pass the zone to shrink_slab(), so that shrink_slab() can call kmem_cache_defrag() and restrict the defragmentation to the node that is under memory pressure. The callback frequency into slab reclaim can be controlled by a new field, /proc/sys/vm/slab_defrag_limit.

Reviewed-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Christoph Lameter <cl@linux-foundation.org>
Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
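The tunable is exposed through /proc/sys/vm/, but the sysctl hookup lives in kernel/sysctl.c and therefore falls outside the mm/-limited diff shown below. A minimal sketch of how such an entry would typically be registered in the vm_table of this era, assuming CTL_UNNUMBERED and proc_dointvec (the actual hookup in the full patch may differ):

	extern int slab_defrag_limit;	/* defined in mm/vmscan.c below */

	static struct ctl_table vm_table[] = {
		/* ... existing vm sysctls ... */
		{
			.ctl_name	= CTL_UNNUMBERED,
			.procname	= "slab_defrag_limit",
			.data		= &slab_defrag_limit,
			.maxlen		= sizeof(slab_defrag_limit),
			.mode		= 0644,
			.proc_handler	= &proc_dointvec,
		},
		/* ... */
	};

With an entry like this in place, writing to vm.slab_defrag_limit would adjust the default threshold of 1000 set in vmscan.c.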
Diffstat (limited to 'mm')
-rw-r--r--  mm/vmscan.c | 64
-rw-r--r--  mm/vmstat.c |  4
2 files changed, 60 insertions(+), 8 deletions(-)
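Because the diffstat is limited to mm/, the companion header change is not visible here: the hunks below read and print zone->slab_defrag_counter (with %lu), so the full patch presumably adds a field along these lines to struct zone in include/linux/mmzone.h. The placement and comment are assumptions:

	struct zone {
		/* ... existing fields ... */

		/*
		 * Assumed field: slab objects reported freed by the
		 * shrinkers for this zone since the last defrag pass,
		 * reset when kmem_cache_defrag() is triggered.
		 */
		unsigned long	slab_defrag_counter;

		/* ... */
	};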
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 1ff1a58e7c10..b72cb363ff8e 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -150,6 +150,14 @@ void unregister_shrinker(struct shrinker *shrinker)
EXPORT_SYMBOL(unregister_shrinker);
#define SHRINK_BATCH 128
+
+/*
+ * Trigger a call into slab defrag if the sum of the returns from
+ * shrinkers cross this value.
+ */
+int slab_defrag_limit = 1000;
+int slab_defrag_counter;
+
/*
* Call the shrink functions to age shrinkable caches
*
@@ -167,10 +175,18 @@ EXPORT_SYMBOL(unregister_shrinker);
* are eligible for the caller's allocation attempt. It is used for balancing
* slab reclaim versus page reclaim.
*
+ * zone is the zone for which we are shrinking the slabs. If the intent
+ * is to do a global shrink then zone may be NULL. Specification of a
+ * zone is currently only used to limit slab defragmentation to a NUMA node.
+ * The performance of shrink_slab would be better (in particular under
+ * NUMA) if it could be targeted as a whole to the zone that is under
+ * memory pressure, but the VFS infrastructure does not allow that at
+ * the present time.
+ *
* Returns the number of slab objects which we shrunk.
*/
unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
- unsigned long lru_pages)
+ unsigned long lru_pages, struct zone *zone)
{
struct shrinker *shrinker;
unsigned long ret = 0;
@@ -227,6 +243,39 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
shrinker->nr += total_scan;
}
up_read(&shrinker_rwsem);
+
+
+ /* Avoid dirtying cachelines */
+ if (!ret)
+ return 0;
+
+ /*
+ * "ret" doesnt really contain the freed object count. The shrinkers
+ * fake it. Gotta go with what we are getting though.
+ *
+ * Handling of the defrag_counter is also racy. If we get the
+ * wrong counts then we may unnecessarily do a defrag pass or defer
+ * one. "ret" is already faked. So this is just increasing
+ * the already existing fuzziness to get some notion as to when
+ * to initiate slab defrag which will hopefully be okay.
+ */
+ if (zone) {
+ /* balance_pgdat is running on a zone, so only scan one node */
+ zone->slab_defrag_counter += ret;
+ if (zone->slab_defrag_counter > slab_defrag_limit &&
+ (gfp_mask & __GFP_FS)) {
+ zone->slab_defrag_counter = 0;
+ kmem_cache_defrag(zone_to_nid(zone));
+ }
+ } else {
+ /* Direct (and thus global) reclaim. Scan all nodes */
+ slab_defrag_counter += ret;
+ if (slab_defrag_counter > slab_defrag_limit &&
+ (gfp_mask & __GFP_FS)) {
+ slab_defrag_counter = 0;
+ kmem_cache_defrag(-1);
+ }
+ }
return ret;
}
@@ -1379,7 +1428,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
* over limit cgroups
*/
if (scan_global_lru(sc)) {
- shrink_slab(sc->nr_scanned, sc->gfp_mask, lru_pages);
+ shrink_slab(sc->nr_scanned, sc->gfp_mask, lru_pages, NULL);
if (reclaim_state) {
nr_reclaimed += reclaim_state->reclaimed_slab;
reclaim_state->reclaimed_slab = 0;
@@ -1606,7 +1655,7 @@ loop_again:
nr_reclaimed += shrink_zone(priority, zone, &sc);
reclaim_state->reclaimed_slab = 0;
nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL,
- lru_pages);
+ lru_pages, zone);
nr_reclaimed += reclaim_state->reclaimed_slab;
total_scanned += sc.nr_scanned;
if (zone_is_all_unreclaimable(zone))
@@ -1845,7 +1894,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
/* If slab caches are huge, it's better to hit them first */
while (nr_slab >= lru_pages) {
reclaim_state.reclaimed_slab = 0;
- shrink_slab(nr_pages, sc.gfp_mask, lru_pages);
+ shrink_slab(nr_pages, sc.gfp_mask, lru_pages, NULL);
if (!reclaim_state.reclaimed_slab)
break;
@@ -1883,7 +1932,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
reclaim_state.reclaimed_slab = 0;
shrink_slab(sc.nr_scanned, sc.gfp_mask,
- count_lru_pages());
+ count_lru_pages(), NULL);
ret += reclaim_state.reclaimed_slab;
if (ret >= nr_pages)
goto out;
@@ -1900,7 +1949,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
if (!ret) {
do {
reclaim_state.reclaimed_slab = 0;
- shrink_slab(nr_pages, sc.gfp_mask, count_lru_pages());
+ shrink_slab(nr_pages, sc.gfp_mask, count_lru_pages(), NULL);
ret += reclaim_state.reclaimed_slab;
} while (ret < nr_pages && reclaim_state.reclaimed_slab > 0);
}
@@ -2062,7 +2111,8 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
* Note that shrink_slab will free memory on all zones and may
* take a long time.
*/
- while (shrink_slab(sc.nr_scanned, gfp_mask, order) &&
+ while (shrink_slab(sc.nr_scanned, gfp_mask, order,
+ zone) &&
zone_page_state(zone, NR_SLAB_RECLAIMABLE) >
slab_reclaimable - nr_pages)
;
diff --git a/mm/vmstat.c b/mm/vmstat.c
index b0d08e667ece..6e9a30602152 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -714,10 +714,12 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
#endif
}
seq_printf(m,
+ "\n slab_defrag_count: %lu"
"\n all_unreclaimable: %u"
"\n prev_priority: %i"
"\n start_pfn: %lu",
- zone_is_all_unreclaimable(zone),
+ zone->slab_defrag_counter,
+ zone_is_all_unreclaimable(zone),
zone->prev_priority,
zone->zone_start_pfn);
seq_putc(m, '\n');
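Both call paths above funnel into kmem_cache_defrag(), which the earlier SLUB patches in this series provide; only its callers appear in this commit. The declaration is assumed to look roughly like the following, with a node id of -1 (as in the direct-reclaim branch) requesting a pass over all nodes; the return semantics are not shown in this commit and are a guess:

	/* include/linux/slab.h (assumed, from earlier in the series):
	 * defragment the cache's partial slabs on the given NUMA node,
	 * or on all nodes when node == -1.
	 */
	int kmem_cache_defrag(int node);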