From d7f861b9c43aadbe384ab1382d2e76750bedc91e Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Wed, 14 Feb 2024 21:44:33 +0100
Subject: mm/mmu_gather: add __tlb_remove_folio_pages()

Add __tlb_remove_folio_pages(), which will remove multiple consecutive
pages that belong to the same large folio, instead of only a single page.
We'll be using this function when optimizing unmapping/zapping of large
folios that are mapped by PTEs.

We're using the remaining spare bit in an encoded_page to indicate that
the next enoced page in an array contains actually shifted "nr_pages".
Teach swap/freeing code about putting multiple folio references, and
delayed rmap handling to remove page ranges of a folio.

This extension allows for still gathering almost as many small folios as
we used to (-1, because we have to prepare for a possibly bigger next
entry), but still allows for gathering consecutive pages that belong to
the same large folio.

Note that we don't pass the folio pointer, because it is not required for
now.  Further, we don't support page_size != PAGE_SIZE, it won't be
required for simple PTE batching.

We have to provide a separate s390 implementation, but it's fairly
straight forward.

Another, more invasive and likely more expensive, approach would be to use
folio+range or a PFN range instead of page+nr_pages.  But, we should do
that consistently for the whole mmu_gather.  For now, let's keep it simple
and add "nr_pages" only.

Note that it is now possible to gather significantly more pages: In the
past, we were able to gather ~10000 pages, now we can also gather ~5000
folio fragments that span multiple pages.  A folio fragment on x86-64 can
span up to 512 pages (2 MiB THP) and on arm64 with 64k in theory 8192
pages (512 MiB THP).  Gathering more memory is not considered something we
should worry about, especially because these are already corner cases.

While we can gather more total memory, we won't free more folio fragments.
As long as page freeing time primarily only depends on the number of
involved folios, there is no effective change for !preempt configurations.
However, we'll adjust tlb_batch_pages_flush() separately to handle corner
cases where page freeing time grows proportionally with the actual memory
size.

Link: https://lkml.kernel.org/r/20240214204435.167852-9-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Ryan Roberts <ryan.roberts@arm.com>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Christian Borntraeger <borntraeger@linux.ibm.com>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Michal Hocko <mhocko@suse.com>
Cc: "Naveen N. Rao" <naveen.n.rao@linux.ibm.com>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Sven Schnelle <svens@linux.ibm.com>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Will Deacon <will@kernel.org>
Cc: Yin Fengwei <fengwei.yin@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/swap_state.c | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

(limited to 'mm/swap_state.c')

diff --git a/mm/swap_state.c b/mm/swap_state.c
index 7255c01a1e4e..2f540748f7c0 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -311,8 +311,19 @@ void free_page_and_swap_cache(struct page *page)
 void free_pages_and_swap_cache(struct encoded_page **pages, int nr)
 {
 	lru_add_drain();
-	for (int i = 0; i < nr; i++)
-		free_swap_cache(encoded_page_ptr(pages[i]));
+	for (int i = 0; i < nr; i++) {
+		struct page *page = encoded_page_ptr(pages[i]);
+
+		/*
+		 * Skip over the "nr_pages" entry. It's sufficient to call
+		 * free_swap_cache() only once per folio.
+		 */
+		if (unlikely(encoded_page_flags(pages[i]) &
+			     ENCODED_PAGE_BIT_NR_PAGES_NEXT))
+			i++;
+
+		free_swap_cache(page);
+	}
 	release_pages(pages, nr);
 }
 
-- 
cgit v1.2.3


From 4907e80b76af004b6af42f0d4131e23ac73bc07c Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Tue, 27 Feb 2024 17:42:50 +0000
Subject: mm: convert free_pages_and_swap_cache() to use folios_put()

Process the pages in batch-sized quantities instead of all-at-once.

Link: https://lkml.kernel.org/r/20240227174254.710559-17-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/swap_state.c | 21 +++++++++++++--------
 1 file changed, 13 insertions(+), 8 deletions(-)

(limited to 'mm/swap_state.c')

diff --git a/mm/swap_state.c b/mm/swap_state.c
index 2f540748f7c0..96b5c585f047 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -15,6 +15,7 @@
 #include <linux/swapops.h>
 #include <linux/init.h>
 #include <linux/pagemap.h>
+#include <linux/pagevec.h>
 #include <linux/backing-dev.h>
 #include <linux/blkdev.h>
 #include <linux/migrate.h>
@@ -310,21 +311,25 @@ void free_page_and_swap_cache(struct page *page)
  */
 void free_pages_and_swap_cache(struct encoded_page **pages, int nr)
 {
+	struct folio_batch folios;
+	unsigned int refs[PAGEVEC_SIZE];
+
 	lru_add_drain();
+	folio_batch_init(&folios);
 	for (int i = 0; i < nr; i++) {
-		struct page *page = encoded_page_ptr(pages[i]);
+		struct folio *folio = page_folio(encoded_page_ptr(pages[i]));
 
-		/*
-		 * Skip over the "nr_pages" entry. It's sufficient to call
-		 * free_swap_cache() only once per folio.
-		 */
+		free_swap_cache(&folio->page);
+		refs[folios.nr] = 1;
 		if (unlikely(encoded_page_flags(pages[i]) &
 			     ENCODED_PAGE_BIT_NR_PAGES_NEXT))
-			i++;
+			refs[folios.nr] = encoded_nr_pages(pages[++i]);
 
-		free_swap_cache(page);
+		if (folio_batch_add(&folios, folio) == 0)
+			folios_put_refs(&folios, refs);
 	}
-	release_pages(pages, nr);
+	if (folios.nr)
+		folios_put_refs(&folios, refs);
 }
 
 static inline bool swap_use_vma_readahead(void)
-- 
cgit v1.2.3


From 63b774993dd02b17127cb404b7362fc436632995 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Tue, 27 Feb 2024 17:42:52 +0000
Subject: mm: convert free_swap_cache() to take a folio

All but one caller already has a folio, so convert
free_page_and_swap_cache() to have a folio and remove the call to
page_folio().

Link: https://lkml.kernel.org/r/20240227174254.710559-19-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Ryan Roberts <ryan.roberts@arm.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Cc: Mel Gorman <mgorman@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/swap.h |  8 ++++----
 mm/khugepaged.c      |  2 +-
 mm/memory.c          |  2 +-
 mm/swap_state.c      | 12 ++++++------
 4 files changed, 12 insertions(+), 12 deletions(-)

(limited to 'mm/swap_state.c')

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 8d28f6091a32..1ad6f63d1a52 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -447,9 +447,9 @@ static inline unsigned long total_swapcache_pages(void)
 	return global_node_page_state(NR_SWAPCACHE);
 }
 
-extern void free_swap_cache(struct page *page);
-extern void free_page_and_swap_cache(struct page *);
-extern void free_pages_and_swap_cache(struct encoded_page **, int);
+void free_swap_cache(struct folio *folio);
+void free_page_and_swap_cache(struct page *);
+void free_pages_and_swap_cache(struct encoded_page **, int);
 /* linux/mm/swapfile.c */
 extern atomic_long_t nr_swap_pages;
 extern long total_swap_pages;
@@ -531,7 +531,7 @@ static inline void put_swap_device(struct swap_info_struct *si)
 /* used to sanity check ptes in zap_pte_range when CONFIG_SWAP=0 */
 #define free_swap_and_cache(e) is_pfn_swap_entry(e)
 
-static inline void free_swap_cache(struct page *page)
+static inline void free_swap_cache(struct folio *folio)
 {
 }
 
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index b9223e803262..38830174608f 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -731,7 +731,7 @@ static void __collapse_huge_page_copy_succeeded(pte_t *pte,
 		node_stat_sub_folio(src, NR_ISOLATED_ANON +
 				folio_is_file_lru(src));
 		folio_unlock(src);
-		free_swap_cache(&src->page);
+		free_swap_cache(src);
 		folio_putback_lru(src);
 	}
 }
diff --git a/mm/memory.c b/mm/memory.c
index 1c45b6a42a1b..75e0abda8642 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3452,7 +3452,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
 		folio_put(new_folio);
 	if (old_folio) {
 		if (page_copied)
-			free_swap_cache(&old_folio->page);
+			free_swap_cache(old_folio);
 		folio_put(old_folio);
 	}
 
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 96b5c585f047..bfc7e8c58a6d 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -283,10 +283,8 @@ void clear_shadow_from_swap_cache(int type, unsigned long begin,
  * folio_free_swap() _with_ the lock.
  * 					- Marcelo
  */
-void free_swap_cache(struct page *page)
+void free_swap_cache(struct folio *folio)
 {
-	struct folio *folio = page_folio(page);
-
 	if (folio_test_swapcache(folio) && !folio_mapped(folio) &&
 	    folio_trylock(folio)) {
 		folio_free_swap(folio);
@@ -300,9 +298,11 @@ void free_swap_cache(struct page *page)
  */
 void free_page_and_swap_cache(struct page *page)
 {
-	free_swap_cache(page);
+	struct folio *folio = page_folio(page);
+
+	free_swap_cache(folio);
 	if (!is_huge_zero_page(page))
-		put_page(page);
+		folio_put(folio);
 }
 
 /*
@@ -319,7 +319,7 @@ void free_pages_and_swap_cache(struct encoded_page **pages, int nr)
 	for (int i = 0; i < nr; i++) {
 		struct folio *folio = page_folio(encoded_page_ptr(pages[i]));
 
-		free_swap_cache(&folio->page);
+		free_swap_cache(folio);
 		refs[folios.nr] = 1;
 		if (unlikely(encoded_page_flags(pages[i]) &
 			     ENCODED_PAGE_BIT_NR_PAGES_NEXT))
-- 
cgit v1.2.3