From 9b83a6a8523a8a96b6353174b193c5c93e16c6c3 Mon Sep 17 00:00:00 2001
From: Adrian Bunk
Date: Wed, 28 Feb 2007 20:11:03 -0800
Subject: [PATCH] mm/{,tiny-}shmem.c cleanups

shmem_{nopage,mmap} are no longer used in ipc/shm.c

Signed-off-by: Adrian Bunk
Cc: "Eric W. Biederman"
Cc: Hugh Dickins
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/shmem.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'mm/shmem.c')

diff --git a/mm/shmem.c b/mm/shmem.c
index 882053031aa0..fcb07882c8e0 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1228,7 +1228,8 @@ failed:
         return error;
 }
 
-struct page *shmem_nopage(struct vm_area_struct *vma, unsigned long address, int *type)
+static struct page *shmem_nopage(struct vm_area_struct *vma,
+                                unsigned long address, int *type)
 {
         struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
         struct page *page = NULL;
@@ -1335,7 +1336,7 @@ out_nomem:
         return retval;
 }
 
-int shmem_mmap(struct file *file, struct vm_area_struct *vma)
+static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
 {
         file_accessed(file);
         vma->vm_ops = &shmem_vm_ops;
--
cgit v1.2.3

From 759b9775c25f5e69aaea8a75c3914019e2dc5539 Mon Sep 17 00:00:00 2001
From: Hugh Dickins
Date: Mon, 5 Mar 2007 00:30:28 -0800
Subject: [PATCH] shmem and simple const super_operations

shmem's super_operations were missed from the recent const-ification;
and simple_fill_super()'s, which can share with get_sb_pseudo()'s.

Signed-off-by: Hugh Dickins
Acked-by: Josef 'Jeff' Sipek
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 fs/libfs.c | 10 ++++++----
 mm/shmem.c |  4 ++--
 2 files changed, 8 insertions(+), 6 deletions(-)

(limited to 'mm/shmem.c')

diff --git a/fs/libfs.c b/fs/libfs.c
index cf79196535ec..d93842d3c0a0 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -190,6 +190,10 @@ const struct inode_operations simple_dir_inode_operations = {
         .lookup         = simple_lookup,
 };
 
+static const struct super_operations simple_super_operations = {
+        .statfs         = simple_statfs,
+};
+
 /*
  * Common helper for pseudo-filesystems (sockfs, pipefs, bdev - stuff that
  * will never be mountable)
  */
@@ -199,7 +203,6 @@ int get_sb_pseudo(struct file_system_type *fs_type, char *name,
         struct vfsmount *mnt)
 {
         struct super_block *s = sget(fs_type, NULL, set_anon_super, NULL);
-        static const struct super_operations default_ops = {.statfs = simple_statfs};
         struct dentry *dentry;
         struct inode *root;
         struct qstr d_name = {.name = name, .len = strlen(name)};
@@ -212,7 +215,7 @@ int get_sb_pseudo(struct file_system_type *fs_type, char *name,
         s->s_blocksize = 1024;
         s->s_blocksize_bits = 10;
         s->s_magic = magic;
-        s->s_op = ops ? ops : &default_ops;
+        s->s_op = ops ? ops : &simple_super_operations;
         s->s_time_gran = 1;
         root = new_inode(s);
         if (!root)
@@ -359,7 +362,6 @@ int simple_commit_write(struct file *file, struct page *page,
 
 int simple_fill_super(struct super_block *s, int magic, struct tree_descr *files)
 {
-        static struct super_operations s_ops = {.statfs = simple_statfs};
         struct inode *inode;
         struct dentry *root;
         struct dentry *dentry;
@@ -368,7 +370,7 @@ int simple_fill_super(struct super_block *s, int magic, struct tree_descr *files)
         s->s_blocksize = PAGE_CACHE_SIZE;
         s->s_blocksize_bits = PAGE_CACHE_SHIFT;
         s->s_magic = magic;
-        s->s_op = &s_ops;
+        s->s_op = &simple_super_operations;
         s->s_time_gran = 1;
 
         inode = new_inode(s);
diff --git a/mm/shmem.c b/mm/shmem.c
index fcb07882c8e0..b8c429a2d271 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -175,7 +175,7 @@ static inline void shmem_unacct_blocks(unsigned long flags, long pages)
         vm_unacct_memory(pages * VM_ACCT(PAGE_CACHE_SIZE));
 }
 
-static struct super_operations shmem_ops;
+static const struct super_operations shmem_ops;
 static const struct address_space_operations shmem_aops;
 static const struct file_operations shmem_file_operations;
 static const struct inode_operations shmem_inode_operations;
@@ -2383,7 +2383,7 @@ static const struct inode_operations shmem_special_inode_operations = {
 #endif
 };
 
-static struct super_operations shmem_ops = {
+static const struct super_operations shmem_ops = {
         .alloc_inode    = shmem_alloc_inode,
         .destroy_inode  = shmem_destroy_inode,
 #ifdef CONFIG_TMPFS
--
cgit v1.2.3

From a2646d1e6c8d2239d8054a7d342eb9775a1d273a Mon Sep 17 00:00:00 2001
From: Hugh Dickins
Date: Thu, 29 Mar 2007 01:20:35 -0700
Subject: [PATCH] holepunch: fix shmem_truncate_range punching too far

Miklos Szeredi observes BUG_ON(!entry) in shmem_writepage() triggered
in rare circumstances, because shmem_truncate_range() erroneously
removes partially truncated directory pages at the end of the range:
later reclaim on pages pointing to these removed directories triggers
the BUG.  Indeed, and it can also cause data loss beyond the hole.

Fix this as in the patch proposed by Miklos, but distinguish between
"limit" (how far we need to search: ignore truncation's next_index
optimization in the holepunch case - if there are races it's more
consistent to act on the whole range specified) and "upper_limit" (how
far we can free directory pages: generally we must be careful to keep
partially punched pages, but can relax at end of file - i_size being
held stable by i_mutex).

Signed-off-by: Hugh Dickins
Cc: Miklos Szeredi
Cc: Badari Pulavarty
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/shmem.c | 32 +++++++++++++++++++++-----------
 1 file changed, 21 insertions(+), 11 deletions(-)

(limited to 'mm/shmem.c')

diff --git a/mm/shmem.c b/mm/shmem.c
index b8c429a2d271..1077b1d903d2 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -481,7 +481,8 @@ static void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end)
         long nr_swaps_freed = 0;
         int offset;
         int freed;
-        int punch_hole = 0;
+        int punch_hole;
+        unsigned long upper_limit;
 
         inode->i_ctime = inode->i_mtime = CURRENT_TIME;
         idx = (start + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
@@ -492,11 +493,18 @@ static void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end)
         info->flags |= SHMEM_TRUNCATE;
         if (likely(end == (loff_t) -1)) {
                 limit = info->next_index;
+                upper_limit = SHMEM_MAX_INDEX;
                 info->next_index = idx;
+                punch_hole = 0;
         } else {
-                limit = (end + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
-                if (limit > info->next_index)
-                        limit = info->next_index;
+                if (end + 1 >= inode->i_size) { /* we may free a little more */
+                        limit = (inode->i_size + PAGE_CACHE_SIZE - 1) >>
+                                                        PAGE_CACHE_SHIFT;
+                        upper_limit = SHMEM_MAX_INDEX;
+                } else {
+                        limit = (end + 1) >> PAGE_CACHE_SHIFT;
+                        upper_limit = limit;
+                }
                 punch_hole = 1;
         }
 
@@ -520,10 +528,10 @@ static void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end)
          * If there are no indirect blocks or we are punching a hole
          * below indirect blocks, nothing to be done.
          */
-        if (!topdir || (punch_hole && (limit <= SHMEM_NR_DIRECT)))
+        if (!topdir || limit <= SHMEM_NR_DIRECT)
                 goto done2;
 
-        BUG_ON(limit <= SHMEM_NR_DIRECT);
+        upper_limit -= SHMEM_NR_DIRECT;
         limit -= SHMEM_NR_DIRECT;
         idx = (idx > SHMEM_NR_DIRECT)? (idx - SHMEM_NR_DIRECT): 0;
         offset = idx % ENTRIES_PER_PAGE;
@@ -543,7 +551,7 @@ static void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end)
                 if (*dir) {
                         diroff = ((idx - ENTRIES_PER_PAGEPAGE/2) %
                                 ENTRIES_PER_PAGEPAGE) / ENTRIES_PER_PAGE;
-                        if (!diroff && !offset) {
+                        if (!diroff && !offset && upper_limit >= stage) {
                                 *dir = NULL;
                                 nr_pages_to_free++;
                                 list_add(&middir->lru, &pages_to_free);
@@ -570,9 +578,11 @@ static void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end)
                         }
                         stage = idx + ENTRIES_PER_PAGEPAGE;
                         middir = *dir;
-                        *dir = NULL;
-                        nr_pages_to_free++;
-                        list_add(&middir->lru, &pages_to_free);
+                        if (upper_limit >= stage) {
+                                *dir = NULL;
+                                nr_pages_to_free++;
+                                list_add(&middir->lru, &pages_to_free);
+                        }
                         shmem_dir_unmap(dir);
                         cond_resched();
                         dir = shmem_dir_map(middir);
@@ -598,7 +608,7 @@ static void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end)
                 }
                 if (offset)
                         offset = 0;
-                else if (subdir && !page_private(subdir)) {
+                else if (subdir && upper_limit - idx >= ENTRIES_PER_PAGE) {
                         dir[diroff] = NULL;
                         nr_pages_to_free++;
                         list_add(&subdir->lru, &pages_to_free);
--
cgit v1.2.3
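
To make the "limit"/"upper_limit" distinction above concrete, here is a small
worked example (illustrative only: it assumes PAGE_CACHE_SIZE is 4096, and the
byte offsets are invented, not taken from the patch).  Punching a hole over
bytes 6000..19000 of a 100000-byte tmpfs file gives:

        idx   = (6000 + 4096 - 1) >> 12 = 2   /* first page wholly inside the hole   */
        limit = (19000 + 1) >> 12       = 4   /* scan swap entries for pages 2 and 3 */

Since end + 1 (19001) is still below i_size (100000), upper_limit = limit = 4:
pages 1 and 4 straddle the misaligned edges of the hole and are kept, and a
directory page may only be freed if every entry it covers lies below index 4.
In the truncation case (end == -1), upper_limit is SHMEM_MAX_INDEX instead, so
whole directory pages beyond the new end of file may always be freed.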

From 1ae7000630e3c05b6f7e3dfc76472f1bca6c1788 Mon Sep 17 00:00:00 2001
From: Hugh Dickins
Date: Thu, 29 Mar 2007 01:20:36 -0700
Subject: [PATCH] holepunch: fix shmem_truncate_range punch locking

Miklos Szeredi observes that during truncation of shmem page directories,
info->lock is released to improve latency (after lowering i_size and
next_index to exclude races); but this is quite wrong for holepunching,
which receives no such protection from i_size or next_index, and is left
vulnerable to races with shmem_unuse, shmem_getpage and shmem_writepage.

Hold info->lock throughout when holepunching?  No, any user could prevent
rescheduling for far too long.  Instead take info->lock just when needed:
in shmem_free_swp when removing the swap entries, and whenever removing
a directory page from the level above.  But so long as we remove before
scanning, we can safely skip taking the lock at the lower levels, except
at misaligned start and end of the hole.

Signed-off-by: Hugh Dickins
Cc: Miklos Szeredi
Cc: Badari Pulavarty
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/shmem.c | 96 +++++++++++++++++++++++++++++++++++++++++++++++---------------
 1 file changed, 73 insertions(+), 23 deletions(-)

(limited to 'mm/shmem.c')

diff --git a/mm/shmem.c b/mm/shmem.c
index 1077b1d903d2..578eceafba4a 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -402,26 +402,38 @@ static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long
 /*
  * shmem_free_swp - free some swap entries in a directory
  *
- * @dir:   pointer to the directory
- * @edir:  pointer after last entry of the directory
+ * @dir:        pointer to the directory
+ * @edir:       pointer after last entry of the directory
+ * @punch_lock: pointer to spinlock when needed for the holepunch case
  */
-static int shmem_free_swp(swp_entry_t *dir, swp_entry_t *edir)
+static int shmem_free_swp(swp_entry_t *dir, swp_entry_t *edir,
+                                                spinlock_t *punch_lock)
 {
+        spinlock_t *punch_unlock = NULL;
         swp_entry_t *ptr;
         int freed = 0;
 
         for (ptr = dir; ptr < edir; ptr++) {
                 if (ptr->val) {
+                        if (unlikely(punch_lock)) {
+                                punch_unlock = punch_lock;
+                                punch_lock = NULL;
+                                spin_lock(punch_unlock);
+                                if (!ptr->val)
+                                        continue;
+                        }
                         free_swap_and_cache(*ptr);
                         *ptr = (swp_entry_t){0};
                         freed++;
                 }
         }
+        if (punch_unlock)
+                spin_unlock(punch_unlock);
         return freed;
 }
 
-static int shmem_map_and_free_swp(struct page *subdir,
-                int offset, int limit, struct page ***dir)
+static int shmem_map_and_free_swp(struct page *subdir, int offset,
+                int limit, struct page ***dir, spinlock_t *punch_lock)
 {
         swp_entry_t *ptr;
         int freed = 0;
@@ -431,7 +443,8 @@ static int shmem_map_and_free_swp(struct page *subdir,
                 int size = limit - offset;
                 if (size > LATENCY_LIMIT)
                         size = LATENCY_LIMIT;
-                freed += shmem_free_swp(ptr+offset, ptr+offset+size);
+                freed += shmem_free_swp(ptr+offset, ptr+offset+size,
+                                                        punch_lock);
                 if (need_resched()) {
                         shmem_swp_unmap(ptr);
                         if (*dir) {
@@ -482,6 +495,8 @@ static void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end)
         int offset;
         int freed;
         int punch_hole;
+        spinlock_t *needs_lock;
+        spinlock_t *punch_lock;
         unsigned long upper_limit;
 
         inode->i_ctime = inode->i_mtime = CURRENT_TIME;
@@ -495,6 +510,7 @@ static void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end)
                 limit = info->next_index;
                 upper_limit = SHMEM_MAX_INDEX;
                 info->next_index = idx;
+                needs_lock = NULL;
                 punch_hole = 0;
         } else {
                 if (end + 1 >= inode->i_size) { /* we may free a little more */
@@ -505,6 +521,7 @@ static void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end)
                         limit = (end + 1) >> PAGE_CACHE_SHIFT;
                         upper_limit = limit;
                 }
+                needs_lock = &info->lock;
                 punch_hole = 1;
         }
 
@@ -521,7 +538,7 @@ static void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end)
                 size = limit;
                 if (size > SHMEM_NR_DIRECT)
                         size = SHMEM_NR_DIRECT;
-                nr_swaps_freed = shmem_free_swp(ptr+idx, ptr+size);
+                nr_swaps_freed = shmem_free_swp(ptr+idx, ptr+size, needs_lock);
         }
 
         /*
@@ -531,6 +548,19 @@ static void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end)
         if (!topdir || limit <= SHMEM_NR_DIRECT)
                 goto done2;
 
+        /*
+         * The truncation case has already dropped info->lock, and we're safe
+         * because i_size and next_index have already been lowered, preventing
+         * access beyond.  But in the punch_hole case, we still need to take
+         * the lock when updating the swap directory, because there might be
+         * racing accesses by shmem_getpage(SGP_CACHE), shmem_unuse_inode or
+         * shmem_writepage.  However, whenever we find we can remove a whole
+         * directory page (not at the misaligned start or end of the range),
+         * we first NULLify its pointer in the level above, and then have no
+         * need to take the lock when updating its contents: needs_lock and
+         * punch_lock (either pointing to info->lock or NULL) manage this.
+         */
+
         upper_limit -= SHMEM_NR_DIRECT;
         limit -= SHMEM_NR_DIRECT;
         idx = (idx > SHMEM_NR_DIRECT)? (idx - SHMEM_NR_DIRECT): 0;
@@ -552,7 +582,13 @@ static void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end)
                         diroff = ((idx - ENTRIES_PER_PAGEPAGE/2) %
                                 ENTRIES_PER_PAGEPAGE) / ENTRIES_PER_PAGE;
                         if (!diroff && !offset && upper_limit >= stage) {
-                                *dir = NULL;
+                                if (needs_lock) {
+                                        spin_lock(needs_lock);
+                                        *dir = NULL;
+                                        spin_unlock(needs_lock);
+                                        needs_lock = NULL;
+                                } else
+                                        *dir = NULL;
                                 nr_pages_to_free++;
                                 list_add(&middir->lru, &pages_to_free);
                         }
@@ -578,8 +614,16 @@ static void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end)
                         }
                         stage = idx + ENTRIES_PER_PAGEPAGE;
                         middir = *dir;
+                        if (punch_hole)
+                                needs_lock = &info->lock;
                         if (upper_limit >= stage) {
-                                *dir = NULL;
+                                if (needs_lock) {
+                                        spin_lock(needs_lock);
+                                        *dir = NULL;
+                                        spin_unlock(needs_lock);
+                                        needs_lock = NULL;
+                                } else
+                                        *dir = NULL;
                                 nr_pages_to_free++;
                                 list_add(&middir->lru, &pages_to_free);
                         }
@@ -588,31 +632,37 @@ static void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end)
                         dir = shmem_dir_map(middir);
                         diroff = 0;
                 }
+                punch_lock = needs_lock;
                 subdir = dir[diroff];
-                if (subdir && page_private(subdir)) {
+                if (subdir && !offset && upper_limit-idx >= ENTRIES_PER_PAGE) {
+                        if (needs_lock) {
+                                spin_lock(needs_lock);
+                                dir[diroff] = NULL;
+                                spin_unlock(needs_lock);
+                                punch_lock = NULL;
+                        } else
+                                dir[diroff] = NULL;
+                        nr_pages_to_free++;
+                        list_add(&subdir->lru, &pages_to_free);
+                }
+                if (subdir && page_private(subdir) /* has swap entries */) {
                         size = limit - idx;
                         if (size > ENTRIES_PER_PAGE)
                                 size = ENTRIES_PER_PAGE;
                         freed = shmem_map_and_free_swp(subdir,
-                                                offset, size, &dir);
+                                        offset, size, &dir, punch_lock);
                         if (!dir)
                                 dir = shmem_dir_map(middir);
                         nr_swaps_freed += freed;
-                        if (offset)
+                        if (offset || punch_lock) {
                                 spin_lock(&info->lock);
-                        set_page_private(subdir, page_private(subdir) - freed);
-                        if (offset)
+                                set_page_private(subdir,
+                                        page_private(subdir) - freed);
                                 spin_unlock(&info->lock);
-                        if (!punch_hole)
-                                BUG_ON(page_private(subdir) > offset);
-                }
-                if (offset)
-                        offset = 0;
-                else if (subdir && upper_limit - idx >= ENTRIES_PER_PAGE) {
-                        dir[diroff] = NULL;
-                        nr_pages_to_free++;
-                        list_add(&subdir->lru, &pages_to_free);
+                        } else
+                                BUG_ON(page_private(subdir) != freed);
                 }
+                offset = 0;
         }
 done1:
         shmem_dir_unmap(dir);
--
cgit v1.2.3
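
The ordering the message above relies on - detach a whole directory page from
its parent slot under info->lock, then free its contents without the lock -
can be seen in isolation in the following sketch (illustrative only; the
helper name is hypothetical and does not exist in mm/shmem.c):

        /*
         * Illustrative sketch, not a helper in mm/shmem.c: once the parent
         * slot is cleared under the lock, no racing shmem_getpage/
         * shmem_unuse/shmem_writepage can reach the detached directory page,
         * so its swap entries can then be freed without holding the lock.
         */
        static struct page *shmem_detach_subdir(struct page **slot, spinlock_t *lock)
        {
                struct page *subdir;

                spin_lock(lock);
                subdir = *slot;
                *slot = NULL;
                spin_unlock(lock);
                return subdir;
        }

Only at the misaligned edges of the hole, where a directory page must survive
because it also covers entries outside the range, does shmem_free_swp() take
the lock itself (the punch_lock argument) and recheck each entry under it.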

From 16a100190d39592d1d56ff5a0b978b20288c3427 Mon Sep 17 00:00:00 2001
From: Hugh Dickins
Date: Thu, 29 Mar 2007 01:20:37 -0700
Subject: [PATCH] holepunch: fix disconnected pages after second truncate

shmem_truncate_range has its own truncate_inode_pages_range, to free any
pages racily instantiated while it was in progress: a SHMEM_PAGEIN flag
is set when this might have happened.

But holepunching gets no chance to clear that flag at the start of
vmtruncate_range, so it's always set (unless a truncate came just
before), so holepunch almost always does this second
truncate_inode_pages_range.

shmem holepunch has unlikely swap<->file races hereabouts whatever we do
(without a fuller rework than is fit for this release): I was going to
skip the second truncate in the punch_hole case, but Miklos points out
that would make holepunch correctness more vulnerable to swapoff.  So
keep the second truncate, but follow it by an unmap_mapping_range to
eliminate the disconnected pages (freed from pagecache while still
mapped in userspace) that it might have left behind.

Signed-off-by: Hugh Dickins
Cc: Miklos Szeredi
Cc: Badari Pulavarty
Cc: Nick Piggin
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/shmem.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'mm/shmem.c')

diff --git a/mm/shmem.c b/mm/shmem.c
index 578eceafba4a..b2a35ebf071a 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -674,8 +674,16 @@ done2:
                  * generic_delete_inode did it, before we lowered next_index.
                  * Also, though shmem_getpage checks i_size before adding to
                  * cache, no recheck after: so fix the narrow window there too.
+                 *
+                 * Recalling truncate_inode_pages_range and unmap_mapping_range
+                 * every time for punch_hole (which never got a chance to clear
+                 * SHMEM_PAGEIN at the start of vmtruncate_range) is expensive,
+                 * yet hardly ever necessary: try to optimize them out later.
                  */
                 truncate_inode_pages_range(inode->i_mapping, start, end);
+                if (punch_hole)
+                        unmap_mapping_range(inode->i_mapping, start,
+                                                end - start, 1);
         }
 
         spin_lock(&info->lock);
--
cgit v1.2.3
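
For context, the holepunch path that this series repairs is reached from
userspace via madvise(MADV_REMOVE) on a mapped tmpfs file, which goes through
vmtruncate_range() and hence shmem_truncate_range().  A minimal sketch follows
(assumptions: a tmpfs mount at /dev/shm and a 2.6.16 or later kernel where
MADV_REMOVE is available; the file name and sizes are arbitrary):

        #define _GNU_SOURCE
        #include <fcntl.h>
        #include <stdio.h>
        #include <string.h>
        #include <sys/mman.h>
        #include <unistd.h>

        int main(void)
        {
                size_t len = 16 * 4096;
                char *p;
                int fd;

                fd = open("/dev/shm/holepunch-demo", O_RDWR | O_CREAT | O_TRUNC, 0600);
                if (fd < 0 || ftruncate(fd, len) < 0)
                        return 1;

                p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
                if (p == MAP_FAILED)
                        return 1;

                memset(p, 0xaa, len);   /* instantiate all sixteen pages */

                /* Punch out pages 4..7: tmpfs frees them in shmem_truncate_range(). */
                if (madvise(p + 4 * 4096, 4 * 4096, MADV_REMOVE) != 0)
                        perror("madvise(MADV_REMOVE)");

                munmap(p, len);
                close(fd);
                unlink("/dev/shm/holepunch-demo");
                return 0;
        }

On much later kernels, fallocate(FALLOC_FL_PUNCH_HOLE) offers the same
operation through the file descriptor, but in the 2.6.21 timeframe of these
patches madvise(MADV_REMOVE) was the holepunch entry point for tmpfs.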