From 01de85e057328ecbef36e108673b1e81059d54c1 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Tue, 17 Oct 2006 19:50:36 +0200
Subject: [PATCH] Add lockless helpers for remove_suid()

Right now users have to grab i_mutex before calling remove_suid(), in the
unlikely event that a call to ->setattr() may be needed. Split up the
function in two parts:

- One to check if we need to remove suid
- One to actually remove it

The first we can call lockless.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 mm/filemap.c | 30 ++++++++++++++++++++++--------
 1 file changed, 22 insertions(+), 8 deletions(-)

(limited to 'mm/filemap.c')

diff --git a/mm/filemap.c b/mm/filemap.c
index 3464b681f844..7c7addb9333c 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1884,11 +1884,10 @@ repeat:
  *	if suid or (sgid and xgrp)
  *		remove privs
  */
-int remove_suid(struct dentry *dentry)
+int should_remove_suid(struct dentry *dentry)
 {
 	mode_t mode = dentry->d_inode->i_mode;
 	int kill = 0;
-	int result = 0;
 
 	/* suid always must be killed */
 	if (unlikely(mode & S_ISUID))
@@ -1901,13 +1900,28 @@ int remove_suid(struct dentry *dentry)
 	if (unlikely((mode & S_ISGID) && (mode & S_IXGRP)))
 		kill |= ATTR_KILL_SGID;
 
-	if (unlikely(kill && !capable(CAP_FSETID))) {
-		struct iattr newattrs;
+	if (unlikely(kill && !capable(CAP_FSETID)))
+		return kill;
 
-		newattrs.ia_valid = ATTR_FORCE | kill;
-		result = notify_change(dentry, &newattrs);
-	}
-	return result;
+	return 0;
+}
+
+int __remove_suid(struct dentry *dentry, int kill)
+{
+	struct iattr newattrs;
+
+	newattrs.ia_valid = ATTR_FORCE | kill;
+	return notify_change(dentry, &newattrs);
+}
+
+int remove_suid(struct dentry *dentry)
+{
+	int kill = should_remove_suid(dentry);
+
+	if (unlikely(kill))
+		return __remove_suid(dentry, kill);
+
+	return 0;
 }
 EXPORT_SYMBOL(remove_suid);
 
-- 
cgit v1.2.3


From fb5527e68d495650a7658fec9a7246bf922db212 Mon Sep 17 00:00:00 2001
From: Jeff Moyer <jmoyer@redhat.com>
Date: Thu, 19 Oct 2006 23:28:13 -0700
Subject: [PATCH] direct-io: sync and invalidate file region when falling back
 to buffered write

When direct-io falls back to buffered write, it will just leave the dirty data
floating about in pagecache, pending regular writeback.

But normal direct-io semantics are that IO is synchronous, and that it leaves
no pagecache behind.

So change the fallback-to-buffered-write code to sync the file region and to
then strip away the pagecache, just as a regular direct-io write would do.

Acked-by: Jeff Moyer <jmoyer@redhat.com>
Cc: Zach Brown <zach.brown@oracle.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/filemap.c | 51 +++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 45 insertions(+), 6 deletions(-)

(limited to 'mm/filemap.c')

diff --git a/mm/filemap.c b/mm/filemap.c
index 3464b681f844..57faa8d12099 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2222,7 +2222,7 @@ __generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov,
 				unsigned long nr_segs, loff_t *ppos)
 {
 	struct file *file = iocb->ki_filp;
-	const struct address_space * mapping = file->f_mapping;
+	struct address_space * mapping = file->f_mapping;
 	size_t ocount;		/* original count */
 	size_t count;		/* after file limit checks */
 	struct inode 	*inode = mapping->host;
@@ -2275,8 +2275,11 @@ __generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov,
 
 	/* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
 	if (unlikely(file->f_flags & O_DIRECT)) {
-		written = generic_file_direct_write(iocb, iov,
-				&nr_segs, pos, ppos, count, ocount);
+		loff_t endbyte;
+		ssize_t written_buffered;
+
+		written = generic_file_direct_write(iocb, iov, &nr_segs, pos,
+							ppos, count, ocount);
 		if (written < 0 || written == count)
 			goto out;
 		/*
@@ -2285,10 +2288,46 @@ __generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov,
 		 */
 		pos += written;
 		count -= written;
-	}
+		written_buffered = generic_file_buffered_write(iocb, iov,
+						nr_segs, pos, ppos, count,
+						written);
+		/*
+		 * If generic_file_buffered_write() retuned a synchronous error
+		 * then we want to return the number of bytes which were
+		 * direct-written, or the error code if that was zero.  Note
+		 * that this differs from normal direct-io semantics, which
+		 * will return -EFOO even if some bytes were written.
+		 */
+		if (written_buffered < 0) {
+			err = written_buffered;
+			goto out;
+		}
 
-	written = generic_file_buffered_write(iocb, iov, nr_segs,
-			pos, ppos, count, written);
+		/*
+		 * We need to ensure that the page cache pages are written to
+		 * disk and invalidated to preserve the expected O_DIRECT
+		 * semantics.
+		 */
+		endbyte = pos + written_buffered - written - 1;
+		err = do_sync_file_range(file, pos, endbyte,
+					 SYNC_FILE_RANGE_WAIT_BEFORE|
+					 SYNC_FILE_RANGE_WRITE|
+					 SYNC_FILE_RANGE_WAIT_AFTER);
+		if (err == 0) {
+			written = written_buffered;
+			invalidate_mapping_pages(mapping,
+						 pos >> PAGE_CACHE_SHIFT,
+						 endbyte >> PAGE_CACHE_SHIFT);
+		} else {
+			/*
+			 * We don't know how much we wrote, so just return
+			 * the number of bytes which were direct-written
+			 */
+		}
+	} else {
+		written = generic_file_buffered_write(iocb, iov, nr_segs,
+				pos, ppos, count, written);
+	}
 out:
 	current->backing_dev_info = NULL;
 	return written ? written : err;
-- 
cgit v1.2.3


From 82591e6ea234762eeaa8b2337fe060ed438c18dc Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Thu, 19 Oct 2006 23:29:10 -0700
Subject: [PATCH] mm: more commenting on lock ordering

Clarify lockorder comments now that sys_msync dropps mmap_sem before
calling do_fsync.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/filemap.c |  4 ++--
 mm/rmap.c    | 36 +++++++++++++++---------------------
 2 files changed, 17 insertions(+), 23 deletions(-)

(limited to 'mm/filemap.c')

diff --git a/mm/filemap.c b/mm/filemap.c
index 57faa8d12099..8558732e85c1 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -75,8 +75,8 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
  *  ->mmap_sem
  *    ->lock_page		(access_process_vm)
  *
- *  ->mmap_sem
- *    ->i_mutex			(msync)
+ *  ->i_mutex			(generic_file_buffered_write)
+ *    ->mmap_sem		(fault_in_pages_readable->do_page_fault)
  *
  *  ->i_mutex
  *    ->i_alloc_sem             (various)
diff --git a/mm/rmap.c b/mm/rmap.c
index a9136d8b7577..d8a842a586db 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -21,27 +21,21 @@
  * Lock ordering in mm:
  *
  * inode->i_mutex	(while writing or truncating, not reading or faulting)
- *   inode->i_alloc_sem
- *
- * When a page fault occurs in writing from user to file, down_read
- * of mmap_sem nests within i_mutex; in sys_msync, i_mutex nests within
- * down_read of mmap_sem; i_mutex and down_write of mmap_sem are never
- * taken together; in truncation, i_mutex is taken outermost.
- *
- * mm->mmap_sem
- *   page->flags PG_locked (lock_page)
- *     mapping->i_mmap_lock
- *       anon_vma->lock
- *         mm->page_table_lock or pte_lock
- *           zone->lru_lock (in mark_page_accessed, isolate_lru_page)
- *           swap_lock (in swap_duplicate, swap_info_get)
- *             mmlist_lock (in mmput, drain_mmlist and others)
- *             mapping->private_lock (in __set_page_dirty_buffers)
- *             inode_lock (in set_page_dirty's __mark_inode_dirty)
- *               sb_lock (within inode_lock in fs/fs-writeback.c)
- *               mapping->tree_lock (widely used, in set_page_dirty,
- *                         in arch-dependent flush_dcache_mmap_lock,
- *                         within inode_lock in __sync_single_inode)
+ *   inode->i_alloc_sem (vmtruncate_range)
+ *   mm->mmap_sem
+ *     page->flags PG_locked (lock_page)
+ *       mapping->i_mmap_lock
+ *         anon_vma->lock
+ *           mm->page_table_lock or pte_lock
+ *             zone->lru_lock (in mark_page_accessed, isolate_lru_page)
+ *             swap_lock (in swap_duplicate, swap_info_get)
+ *               mmlist_lock (in mmput, drain_mmlist and others)
+ *               mapping->private_lock (in __set_page_dirty_buffers)
+ *               inode_lock (in set_page_dirty's __mark_inode_dirty)
+ *                 sb_lock (within inode_lock in fs/fs-writeback.c)
+ *                 mapping->tree_lock (widely used, in set_page_dirty,
+ *                           in arch-dependent flush_dcache_mmap_lock,
+ *                           within inode_lock in __sync_single_inode)
  */
 
 #include <linux/mm.h>
-- 
cgit v1.2.3


From 2ae88149a27cadf2840e0ab8155bef13be285c03 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Sat, 28 Oct 2006 10:38:23 -0700
Subject: [PATCH] mm: clean up pagecache allocation

- Consolidate page_cache_alloc

- Fix splice: only the pagecache pages and filesystem data need to use
  mapping_gfp_mask.

- Fix grab_cache_page_nowait: same as splice, also honour NUMA placement.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Cc: Jens Axboe <jens.axboe@oracle.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/splice.c             |  9 ++++-----
 include/linux/pagemap.h | 14 +++++++++-----
 mm/filemap.c            | 24 ++++++------------------
 3 files changed, 19 insertions(+), 28 deletions(-)

(limited to 'mm/filemap.c')

diff --git a/fs/splice.c b/fs/splice.c
index 49fb9f129938..8d705954d294 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -74,7 +74,7 @@ static int page_cache_pipe_buf_steal(struct pipe_inode_info *pipe,
 		wait_on_page_writeback(page);
 
 		if (PagePrivate(page))
-			try_to_release_page(page, mapping_gfp_mask(mapping));
+			try_to_release_page(page, GFP_KERNEL);
 
 		/*
 		 * If we succeeded in removing the mapping, set LRU flag
@@ -333,7 +333,7 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
 				break;
 
 			error = add_to_page_cache_lru(page, mapping, index,
-					      mapping_gfp_mask(mapping));
+					      GFP_KERNEL);
 			if (unlikely(error)) {
 				page_cache_release(page);
 				if (error == -EEXIST)
@@ -557,7 +557,6 @@ static int pipe_to_file(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
 {
 	struct file *file = sd->file;
 	struct address_space *mapping = file->f_mapping;
-	gfp_t gfp_mask = mapping_gfp_mask(mapping);
 	unsigned int offset, this_len;
 	struct page *page;
 	pgoff_t index;
@@ -591,7 +590,7 @@ static int pipe_to_file(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
 			goto find_page;
 
 		page = buf->page;
-		if (add_to_page_cache(page, mapping, index, gfp_mask)) {
+		if (add_to_page_cache(page, mapping, index, GFP_KERNEL)) {
 			unlock_page(page);
 			goto find_page;
 		}
@@ -613,7 +612,7 @@ find_page:
 			 * This will also lock the page
 			 */
 			ret = add_to_page_cache_lru(page, mapping, index,
-						    gfp_mask);
+						    GFP_KERNEL);
 			if (unlikely(ret))
 				goto out;
 		}
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 64f950925151..c3e255bf8594 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -52,19 +52,23 @@ static inline void mapping_set_gfp_mask(struct address_space *m, gfp_t mask)
 void release_pages(struct page **pages, int nr, int cold);
 
 #ifdef CONFIG_NUMA
-extern struct page *page_cache_alloc(struct address_space *x);
-extern struct page *page_cache_alloc_cold(struct address_space *x);
+extern struct page *__page_cache_alloc(gfp_t gfp);
 #else
+static inline struct page *__page_cache_alloc(gfp_t gfp)
+{
+	return alloc_pages(gfp, 0);
+}
+#endif
+
 static inline struct page *page_cache_alloc(struct address_space *x)
 {
-	return alloc_pages(mapping_gfp_mask(x), 0);
+	return __page_cache_alloc(mapping_gfp_mask(x));
 }
 
 static inline struct page *page_cache_alloc_cold(struct address_space *x)
 {
-	return alloc_pages(mapping_gfp_mask(x)|__GFP_COLD, 0);
+	return __page_cache_alloc(mapping_gfp_mask(x)|__GFP_COLD);
 }
-#endif
 
 typedef int filler_t(void *, struct page *);
 
diff --git a/mm/filemap.c b/mm/filemap.c
index cb26e33fd0ff..7b84dc814347 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -467,25 +467,15 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
 }
 
 #ifdef CONFIG_NUMA
-struct page *page_cache_alloc(struct address_space *x)
+struct page *__page_cache_alloc(gfp_t gfp)
 {
 	if (cpuset_do_page_mem_spread()) {
 		int n = cpuset_mem_spread_node();
-		return alloc_pages_node(n, mapping_gfp_mask(x), 0);
+		return alloc_pages_node(n, gfp, 0);
 	}
-	return alloc_pages(mapping_gfp_mask(x), 0);
+	return alloc_pages(gfp, 0);
 }
-EXPORT_SYMBOL(page_cache_alloc);
-
-struct page *page_cache_alloc_cold(struct address_space *x)
-{
-	if (cpuset_do_page_mem_spread()) {
-		int n = cpuset_mem_spread_node();
-		return alloc_pages_node(n, mapping_gfp_mask(x)|__GFP_COLD, 0);
-	}
-	return alloc_pages(mapping_gfp_mask(x)|__GFP_COLD, 0);
-}
-EXPORT_SYMBOL(page_cache_alloc_cold);
+EXPORT_SYMBOL(__page_cache_alloc);
 #endif
 
 static int __sleep_on_page_lock(void *word)
@@ -826,7 +816,6 @@ struct page *
 grab_cache_page_nowait(struct address_space *mapping, unsigned long index)
 {
 	struct page *page = find_get_page(mapping, index);
-	gfp_t gfp_mask;
 
 	if (page) {
 		if (!TestSetPageLocked(page))
@@ -834,9 +823,8 @@ grab_cache_page_nowait(struct address_space *mapping, unsigned long index)
 		page_cache_release(page);
 		return NULL;
 	}
-	gfp_mask = mapping_gfp_mask(mapping) & ~__GFP_FS;
-	page = alloc_pages(gfp_mask, 0);
-	if (page && add_to_page_cache_lru(page, mapping, index, gfp_mask)) {
+	page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~__GFP_FS);
+	if (page && add_to_page_cache_lru(page, mapping, index, GFP_KERNEL)) {
 		page_cache_release(page);
 		page = NULL;
 	}
-- 
cgit v1.2.3