From c52c47e4b4fbe4284602fc2ccbfc4a4d8dc05b49 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Sat, 29 Apr 2017 20:12:16 -0400
Subject: jbd2: Fix lockdep splat with generic/270 test

I've hit a lockdep splat with generic/270 test complaining that:

3216.fsstress.b/3533 is trying to acquire lock:
 (jbd2_handle){++++..}, at: [<ffffffff813152e0>] jbd2_log_wait_commit+0x0/0x150

but task is already holding lock:
 (jbd2_handle){++++..}, at: [<ffffffff8130bd3b>] start_this_handle+0x35b/0x850

The underlying problem is that jbd2_journal_force_commit_nested()
(called from ext4_should_retry_alloc()) may get called while a
transaction handle is started. In such case it takes care to not wait
for commit of the running transaction (which would deadlock) but only
for a commit of a transaction that is already committing (which is safe
as that doesn't wait for any filesystem locks).

In fact there are also other callers of jbd2_log_wait_commit() that take
care to pass tid of a transaction that is already committing and for
those cases, the lockdep instrumentation is too restrictive and leading
to false positive reports. Fix the problem by calling
jbd2_might_wait_for_commit() from jbd2_log_wait_commit() only if the
transaction isn't already committing.

Fixes: 1eaa566d368b214d99cbb973647c1b0b8102a9ae
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/jbd2/journal.c | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

(limited to 'fs')
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 5adc2fb62b0f..9410ec462ba6 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -691,8 +691,21 @@ int jbd2_log_wait_commit(journal_t *journal, tid_t tid)
 {
 	int err = 0;
 
-	jbd2_might_wait_for_commit(journal);
 	read_lock(&journal->j_state_lock);
+#ifdef CONFIG_PROVE_LOCKING
+	/*
+	 * Some callers make sure transaction is already committing and in that
+	 * case we cannot block on open handles anymore. So don't warn in that
+	 * case.
+	 */
+	if (tid_gt(tid, journal->j_commit_sequence) &&
+	    (!journal->j_committing_transaction ||
+	     journal->j_committing_transaction->t_tid != tid)) {
+		read_unlock(&journal->j_state_lock);
+		jbd2_might_wait_for_commit(journal);
+		read_lock(&journal->j_state_lock);
+	}
+#endif
 #ifdef CONFIG_JBD2_DEBUG
 	if (!tid_geq(journal->j_commit_request, tid)) {
 		printk(KERN_ERR
-- 
cgit v1.2.3


From 5052b069acf73866d00077d8bc49983c3ee903e5 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Sat, 29 Apr 2017 21:07:30 -0400
Subject: jbd2: fix dbench4 performance regression for 'nobarrier' mounts

Commit b685d3d65ac7 "block: treat REQ_FUA and REQ_PREFLUSH as
synchronous" removed REQ_SYNC flag from WRITE_FUA implementation. Since
JBD2 strips REQ_FUA and REQ_FLUSH flags from submitted IO when the
filesystem is mounted with nobarrier mount option, journal superblock
writes ended up being async writes after this patch and that caused
heavy performance regression for dbench4 benchmark with high number of
processes. In my test setup with HP RAID array with non-volatile write
cache and 32 GB ram, dbench4 runs with 8 processes regressed by ~25%.

Fix the problem by making sure journal superblock writes are always
treated as synchronous since they generally block progress of the
journalling machinery and thus the whole filesystem.

Fixes: b685d3d65ac791406e0dfd8779cc9b3707fea5a3
CC: stable@vger.kernel.org
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/jbd2/journal.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 9410ec462ba6..f1906fa54321 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -1361,7 +1361,7 @@ static int jbd2_write_superblock(journal_t *journal, int write_flags)
 	jbd2_superblock_csum_set(journal, sb);
 	get_bh(bh);
 	bh->b_end_io = end_buffer_write_sync;
-	ret = submit_bh(REQ_OP_WRITE, write_flags, bh);
+	ret = submit_bh(REQ_OP_WRITE, write_flags | REQ_SYNC, bh);
 	wait_on_buffer(bh);
 	if (buffer_write_io_error(bh)) {
 		clear_buffer_write_io_error(bh);
-- 
cgit v1.2.3


From 1bc0af600b011dbbf9bbf39664b858ea2e365729 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Sat, 29 Apr 2017 23:27:26 -0400
Subject: ext4: trim return value and 'dir' argument from ext4_insert_dentry()

In the initial implementation of ext4 encryption, the filename was
encrypted in ext4_insert_dentry(), which could fail and also required
access to the 'dir' inode.  Since then ext4 filename encryption has been
changed to encrypt the filename earlier, so we can revert the additions
to ext4_insert_dentry().

Signed-off-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/ext4.h   |  9 ++++-----
 fs/ext4/inline.c |  2 +-
 fs/ext4/namei.c  | 17 ++++++-----------
 3 files changed, 11 insertions(+), 17 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index fb69ee2388db..deb36ece85bf 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -2356,11 +2356,10 @@ extern int ext4_find_dest_de(struct inode *dir, struct inode *inode,
 			     void *buf, int buf_size,
 			     struct ext4_filename *fname,
 			     struct ext4_dir_entry_2 **dest_de);
-int ext4_insert_dentry(struct inode *dir,
-		       struct inode *inode,
-		       struct ext4_dir_entry_2 *de,
-		       int buf_size,
-		       struct ext4_filename *fname);
+void ext4_insert_dentry(struct inode *inode,
+			struct ext4_dir_entry_2 *de,
+			int buf_size,
+			struct ext4_filename *fname);
 static inline void ext4_update_dx_flag(struct inode *inode)
 {
 	if (!ext4_has_feature_dir_index(inode->i_sb))
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index 375fb1c05d49..d5dea4c293ef 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -1034,7 +1034,7 @@ static int ext4_add_dirent_to_inline(handle_t *handle,
 	err = ext4_journal_get_write_access(handle, iloc->bh);
 	if (err)
 		return err;
-	ext4_insert_dentry(dir, inode, de, inline_size, fname);
+	ext4_insert_dentry(inode, de, inline_size, fname);
 
 	ext4_show_inline_dir(dir, iloc->bh, inline_start, inline_size);
 
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 07e5e1405771..6577a3c45815 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1869,11 +1869,10 @@ return_result:
 	return res;
 }
 
-int ext4_insert_dentry(struct inode *dir,
-		       struct inode *inode,
-		       struct ext4_dir_entry_2 *de,
-		       int buf_size,
-		       struct ext4_filename *fname)
+void ext4_insert_dentry(struct inode *inode,
+			struct ext4_dir_entry_2 *de,
+			int buf_size,
+			struct ext4_filename *fname)
 {
 
 	int nlen, rlen;
@@ -1892,7 +1891,6 @@ int ext4_insert_dentry(struct inode *dir,
 	ext4_set_de_type(inode->i_sb, de, inode->i_mode);
 	de->name_len = fname_len(fname);
 	memcpy(de->name, fname_name(fname), fname_len(fname));
-	return 0;
 }
 
 /*
@@ -1928,11 +1926,8 @@ static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname,
 		return err;
 	}
 
-	/* By now the buffer is marked for journaling. Due to crypto operations,
-	 * the following function call may fail */
-	err = ext4_insert_dentry(dir, inode, de, blocksize, fname);
-	if (err < 0)
-		return err;
+	/* By now the buffer is marked for journaling */
+	ext4_insert_dentry(inode, de, blocksize, fname);
 
 	/*
 	 * XXX shouldn't update any times until successful
-- 
cgit v1.2.3


From d60061867303aa2fee516e9a34efc15e78d975a9 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Sat, 29 Apr 2017 23:47:50 -0400
Subject: ext4: constify static data that is never modified

Constify static data in ext4 that is never (intentionally) modified so
that it is placed in .rodata and benefits from memory protection.

Signed-off-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/ext4.h    | 4 ++--
 fs/ext4/mballoc.c | 2 +-
 fs/ext4/super.c   | 5 +++--
 fs/ext4/sysfs.c   | 8 ++++----
 fs/ext4/xattr.c   | 2 +-
 5 files changed, 11 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index deb36ece85bf..13742d6c83d2 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -2365,7 +2365,7 @@ static inline void ext4_update_dx_flag(struct inode *inode)
 	if (!ext4_has_feature_dir_index(inode->i_sb))
 		ext4_clear_inode_flag(inode, EXT4_INODE_INDEX);
 }
-static unsigned char ext4_filetype_table[] = {
+static const unsigned char ext4_filetype_table[] = {
 	DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
 };
 
@@ -3050,7 +3050,7 @@ extern int ext4_handle_dirty_dirent_node(handle_t *handle,
 					 struct inode *inode,
 					 struct buffer_head *bh);
 #define S_SHIFT 12
-static unsigned char ext4_type_by_mode[S_IFMT >> S_SHIFT] = {
+static const unsigned char ext4_type_by_mode[S_IFMT >> S_SHIFT] = {
 	[S_IFREG >> S_SHIFT]	= EXT4_FT_REG_FILE,
 	[S_IFDIR >> S_SHIFT]	= EXT4_FT_DIR,
 	[S_IFCHR >> S_SHIFT]	= EXT4_FT_CHRDEV,
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 354dc1a894c2..dbe51301c2bb 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -357,7 +357,7 @@ static struct kmem_cache *ext4_free_data_cachep;
 #define NR_GRPINFO_CACHES 8
 static struct kmem_cache *ext4_groupinfo_caches[NR_GRPINFO_CACHES];
 
-static const char *ext4_groupinfo_slab_names[NR_GRPINFO_CACHES] = {
+static const char * const ext4_groupinfo_slab_names[NR_GRPINFO_CACHES] = {
 	"ext4_groupinfo_1k", "ext4_groupinfo_2k", "ext4_groupinfo_4k",
 	"ext4_groupinfo_8k", "ext4_groupinfo_16k", "ext4_groupinfo_32k",
 	"ext4_groupinfo_64k", "ext4_groupinfo_128k"
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index a9448db1cf7e..7b82487401d2 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1208,7 +1208,7 @@ static const struct fscrypt_operations ext4_cryptops = {
 #endif
 
 #ifdef CONFIG_QUOTA
-static char *quotatypes[] = INITQFNAMES;
+static const char * const quotatypes[] = INITQFNAMES;
 #define QTYPE2NAME(t) (quotatypes[t])
 
 static int ext4_write_dquot(struct dquot *dquot);
@@ -1422,7 +1422,8 @@ static ext4_fsblk_t get_sb_block(void **data)
 }
 
 #define DEFAULT_JOURNAL_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3))
-static char deprecated_msg[] = "Mount option \"%s\" will be removed by %s\n"
+static const char deprecated_msg[] =
+	"Mount option \"%s\" will be removed by %s\n"
 	"Contact linux-ext4@vger.kernel.org if you think we should keep it.\n";
 
 #ifdef CONFIG_QUOTA
diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c
index 42145be5c6b4..d74dc5f81a04 100644
--- a/fs/ext4/sysfs.c
+++ b/fs/ext4/sysfs.c
@@ -34,7 +34,7 @@ typedef enum {
 	ptr_ext4_super_block_offset,
 } attr_ptr_t;
 
-static const char *proc_dirname = "fs/ext4";
+static const char proc_dirname[] = "fs/ext4";
 static struct proc_dir_entry *ext4_proc_root;
 
 struct ext4_attr {
@@ -375,7 +375,7 @@ static const struct file_operations ext4_seq_##name##_fops = { \
 PROC_FILE_SHOW_DEFN(es_shrinker_info);
 PROC_FILE_SHOW_DEFN(options);
 
-static struct ext4_proc_files {
+static const struct ext4_proc_files {
 	const char *name;
 	const struct file_operations *fops;
 } proc_files[] = {
@@ -388,7 +388,7 @@ static struct ext4_proc_files {
 int ext4_register_sysfs(struct super_block *sb)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
-	struct ext4_proc_files *p;
+	const struct ext4_proc_files *p;
 	int err;
 
 	sbi->s_kobj.kset = &ext4_kset;
@@ -412,7 +412,7 @@ int ext4_register_sysfs(struct super_block *sb)
 void ext4_unregister_sysfs(struct super_block *sb)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
-	struct ext4_proc_files *p;
+	const struct ext4_proc_files *p;
 
 	if (sbi->s_proc) {
 		for (p = proc_files; p->name; p++)
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 996e7900d4c8..a9ce2cf2ae69 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -81,7 +81,7 @@ static void ext4_xattr_rehash(struct ext4_xattr_header *,
 static int ext4_xattr_list(struct dentry *dentry, char *buffer,
 			   size_t buffer_size);
 
-static const struct xattr_handler *ext4_xattr_handler_map[] = {
+static const struct xattr_handler * const ext4_xattr_handler_map[] = {
 	[EXT4_XATTR_INDEX_USER]		     = &ext4_xattr_user_handler,
 #ifdef CONFIG_EXT4_FS_POSIX_ACL
 	[EXT4_XATTR_INDEX_POSIX_ACL_ACCESS]  = &posix_acl_access_xattr_handler,
-- 
cgit v1.2.3


From ba7ea1d8f45383cb88858057e9bf60cd8cf3b898 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Sat, 29 Apr 2017 23:53:17 -0400
Subject: ext4: merge ext4_xattr_list() into ext4_listxattr()

There's no difference between ext4_xattr_list() and ext4_listxattr(), so
merge them together and just have ext4_listxattr().  Some years ago they
took different arguments, but that's no longer the case.

Signed-off-by: Eric Biggers <ebiggers@google.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/xattr.c | 21 +++++----------------
 1 file changed, 5 insertions(+), 16 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index a9ce2cf2ae69..8e9d0d4c6234 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -78,8 +78,6 @@ static struct buffer_head *ext4_xattr_cache_find(struct inode *,
 						 struct mb_cache_entry **);
 static void ext4_xattr_rehash(struct ext4_xattr_header *,
 			      struct ext4_xattr_entry *);
-static int ext4_xattr_list(struct dentry *dentry, char *buffer,
-			   size_t buffer_size);
 
 static const struct xattr_handler * const ext4_xattr_handler_map[] = {
 	[EXT4_XATTR_INDEX_USER]		     = &ext4_xattr_user_handler,
@@ -163,17 +161,6 @@ ext4_xattr_handler(int name_index)
 	return handler;
 }
 
-/*
- * Inode operation listxattr()
- *
- * d_inode(dentry)->i_mutex: don't care
- */
-ssize_t
-ext4_listxattr(struct dentry *dentry, char *buffer, size_t size)
-{
-	return ext4_xattr_list(dentry, buffer, size);
-}
-
 static int
 ext4_xattr_check_names(struct ext4_xattr_entry *entry, void *end,
 		       void *value_start)
@@ -519,7 +506,9 @@ cleanup:
 }
 
 /*
- * ext4_xattr_list()
+ * Inode operation listxattr()
+ *
+ * d_inode(dentry)->i_rwsem: don't care
  *
  * Copy a list of attribute names into the buffer
  * provided, or compute the buffer size required.
@@ -528,8 +517,8 @@ cleanup:
  * Returns a negative error number on failure, or the number of bytes
  * used / required on success.
  */
-static int
-ext4_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size)
+ssize_t
+ext4_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
 {
 	int ret, ret2;
 
-- 
cgit v1.2.3


From 2c4f9923374843384cfd3a2edc0f02c19a7a29e2 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Sat, 29 Apr 2017 23:56:52 -0400
Subject: ext4: rename ext4_xattr_check_names() to ext4_xattr_check_entries()

ext4_xattr_check_names() actually validates both the xattr names and
values, not just the names.  So rename it to ext4_xattr_check_entries()
to avoid confusion.

Signed-off-by: Eric Biggers <ebiggers@google.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/xattr.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 8e9d0d4c6234..c5a3e98bd74d 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -162,8 +162,8 @@ ext4_xattr_handler(int name_index)
 }
 
 static int
-ext4_xattr_check_names(struct ext4_xattr_entry *entry, void *end,
-		       void *value_start)
+ext4_xattr_check_entries(struct ext4_xattr_entry *entry, void *end,
+			 void *value_start)
 {
 	struct ext4_xattr_entry *e = entry;
 
@@ -217,8 +217,8 @@ ext4_xattr_check_block(struct inode *inode, struct buffer_head *bh)
 		return -EFSCORRUPTED;
 	if (!ext4_xattr_block_csum_verify(inode, bh))
 		return -EFSBADCRC;
-	error = ext4_xattr_check_names(BFIRST(bh), bh->b_data + bh->b_size,
-				       bh->b_data);
+	error = ext4_xattr_check_entries(BFIRST(bh), bh->b_data + bh->b_size,
+					 bh->b_data);
 	if (!error)
 		set_buffer_verified(bh);
 	return error;
@@ -233,7 +233,7 @@ __xattr_check_inode(struct inode *inode, struct ext4_xattr_ibody_header *header,
 	if (end - (void *)header < sizeof(*header) + sizeof(u32) ||
 	    (header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)))
 		goto errout;
-	error = ext4_xattr_check_names(IFIRST(header), end, IFIRST(header));
+	error = ext4_xattr_check_entries(IFIRST(header), end, IFIRST(header));
 errout:
 	if (error)
 		__ext4_error_inode(inode, function, line, 0,
-- 
cgit v1.2.3


From 6ba644b9fd8ce243687139dbd8ca918ad4b09f61 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Sun, 30 Apr 2017 00:01:02 -0400
Subject: ext4: remove ext4_xattr_check_entry()

ext4_xattr_check_entry() was redundant with validation of the full xattr
entries list in ext4_xattr_check_entries(), which all callers also did.
ext4_xattr_check_entry() also didn't actually do correct validation;
specifically, it never checked that the value doesn't overlap the xattr
names, nor did it account for padding when checking whether the xattr
value overflows the available space.  So remove it to eliminate any
potential confusion.

Signed-off-by: Eric Biggers <ebiggers@google.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/xattr.c | 30 ++++++------------------------
 1 file changed, 6 insertions(+), 24 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index c5a3e98bd74d..8fb7ce14e6eb 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -244,20 +244,9 @@ errout:
 #define xattr_check_inode(inode, header, end) \
 	__xattr_check_inode((inode), (header), (end), __func__, __LINE__)
 
-static inline int
-ext4_xattr_check_entry(struct ext4_xattr_entry *entry, size_t size)
-{
-	size_t value_size = le32_to_cpu(entry->e_value_size);
-
-	if (entry->e_value_block != 0 || value_size > size ||
-	    le16_to_cpu(entry->e_value_offs) + value_size > size)
-		return -EFSCORRUPTED;
-	return 0;
-}
-
 static int
 ext4_xattr_find_entry(struct ext4_xattr_entry **pentry, int name_index,
-		      const char *name, size_t size, int sorted)
+		      const char *name, int sorted)
 {
 	struct ext4_xattr_entry *entry;
 	size_t name_len;
@@ -277,8 +266,6 @@ ext4_xattr_find_entry(struct ext4_xattr_entry **pentry, int name_index,
 			break;
 	}
 	*pentry = entry;
-	if (!cmp && ext4_xattr_check_entry(entry, size))
-		return -EFSCORRUPTED;
 	return cmp ? -ENODATA : 0;
 }
 
@@ -306,7 +293,6 @@ ext4_xattr_block_get(struct inode *inode, int name_index, const char *name,
 	ea_bdebug(bh, "b_count=%d, refcount=%d",
 		atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount));
 	if (ext4_xattr_check_block(inode, bh)) {
-bad_block:
 		EXT4_ERROR_INODE(inode, "bad block %llu",
 				 EXT4_I(inode)->i_file_acl);
 		error = -EFSCORRUPTED;
@@ -314,9 +300,7 @@ bad_block:
 	}
 	ext4_xattr_cache_insert(ext4_mb_cache, bh);
 	entry = BFIRST(bh);
-	error = ext4_xattr_find_entry(&entry, name_index, name, bh->b_size, 1);
-	if (error == -EFSCORRUPTED)
-		goto bad_block;
+	error = ext4_xattr_find_entry(&entry, name_index, name, 1);
 	if (error)
 		goto cleanup;
 	size = le32_to_cpu(entry->e_value_size);
@@ -353,13 +337,12 @@ ext4_xattr_ibody_get(struct inode *inode, int name_index, const char *name,
 		return error;
 	raw_inode = ext4_raw_inode(&iloc);
 	header = IHDR(inode, raw_inode);
-	entry = IFIRST(header);
 	end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size;
 	error = xattr_check_inode(inode, header, end);
 	if (error)
 		goto cleanup;
-	error = ext4_xattr_find_entry(&entry, name_index, name,
-				      end - (void *)entry, 0);
+	entry = IFIRST(header);
+	error = ext4_xattr_find_entry(&entry, name_index, name, 0);
 	if (error)
 		goto cleanup;
 	size = le32_to_cpu(entry->e_value_size);
@@ -793,7 +776,7 @@ ext4_xattr_block_find(struct inode *inode, struct ext4_xattr_info *i,
 		bs->s.end = bs->bh->b_data + bs->bh->b_size;
 		bs->s.here = bs->s.first;
 		error = ext4_xattr_find_entry(&bs->s.here, i->name_index,
-					      i->name, bs->bh->b_size, 1);
+					      i->name, 1);
 		if (error && error != -ENODATA)
 			goto cleanup;
 		bs->s.not_found = error;
@@ -1065,8 +1048,7 @@ int ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i,
 			return error;
 		/* Find the named attribute. */
 		error = ext4_xattr_find_entry(&is->s.here, i->name_index,
-					      i->name, is->s.end -
-					      (void *)is->s.base, 0);
+					      i->name, 0);
 		if (error && error != -ENODATA)
 			return error;
 		is->s.not_found = error;
-- 
cgit v1.2.3


From 7b4cc9787fe35b3ee2dfb1c35e22eafc32e00c33 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Sun, 30 Apr 2017 00:10:50 -0400
Subject: ext4: evict inline data when writing to memory map

Currently the case of writing via mmap to a file with inline data is not
handled.  This is maybe a rare case since it requires a writable memory
map of a very small file, but it is trivial to trigger with on
inline_data filesystem, and it causes the
'BUG_ON(ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA));' in
ext4_writepages() to be hit:

    mkfs.ext4 -O inline_data /dev/vdb
    mount /dev/vdb /mnt
    xfs_io -f /mnt/file \
	-c 'pwrite 0 1' \
	-c 'mmap -w 0 1m' \
	-c 'mwrite 0 1' \
	-c 'fsync'

	kernel BUG at fs/ext4/inode.c:2723!
	invalid opcode: 0000 [#1] SMP
	CPU: 1 PID: 2532 Comm: xfs_io Not tainted 4.11.0-rc1-xfstests-00301-g071d9acf3d1f #633
	Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-20170228_101828-anatol 04/01/2014
	task: ffff88003d3a8040 task.stack: ffffc90000300000
	RIP: 0010:ext4_writepages+0xc89/0xf8a
	RSP: 0018:ffffc90000303ca0 EFLAGS: 00010283
	RAX: 0000028410000000 RBX: ffff8800383fa3b0 RCX: ffffffff812afcdc
	RDX: 00000a9d00000246 RSI: ffffffff81e660e0 RDI: 0000000000000246
	RBP: ffffc90000303dc0 R08: 0000000000000002 R09: 869618e8f99b4fa5
	R10: 00000000852287a2 R11: 00000000a03b49f4 R12: ffff88003808e698
	R13: 0000000000000000 R14: 7fffffffffffffff R15: 7fffffffffffffff
	FS:  00007fd3e53094c0(0000) GS:ffff88003e400000(0000) knlGS:0000000000000000
	CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
	CR2: 00007fd3e4c51000 CR3: 000000003d554000 CR4: 00000000003406e0
	Call Trace:
	 ? _raw_spin_unlock+0x27/0x2a
	 ? kvm_clock_read+0x1e/0x20
	 do_writepages+0x23/0x2c
	 ? do_writepages+0x23/0x2c
	 __filemap_fdatawrite_range+0x80/0x87
	 filemap_write_and_wait_range+0x67/0x8c
	 ext4_sync_file+0x20e/0x472
	 vfs_fsync_range+0x8e/0x9f
	 ? syscall_trace_enter+0x25b/0x2d0
	 vfs_fsync+0x1c/0x1e
	 do_fsync+0x31/0x4a
	 SyS_fsync+0x10/0x14
	 do_syscall_64+0x69/0x131
	 entry_SYSCALL64_slow_path+0x25/0x25

We could try to be smart and keep the inline data in this case, or at
least support delayed allocation when allocating the block, but these
solutions would be more complicated and don't seem worthwhile given how
rare this case seems to be.  So just fix the bug by calling
ext4_convert_inline_data() when we're asked to make a page writable, so
that any inline data gets evicted, with the block allocated immediately.

Reported-by: Nick Alcock <nick.alcock@oracle.com>
Cc: stable@vger.kernel.org
Reviewed-by: Andreas Dilger <adilger@dilger.ca>
Signed-off-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/inode.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'fs')

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index b9ffa9f4191f..88203ae5b154 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -5874,6 +5874,11 @@ int ext4_page_mkwrite(struct vm_fault *vmf)
 	file_update_time(vma->vm_file);
 
 	down_read(&EXT4_I(inode)->i_mmap_sem);
+
+	ret = ext4_convert_inline_data(inode);
+	if (ret)
+		goto out_ret;
+
 	/* Delalloc case is easy... */
 	if (test_opt(inode->i_sb, DELALLOC) &&
 	    !ext4_should_journal_data(inode) &&
-- 
cgit v1.2.3


From 0c9ec4beecac94cb450c8abb2ac8b7e8a79240ea Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <darrick.wong@oracle.com>
Date: Sun, 30 Apr 2017 00:36:53 -0400
Subject: ext4: support GETFSMAP ioctls

Support the GETFSMAP ioctls so that we can use the xfs free space
management tools to probe ext4 as well.  Note that this is a partial
implementation -- we only report fixed-location metadata and free space;
everything else is reported as "unknown".

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/Makefile            |  10 +-
 fs/ext4/fsmap.c             | 722 ++++++++++++++++++++++++++++++++++++++++++++
 fs/ext4/fsmap.h             |  69 +++++
 fs/ext4/ioctl.c             |  90 ++++++
 fs/ext4/mballoc.c           |  49 +++
 fs/ext4/mballoc.h           |  17 ++
 fs/ext4/super.c             |   1 +
 include/trace/events/ext4.h |  74 +++++
 8 files changed, 1027 insertions(+), 5 deletions(-)
 create mode 100644 fs/ext4/fsmap.c
 create mode 100644 fs/ext4/fsmap.h

(limited to 'fs')

diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile
index 354103f3490c..d9beca1653c5 100644
--- a/fs/ext4/Makefile
+++ b/fs/ext4/Makefile
@@ -4,11 +4,11 @@
 
 obj-$(CONFIG_EXT4_FS) += ext4.o
 
-ext4-y	:= balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o page-io.o \
-		ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \
-		ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o \
-		mmp.o indirect.o extents_status.o xattr.o xattr_user.o \
-		xattr_trusted.o inline.o readpage.o sysfs.o
+ext4-y	:= balloc.o bitmap.o block_validity.o dir.o ext4_jbd2.o extents.o \
+		extents_status.o file.o fsmap.o fsync.o hash.o ialloc.o \
+		indirect.o inline.o inode.o ioctl.o mballoc.o migrate.o \
+		mmp.o move_extent.o namei.o page-io.o readpage.o resize.o \
+		super.o symlink.o sysfs.o xattr.o xattr_trusted.o xattr_user.o
 
 ext4-$(CONFIG_EXT4_FS_POSIX_ACL)	+= acl.o
 ext4-$(CONFIG_EXT4_FS_SECURITY)		+= xattr_security.o
diff --git a/fs/ext4/fsmap.c b/fs/ext4/fsmap.c
new file mode 100644
index 000000000000..b19436098837
--- /dev/null
+++ b/fs/ext4/fsmap.c
@@ -0,0 +1,722 @@
+/*
+ * Copyright (C) 2017 Oracle.  All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+#include "ext4.h"
+#include <linux/fsmap.h>
+#include "fsmap.h"
+#include "mballoc.h"
+#include <linux/sort.h>
+#include <linux/list_sort.h>
+#include <trace/events/ext4.h>
+
+/* Convert an ext4_fsmap to an fsmap. */
+void ext4_fsmap_from_internal(struct super_block *sb, struct fsmap *dest,
+			      struct ext4_fsmap *src)
+{
+	dest->fmr_device = src->fmr_device;
+	dest->fmr_flags = src->fmr_flags;
+	dest->fmr_physical = src->fmr_physical << sb->s_blocksize_bits;
+	dest->fmr_owner = src->fmr_owner;
+	dest->fmr_offset = 0;
+	dest->fmr_length = src->fmr_length << sb->s_blocksize_bits;
+	dest->fmr_reserved[0] = 0;
+	dest->fmr_reserved[1] = 0;
+	dest->fmr_reserved[2] = 0;
+}
+
+/* Convert an fsmap to an ext4_fsmap. */
+void ext4_fsmap_to_internal(struct super_block *sb, struct ext4_fsmap *dest,
+			    struct fsmap *src)
+{
+	dest->fmr_device = src->fmr_device;
+	dest->fmr_flags = src->fmr_flags;
+	dest->fmr_physical = src->fmr_physical >> sb->s_blocksize_bits;
+	dest->fmr_owner = src->fmr_owner;
+	dest->fmr_length = src->fmr_length >> sb->s_blocksize_bits;
+}
+
+/* getfsmap query state */
+struct ext4_getfsmap_info {
+	struct ext4_fsmap_head	*gfi_head;
+	ext4_fsmap_format_t	gfi_formatter;	/* formatting fn */
+	void			*gfi_format_arg;/* format buffer */
+	ext4_fsblk_t		gfi_next_fsblk;	/* next fsblock we expect */
+	u32			gfi_dev;	/* device id */
+	ext4_group_t		gfi_agno;	/* bg number, if applicable */
+	struct ext4_fsmap	gfi_low;	/* low rmap key */
+	struct ext4_fsmap	gfi_high;	/* high rmap key */
+	struct ext4_fsmap	gfi_lastfree;	/* free ext at end of last bg */
+	struct list_head	gfi_meta_list;	/* fixed metadata list */
+	bool			gfi_last;	/* last extent? */
+};
+
+/* Associate a device with a getfsmap handler. */
+struct ext4_getfsmap_dev {
+	int			(*gfd_fn)(struct super_block *sb,
+				      struct ext4_fsmap *keys,
+				      struct ext4_getfsmap_info *info);
+	u32			gfd_dev;
+};
+
+/* Compare two getfsmap device handlers. */
+static int ext4_getfsmap_dev_compare(const void *p1, const void *p2)
+{
+	const struct ext4_getfsmap_dev *d1 = p1;
+	const struct ext4_getfsmap_dev *d2 = p2;
+
+	return d1->gfd_dev - d2->gfd_dev;
+}
+
+/* Compare a record against our starting point */
+static bool ext4_getfsmap_rec_before_low_key(struct ext4_getfsmap_info *info,
+					     struct ext4_fsmap *rec)
+{
+	return rec->fmr_physical < info->gfi_low.fmr_physical;
+}
+
+/*
+ * Format a reverse mapping for getfsmap, having translated rm_startblock
+ * into the appropriate daddr units.
+ */
+static int ext4_getfsmap_helper(struct super_block *sb,
+				struct ext4_getfsmap_info *info,
+				struct ext4_fsmap *rec)
+{
+	struct ext4_fsmap fmr;
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	ext4_fsblk_t rec_fsblk = rec->fmr_physical;
+	ext4_group_t agno;
+	ext4_grpblk_t cno;
+	int error;
+
+	if (fatal_signal_pending(current))
+		return -EINTR;
+
+	/*
+	 * Filter out records that start before our startpoint, if the
+	 * caller requested that.
+	 */
+	if (ext4_getfsmap_rec_before_low_key(info, rec)) {
+		rec_fsblk += rec->fmr_length;
+		if (info->gfi_next_fsblk < rec_fsblk)
+			info->gfi_next_fsblk = rec_fsblk;
+		return EXT4_QUERY_RANGE_CONTINUE;
+	}
+
+	/* Are we just counting mappings? */
+	if (info->gfi_head->fmh_count == 0) {
+		if (rec_fsblk > info->gfi_next_fsblk)
+			info->gfi_head->fmh_entries++;
+
+		if (info->gfi_last)
+			return EXT4_QUERY_RANGE_CONTINUE;
+
+		info->gfi_head->fmh_entries++;
+
+		rec_fsblk += rec->fmr_length;
+		if (info->gfi_next_fsblk < rec_fsblk)
+			info->gfi_next_fsblk = rec_fsblk;
+		return EXT4_QUERY_RANGE_CONTINUE;
+	}
+
+	/*
+	 * If the record starts past the last physical block we saw,
+	 * then we've found a gap.  Report the gap as being owned by
+	 * whatever the caller specified is the missing owner.
+	 */
+	if (rec_fsblk > info->gfi_next_fsblk) {
+		if (info->gfi_head->fmh_entries >= info->gfi_head->fmh_count)
+			return EXT4_QUERY_RANGE_ABORT;
+
+		ext4_get_group_no_and_offset(sb, info->gfi_next_fsblk,
+				&agno, &cno);
+		trace_ext4_fsmap_mapping(sb, info->gfi_dev, agno,
+				EXT4_C2B(sbi, cno),
+				rec_fsblk - info->gfi_next_fsblk,
+				EXT4_FMR_OWN_UNKNOWN);
+
+		fmr.fmr_device = info->gfi_dev;
+		fmr.fmr_physical = info->gfi_next_fsblk;
+		fmr.fmr_owner = EXT4_FMR_OWN_UNKNOWN;
+		fmr.fmr_length = rec_fsblk - info->gfi_next_fsblk;
+		fmr.fmr_flags = FMR_OF_SPECIAL_OWNER;
+		error = info->gfi_formatter(&fmr, info->gfi_format_arg);
+		if (error)
+			return error;
+		info->gfi_head->fmh_entries++;
+	}
+
+	if (info->gfi_last)
+		goto out;
+
+	/* Fill out the extent we found */
+	if (info->gfi_head->fmh_entries >= info->gfi_head->fmh_count)
+		return EXT4_QUERY_RANGE_ABORT;
+
+	ext4_get_group_no_and_offset(sb, rec_fsblk, &agno, &cno);
+	trace_ext4_fsmap_mapping(sb, info->gfi_dev, agno, EXT4_C2B(sbi, cno),
+			rec->fmr_length, rec->fmr_owner);
+
+	fmr.fmr_device = info->gfi_dev;
+	fmr.fmr_physical = rec_fsblk;
+	fmr.fmr_owner = rec->fmr_owner;
+	fmr.fmr_flags = FMR_OF_SPECIAL_OWNER;
+	fmr.fmr_length = rec->fmr_length;
+	error = info->gfi_formatter(&fmr, info->gfi_format_arg);
+	if (error)
+		return error;
+	info->gfi_head->fmh_entries++;
+
+out:
+	rec_fsblk += rec->fmr_length;
+	if (info->gfi_next_fsblk < rec_fsblk)
+		info->gfi_next_fsblk = rec_fsblk;
+	return EXT4_QUERY_RANGE_CONTINUE;
+}
+
+static inline ext4_fsblk_t ext4_fsmap_next_pblk(struct ext4_fsmap *fmr)
+{
+	return fmr->fmr_physical + fmr->fmr_length;
+}
+
+/* Transform a blockgroup's free record into a fsmap */
+static int ext4_getfsmap_datadev_helper(struct super_block *sb,
+					ext4_group_t agno, ext4_grpblk_t start,
+					ext4_grpblk_t len, void *priv)
+{
+	struct ext4_fsmap irec;
+	struct ext4_getfsmap_info *info = priv;
+	struct ext4_fsmap *p;
+	struct ext4_fsmap *tmp;
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	ext4_fsblk_t fsb;
+	ext4_fsblk_t fslen;
+	int error;
+
+	fsb = (EXT4_C2B(sbi, start) + ext4_group_first_block_no(sb, agno));
+	fslen = EXT4_C2B(sbi, len);
+
+	/* If the retained free extent record is set... */
+	if (info->gfi_lastfree.fmr_owner) {
+		/* ...and abuts this one, lengthen it and return. */
+		if (ext4_fsmap_next_pblk(&info->gfi_lastfree) == fsb) {
+			info->gfi_lastfree.fmr_length += fslen;
+			return 0;
+		}
+
+		/*
+		 * There's a gap between the two free extents; emit the
+		 * retained extent prior to merging the meta_list.
+		 */
+		error = ext4_getfsmap_helper(sb, info, &info->gfi_lastfree);
+		if (error)
+			return error;
+		info->gfi_lastfree.fmr_owner = 0;
+	}
+
+	/* Merge in any relevant extents from the meta_list */
+	list_for_each_entry_safe(p, tmp, &info->gfi_meta_list, fmr_list) {
+		if (p->fmr_physical + p->fmr_length <= info->gfi_next_fsblk) {
+			list_del(&p->fmr_list);
+			kfree(p);
+		} else if (p->fmr_physical < fsb) {
+			error = ext4_getfsmap_helper(sb, info, p);
+			if (error)
+				return error;
+
+			list_del(&p->fmr_list);
+			kfree(p);
+		}
+	}
+
+	irec.fmr_device = 0;
+	irec.fmr_physical = fsb;
+	irec.fmr_length = fslen;
+	irec.fmr_owner = EXT4_FMR_OWN_FREE;
+	irec.fmr_flags = 0;
+
+	/* If this is a free extent at the end of a bg, buffer it. */
+	if (ext4_fsmap_next_pblk(&irec) ==
+			ext4_group_first_block_no(sb, agno + 1)) {
+		info->gfi_lastfree = irec;
+		return 0;
+	}
+
+	/* Otherwise, emit it */
+	return ext4_getfsmap_helper(sb, info, &irec);
+}
+
+/* Execute a getfsmap query against the log device. */
+static int ext4_getfsmap_logdev(struct super_block *sb, struct ext4_fsmap *keys,
+				struct ext4_getfsmap_info *info)
+{
+	journal_t *journal = EXT4_SB(sb)->s_journal;
+	struct ext4_fsmap irec;
+
+	/* Set up search keys */
+	info->gfi_low = keys[0];
+	info->gfi_low.fmr_length = 0;
+
+	memset(&info->gfi_high, 0xFF, sizeof(info->gfi_high));
+
+	trace_ext4_fsmap_low_key(sb, info->gfi_dev, 0,
+			info->gfi_low.fmr_physical,
+			info->gfi_low.fmr_length,
+			info->gfi_low.fmr_owner);
+
+	trace_ext4_fsmap_high_key(sb, info->gfi_dev, 0,
+			info->gfi_high.fmr_physical,
+			info->gfi_high.fmr_length,
+			info->gfi_high.fmr_owner);
+
+	if (keys[0].fmr_physical > 0)
+		return 0;
+
+	/* Fabricate an rmap entry for the external log device. */
+	irec.fmr_physical = journal->j_blk_offset;
+	irec.fmr_length = journal->j_maxlen;
+	irec.fmr_owner = EXT4_FMR_OWN_LOG;
+	irec.fmr_flags = 0;
+
+	return ext4_getfsmap_helper(sb, info, &irec);
+}
+
+/* Helper to fill out an ext4_fsmap. */
+static inline int ext4_getfsmap_fill(struct list_head *meta_list,
+				     ext4_fsblk_t fsb, ext4_fsblk_t len,
+				     uint64_t owner)
+{
+	struct ext4_fsmap *fsm;
+
+	fsm = kmalloc(sizeof(*fsm), GFP_NOFS);
+	if (!fsm)
+		return -ENOMEM;
+	fsm->fmr_device = 0;
+	fsm->fmr_flags = 0;
+	fsm->fmr_physical = fsb;
+	fsm->fmr_owner = owner;
+	fsm->fmr_length = len;
+	list_add_tail(&fsm->fmr_list, meta_list);
+
+	return 0;
+}
+
+/*
+ * This function returns the number of file system metadata blocks at
+ * the beginning of a block group, including the reserved gdt blocks.
+ */
+static unsigned int ext4_getfsmap_find_sb(struct super_block *sb,
+					  ext4_group_t agno,
+					  struct list_head *meta_list)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	ext4_fsblk_t fsb = ext4_group_first_block_no(sb, agno);
+	ext4_fsblk_t len;
+	unsigned long first_meta_bg = le32_to_cpu(sbi->s_es->s_first_meta_bg);
+	unsigned long metagroup = agno / EXT4_DESC_PER_BLOCK(sb);
+	int error;
+
+	/* Record the superblock. */
+	if (ext4_bg_has_super(sb, agno)) {
+		error = ext4_getfsmap_fill(meta_list, fsb, 1, EXT4_FMR_OWN_FS);
+		if (error)
+			return error;
+		fsb++;
+	}
+
+	/* Record the group descriptors. */
+	len = ext4_bg_num_gdb(sb, agno);
+	if (!len)
+		return 0;
+	error = ext4_getfsmap_fill(meta_list, fsb, len,
+				   EXT4_FMR_OWN_GDT);
+	if (error)
+		return error;
+	fsb += len;
+
+	/* Reserved GDT blocks */
+	if (!ext4_has_feature_meta_bg(sb) || metagroup < first_meta_bg) {
+		len = le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks);
+		error = ext4_getfsmap_fill(meta_list, fsb, len,
+					   EXT4_FMR_OWN_RESV_GDT);
+		if (error)
+			return error;
+	}
+
+	return 0;
+}
+
+/* Compare two fsmap items. */
+static int ext4_getfsmap_compare(void *priv,
+				 struct list_head *a,
+				 struct list_head *b)
+{
+	struct ext4_fsmap *fa;
+	struct ext4_fsmap *fb;
+
+	fa = container_of(a, struct ext4_fsmap, fmr_list);
+	fb = container_of(b, struct ext4_fsmap, fmr_list);
+	if (fa->fmr_physical < fb->fmr_physical)
+		return -1;
+	else if (fa->fmr_physical > fb->fmr_physical)
+		return 1;
+	return 0;
+}
+
+/* Merge adjacent extents of fixed metadata. */
+static void ext4_getfsmap_merge_fixed_metadata(struct list_head *meta_list)
+{
+	struct ext4_fsmap *p;
+	struct ext4_fsmap *prev = NULL;
+	struct ext4_fsmap *tmp;
+
+	list_for_each_entry_safe(p, tmp, meta_list, fmr_list) {
+		if (!prev) {
+			prev = p;
+			continue;
+		}
+
+		if (prev->fmr_owner == p->fmr_owner &&
+		    prev->fmr_physical + prev->fmr_length == p->fmr_physical) {
+			prev->fmr_length += p->fmr_length;
+			list_del(&p->fmr_list);
+			kfree(p);
+		} else
+			prev = p;
+	}
+}
+
+/* Free a list of fixed metadata. */
+static void ext4_getfsmap_free_fixed_metadata(struct list_head *meta_list)
+{
+	struct ext4_fsmap *p;
+	struct ext4_fsmap *tmp;
+
+	list_for_each_entry_safe(p, tmp, meta_list, fmr_list) {
+		list_del(&p->fmr_list);
+		kfree(p);
+	}
+}
+
+/* Find all the fixed metadata in the filesystem. */
+int ext4_getfsmap_find_fixed_metadata(struct super_block *sb,
+				      struct list_head *meta_list)
+{
+	struct ext4_group_desc *gdp;
+	ext4_group_t agno;
+	int error;
+
+	INIT_LIST_HEAD(meta_list);
+
+	/* Collect everything. */
+	for (agno = 0; agno < EXT4_SB(sb)->s_groups_count; agno++) {
+		gdp = ext4_get_group_desc(sb, agno, NULL);
+		if (!gdp) {
+			error = -EFSCORRUPTED;
+			goto err;
+		}
+
+		/* Superblock & GDT */
+		error = ext4_getfsmap_find_sb(sb, agno, meta_list);
+		if (error)
+			goto err;
+
+		/* Block bitmap */
+		error = ext4_getfsmap_fill(meta_list,
+					   ext4_block_bitmap(sb, gdp), 1,
+					   EXT4_FMR_OWN_BLKBM);
+		if (error)
+			goto err;
+
+		/* Inode bitmap */
+		error = ext4_getfsmap_fill(meta_list,
+					   ext4_inode_bitmap(sb, gdp), 1,
+					   EXT4_FMR_OWN_INOBM);
+		if (error)
+			goto err;
+
+		/* Inodes */
+		error = ext4_getfsmap_fill(meta_list,
+					   ext4_inode_table(sb, gdp),
+					   EXT4_SB(sb)->s_itb_per_group,
+					   EXT4_FMR_OWN_INODES);
+		if (error)
+			goto err;
+	}
+
+	/* Sort the list */
+	list_sort(NULL, meta_list, ext4_getfsmap_compare);
+
+	/* Merge adjacent extents */
+	ext4_getfsmap_merge_fixed_metadata(meta_list);
+
+	return 0;
+err:
+	ext4_getfsmap_free_fixed_metadata(meta_list);
+	return error;
+}
+
+/* Execute a getfsmap query against the buddy bitmaps */
+static int ext4_getfsmap_datadev(struct super_block *sb,
+				 struct ext4_fsmap *keys,
+				 struct ext4_getfsmap_info *info)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	ext4_fsblk_t start_fsb;
+	ext4_fsblk_t end_fsb;
+	ext4_fsblk_t eofs;
+	ext4_group_t start_ag;
+	ext4_group_t end_ag;
+	ext4_grpblk_t first_cluster;
+	ext4_grpblk_t last_cluster;
+	int error = 0;
+
+	eofs = ext4_blocks_count(sbi->s_es);
+	if (keys[0].fmr_physical >= eofs)
+		return 0;
+	if (keys[1].fmr_physical >= eofs)
+		keys[1].fmr_physical = eofs - 1;
+	start_fsb = keys[0].fmr_physical;
+	end_fsb = keys[1].fmr_physical;
+
+	/* Determine first and last group to examine based on start and end */
+	ext4_get_group_no_and_offset(sb, start_fsb, &start_ag, &first_cluster);
+	ext4_get_group_no_and_offset(sb, end_fsb, &end_ag, &last_cluster);
+
+	/*
+	 * Convert the fsmap low/high keys to bg based keys.  Initialize
+	 * low to the fsmap low key and max out the high key to the end
+	 * of the bg.
+	 */
+	info->gfi_low = keys[0];
+	info->gfi_low.fmr_physical = EXT4_C2B(sbi, first_cluster);
+	info->gfi_low.fmr_length = 0;
+
+	memset(&info->gfi_high, 0xFF, sizeof(info->gfi_high));
+
+	/* Assemble a list of all the fixed-location metadata. */
+	error = ext4_getfsmap_find_fixed_metadata(sb, &info->gfi_meta_list);
+	if (error)
+		goto err;
+
+	/* Query each bg */
+	for (info->gfi_agno = start_ag;
+	     info->gfi_agno <= end_ag;
+	     info->gfi_agno++) {
+		/*
+		 * Set the bg high key from the fsmap high key if this
+		 * is the last bg that we're querying.
+		 */
+		if (info->gfi_agno == end_ag) {
+			info->gfi_high = keys[1];
+			info->gfi_high.fmr_physical = EXT4_C2B(sbi,
+					last_cluster);
+			info->gfi_high.fmr_length = 0;
+		}
+
+		trace_ext4_fsmap_low_key(sb, info->gfi_dev, info->gfi_agno,
+				info->gfi_low.fmr_physical,
+				info->gfi_low.fmr_length,
+				info->gfi_low.fmr_owner);
+
+		trace_ext4_fsmap_high_key(sb, info->gfi_dev, info->gfi_agno,
+				info->gfi_high.fmr_physical,
+				info->gfi_high.fmr_length,
+				info->gfi_high.fmr_owner);
+
+		error = ext4_mballoc_query_range(sb, info->gfi_agno,
+				EXT4_B2C(sbi, info->gfi_low.fmr_physical),
+				EXT4_B2C(sbi, info->gfi_high.fmr_physical),
+				ext4_getfsmap_datadev_helper, info);
+		if (error)
+			goto err;
+
+		/*
+		 * Set the bg low key to the start of the bg prior to
+		 * moving on to the next bg.
+		 */
+		if (info->gfi_agno == start_ag)
+			memset(&info->gfi_low, 0, sizeof(info->gfi_low));
+	}
+
+	/* Do we have a retained free extent? */
+	if (info->gfi_lastfree.fmr_owner) {
+		error = ext4_getfsmap_helper(sb, info, &info->gfi_lastfree);
+		if (error)
+			goto err;
+	}
+
+	/* Report any gaps at the end of the bg */
+	info->gfi_last = true;
+	error = ext4_getfsmap_datadev_helper(sb, end_ag, last_cluster, 0, info);
+	if (error)
+		goto err;
+
+err:
+	ext4_getfsmap_free_fixed_metadata(&info->gfi_meta_list);
+	return error;
+}
+
+/* Do we recognize the device? */
+static bool ext4_getfsmap_is_valid_device(struct super_block *sb,
+					  struct ext4_fsmap *fm)
+{
+	if (fm->fmr_device == 0 || fm->fmr_device == UINT_MAX ||
+	    fm->fmr_device == new_encode_dev(sb->s_bdev->bd_dev))
+		return true;
+	if (EXT4_SB(sb)->journal_bdev &&
+	    fm->fmr_device == new_encode_dev(EXT4_SB(sb)->journal_bdev->bd_dev))
+		return true;
+	return false;
+}
+
+/* Ensure that the low key is less than the high key. */
+static bool ext4_getfsmap_check_keys(struct ext4_fsmap *low_key,
+				     struct ext4_fsmap *high_key)
+{
+	if (low_key->fmr_device > high_key->fmr_device)
+		return false;
+	if (low_key->fmr_device < high_key->fmr_device)
+		return true;
+
+	if (low_key->fmr_physical > high_key->fmr_physical)
+		return false;
+	if (low_key->fmr_physical < high_key->fmr_physical)
+		return true;
+
+	if (low_key->fmr_owner > high_key->fmr_owner)
+		return false;
+	if (low_key->fmr_owner < high_key->fmr_owner)
+		return true;
+
+	return false;
+}
+
+#define EXT4_GETFSMAP_DEVS	2
+/*
+ * Get filesystem's extents as described in head, and format for
+ * output.  Calls formatter to fill the user's buffer until all
+ * extents are mapped, until the passed-in head->fmh_count slots have
+ * been filled, or until the formatter short-circuits the loop, if it
+ * is tracking filled-in extents on its own.
+ *
+ * Key to Confusion
+ * ----------------
+ * There are multiple levels of keys and counters at work here:
+ * _fsmap_head.fmh_keys		-- low and high fsmap keys passed in;
+ * 				   these reflect fs-wide block addrs.
+ * dkeys			-- fmh_keys used to query each device;
+ * 				   these are fmh_keys but w/ the low key
+ * 				   bumped up by fmr_length.
+ * _getfsmap_info.gfi_next_fsblk-- next fs block we expect to see; this
+ *				   is how we detect gaps in the fsmap
+ *				   records and report them.
+ * _getfsmap_info.gfi_low/high	-- per-bg low/high keys computed from
+ * 				   dkeys; used to query the free space.
+ */
+int ext4_getfsmap(struct super_block *sb, struct ext4_fsmap_head *head,
+		  ext4_fsmap_format_t formatter, void *arg)
+{
+	struct ext4_fsmap dkeys[2];	/* per-dev keys */
+	struct ext4_getfsmap_dev handlers[EXT4_GETFSMAP_DEVS];
+	struct ext4_getfsmap_info info = {0};
+	int i;
+	int error = 0;
+
+	if (head->fmh_iflags & ~FMH_IF_VALID)
+		return -EINVAL;
+	if (!ext4_getfsmap_is_valid_device(sb, &head->fmh_keys[0]) ||
+	    !ext4_getfsmap_is_valid_device(sb, &head->fmh_keys[1]))
+		return -EINVAL;
+
+	head->fmh_entries = 0;
+
+	/* Set up our device handlers. */
+	memset(handlers, 0, sizeof(handlers));
+	handlers[0].gfd_dev = new_encode_dev(sb->s_bdev->bd_dev);
+	handlers[0].gfd_fn = ext4_getfsmap_datadev;
+	if (EXT4_SB(sb)->journal_bdev) {
+		handlers[1].gfd_dev = new_encode_dev(
+				EXT4_SB(sb)->journal_bdev->bd_dev);
+		handlers[1].gfd_fn = ext4_getfsmap_logdev;
+	}
+
+	sort(handlers, EXT4_GETFSMAP_DEVS, sizeof(struct ext4_getfsmap_dev),
+			ext4_getfsmap_dev_compare, NULL);
+
+	/*
+	 * To continue where we left off, we allow userspace to use the
+	 * last mapping from a previous call as the low key of the next.
+	 * This is identified by a non-zero length in the low key. We
+	 * have to increment the low key in this scenario to ensure we
+	 * don't return the same mapping again, and instead return the
+	 * very next mapping.
+	 *
+	 * Bump the physical offset as there can be no other mapping for
+	 * the same physical block range.
+	 */
+	dkeys[0] = head->fmh_keys[0];
+	dkeys[0].fmr_physical += dkeys[0].fmr_length;
+	dkeys[0].fmr_owner = 0;
+	dkeys[0].fmr_length = 0;
+	memset(&dkeys[1], 0xFF, sizeof(struct ext4_fsmap));
+
+	if (!ext4_getfsmap_check_keys(dkeys, &head->fmh_keys[1]))
+		return -EINVAL;
+
+	info.gfi_next_fsblk = head->fmh_keys[0].fmr_physical +
+			  head->fmh_keys[0].fmr_length;
+	info.gfi_formatter = formatter;
+	info.gfi_format_arg = arg;
+	info.gfi_head = head;
+
+	/* For each device we support... */
+	for (i = 0; i < EXT4_GETFSMAP_DEVS; i++) {
+		/* Is this device within the range the user asked for? */
+		if (!handlers[i].gfd_fn)
+			continue;
+		if (head->fmh_keys[0].fmr_device > handlers[i].gfd_dev)
+			continue;
+		if (head->fmh_keys[1].fmr_device < handlers[i].gfd_dev)
+			break;
+
+		/*
+		 * If this device number matches the high key, we have
+		 * to pass the high key to the handler to limit the
+		 * query results.  If the device number exceeds the
+		 * low key, zero out the low key so that we get
+		 * everything from the beginning.
+		 */
+		if (handlers[i].gfd_dev == head->fmh_keys[1].fmr_device)
+			dkeys[1] = head->fmh_keys[1];
+		if (handlers[i].gfd_dev > head->fmh_keys[0].fmr_device)
+			memset(&dkeys[0], 0, sizeof(struct ext4_fsmap));
+
+		info.gfi_dev = handlers[i].gfd_dev;
+		info.gfi_last = false;
+		info.gfi_agno = -1;
+		error = handlers[i].gfd_fn(sb, dkeys, &info);
+		if (error)
+			break;
+		info.gfi_next_fsblk = 0;
+	}
+
+	head->fmh_oflags = FMH_OF_DEV_T;
+	return error;
+}
diff --git a/fs/ext4/fsmap.h b/fs/ext4/fsmap.h
new file mode 100644
index 000000000000..9a2cd367cc66
--- /dev/null
+++ b/fs/ext4/fsmap.h
@@ -0,0 +1,69 @@
+/*
+ * Copyright (C) 2017 Oracle.  All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+#ifndef __EXT4_FSMAP_H__
+#define	__EXT4_FSMAP_H__
+
+struct fsmap;
+
+/* internal fsmap representation */
+struct ext4_fsmap {
+	struct list_head	fmr_list;
+	dev_t		fmr_device;	/* device id */
+	uint32_t	fmr_flags;	/* mapping flags */
+	uint64_t	fmr_physical;	/* device offset of segment */
+	uint64_t	fmr_owner;	/* owner id */
+	uint64_t	fmr_length;	/* length of segment, blocks */
+};
+
+struct ext4_fsmap_head {
+	uint32_t	fmh_iflags;	/* control flags */
+	uint32_t	fmh_oflags;	/* output flags */
+	unsigned int	fmh_count;	/* # of entries in array incl. input */
+	unsigned int	fmh_entries;	/* # of entries filled in (output). */
+
+	struct ext4_fsmap fmh_keys[2];	/* low and high keys */
+};
+
+void ext4_fsmap_from_internal(struct super_block *sb, struct fsmap *dest,
+		struct ext4_fsmap *src);
+void ext4_fsmap_to_internal(struct super_block *sb, struct ext4_fsmap *dest,
+		struct fsmap *src);
+
+/* fsmap to userspace formatter - copy to user & advance pointer */
+typedef int (*ext4_fsmap_format_t)(struct ext4_fsmap *, void *);
+
+int ext4_getfsmap(struct super_block *sb, struct ext4_fsmap_head *head,
+		ext4_fsmap_format_t formatter, void *arg);
+
+#define EXT4_QUERY_RANGE_ABORT		1
+#define EXT4_QUERY_RANGE_CONTINUE	0
+
+/*	fmr_owner special values for FS_IOC_GETFSMAP; some share w/ XFS */
+#define EXT4_FMR_OWN_FREE	FMR_OWN_FREE      /* free space */
+#define EXT4_FMR_OWN_UNKNOWN	FMR_OWN_UNKNOWN   /* unknown owner */
+#define EXT4_FMR_OWN_FS		FMR_OWNER('X', 1) /* static fs metadata */
+#define EXT4_FMR_OWN_LOG	FMR_OWNER('X', 2) /* journalling log */
+#define EXT4_FMR_OWN_INODES	FMR_OWNER('X', 5) /* inodes */
+#define EXT4_FMR_OWN_GDT	FMR_OWNER('f', 1) /* group descriptors */
+#define EXT4_FMR_OWN_RESV_GDT	FMR_OWNER('f', 2) /* reserved gdt blocks */
+#define EXT4_FMR_OWN_BLKBM	FMR_OWNER('f', 3) /* inode bitmap */
+#define EXT4_FMR_OWN_INOBM	FMR_OWNER('f', 4) /* block bitmap */
+
+#endif /* __EXT4_FSMAP_H__ */
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index a4273ddb9922..dfcb815275f1 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -19,6 +19,9 @@
 #include <linux/delay.h>
 #include "ext4_jbd2.h"
 #include "ext4.h"
+#include <linux/fsmap.h>
+#include "fsmap.h"
+#include <trace/events/ext4.h>
 
 /**
  * Swap memory between @a and @b for @len bytes.
@@ -489,6 +492,90 @@ int ext4_shutdown(struct super_block *sb, unsigned long arg)
 	return 0;
 }
 
+struct getfsmap_info {
+	struct super_block	*gi_sb;
+	struct fsmap_head __user *gi_data;
+	unsigned int		gi_idx;
+	__u32			gi_last_flags;
+};
+
+static int ext4_getfsmap_format(struct ext4_fsmap *xfm, void *priv)
+{
+	struct getfsmap_info *info = priv;
+	struct fsmap fm;
+
+	trace_ext4_getfsmap_mapping(info->gi_sb, xfm);
+
+	info->gi_last_flags = xfm->fmr_flags;
+	ext4_fsmap_from_internal(info->gi_sb, &fm, xfm);
+	if (copy_to_user(&info->gi_data->fmh_recs[info->gi_idx++], &fm,
+			sizeof(struct fsmap)))
+		return -EFAULT;
+
+	return 0;
+}
+
+static int ext4_ioc_getfsmap(struct super_block *sb,
+			     struct fsmap_head __user *arg)
+{
+	struct getfsmap_info info = {0};
+	struct ext4_fsmap_head xhead = {0};
+	struct fsmap_head head;
+	bool aborted = false;
+	int error;
+
+	if (copy_from_user(&head, arg, sizeof(struct fsmap_head)))
+		return -EFAULT;
+	if (memchr_inv(head.fmh_reserved, 0, sizeof(head.fmh_reserved)) ||
+	    memchr_inv(head.fmh_keys[0].fmr_reserved, 0,
+		       sizeof(head.fmh_keys[0].fmr_reserved)) ||
+	    memchr_inv(head.fmh_keys[1].fmr_reserved, 0,
+		       sizeof(head.fmh_keys[1].fmr_reserved)))
+		return -EINVAL;
+	/*
+	 * ext4 doesn't report file extents at all, so the only valid
+	 * file offsets are the magic ones (all zeroes or all ones).
+	 */
+	if (head.fmh_keys[0].fmr_offset ||
+	    (head.fmh_keys[1].fmr_offset != 0 &&
+	     head.fmh_keys[1].fmr_offset != -1ULL))
+		return -EINVAL;
+
+	xhead.fmh_iflags = head.fmh_iflags;
+	xhead.fmh_count = head.fmh_count;
+	ext4_fsmap_to_internal(sb, &xhead.fmh_keys[0], &head.fmh_keys[0]);
+	ext4_fsmap_to_internal(sb, &xhead.fmh_keys[1], &head.fmh_keys[1]);
+
+	trace_ext4_getfsmap_low_key(sb, &xhead.fmh_keys[0]);
+	trace_ext4_getfsmap_high_key(sb, &xhead.fmh_keys[1]);
+
+	info.gi_sb = sb;
+	info.gi_data = arg;
+	error = ext4_getfsmap(sb, &xhead, ext4_getfsmap_format, &info);
+	if (error == EXT4_QUERY_RANGE_ABORT) {
+		error = 0;
+		aborted = true;
+	} else if (error)
+		return error;
+
+	/* If we didn't abort, set the "last" flag in the last fmx */
+	if (!aborted && info.gi_idx) {
+		info.gi_last_flags |= FMR_OF_LAST;
+		if (copy_to_user(&info.gi_data->fmh_recs[info.gi_idx - 1].fmr_flags,
+				 &info.gi_last_flags,
+				 sizeof(info.gi_last_flags)))
+			return -EFAULT;
+	}
+
+	/* copy back header */
+	head.fmh_entries = xhead.fmh_entries;
+	head.fmh_oflags = xhead.fmh_oflags;
+	if (copy_to_user(arg, &head, sizeof(struct fsmap_head)))
+		return -EFAULT;
+
+	return 0;
+}
+
 long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 {
 	struct inode *inode = file_inode(filp);
@@ -499,6 +586,8 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 	ext4_debug("cmd = %u, arg = %lu\n", cmd, arg);
 
 	switch (cmd) {
+	case FS_IOC_GETFSMAP:
+		return ext4_ioc_getfsmap(sb, (void __user *)arg);
 	case EXT4_IOC_GETFLAGS:
 		ext4_get_inode_flags(ei);
 		flags = ei->i_flags & EXT4_FL_USER_VISIBLE;
@@ -1009,6 +1098,7 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 	case EXT4_IOC_GET_ENCRYPTION_PWSALT:
 	case EXT4_IOC_GET_ENCRYPTION_POLICY:
 	case EXT4_IOC_SHUTDOWN:
+	case FS_IOC_GETFSMAP:
 		break;
 	default:
 		return -ENOIOCTLCMD;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index dbe51301c2bb..36de58a37653 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -5277,3 +5277,52 @@ out:
 	range->len = EXT4_C2B(EXT4_SB(sb), trimmed) << sb->s_blocksize_bits;
 	return ret;
 }
+
+/* Iterate all the free extents in the group. */
+int
+ext4_mballoc_query_range(
+	struct super_block		*sb,
+	ext4_group_t			group,
+	ext4_grpblk_t			start,
+	ext4_grpblk_t			end,
+	ext4_mballoc_query_range_fn	formatter,
+	void				*priv)
+{
+	void				*bitmap;
+	ext4_grpblk_t			next;
+	struct ext4_buddy		e4b;
+	int				error;
+
+	error = ext4_mb_load_buddy(sb, group, &e4b);
+	if (error)
+		return error;
+	bitmap = e4b.bd_bitmap;
+
+	ext4_lock_group(sb, group);
+
+	start = (e4b.bd_info->bb_first_free > start) ?
+		e4b.bd_info->bb_first_free : start;
+	if (end >= EXT4_CLUSTERS_PER_GROUP(sb))
+		end = EXT4_CLUSTERS_PER_GROUP(sb) - 1;
+
+	while (start <= end) {
+		start = mb_find_next_zero_bit(bitmap, end + 1, start);
+		if (start > end)
+			break;
+		next = mb_find_next_bit(bitmap, end + 1, start);
+
+		ext4_unlock_group(sb, group);
+		error = formatter(sb, group, start, next - start, priv);
+		if (error)
+			goto out_unload;
+		ext4_lock_group(sb, group);
+
+		start = next + 1;
+	}
+
+	ext4_unlock_group(sb, group);
+out_unload:
+	ext4_mb_unload_buddy(&e4b);
+
+	return error;
+}
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index 1aba469f8220..2bed62084a8c 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -199,4 +199,21 @@ static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb,
 	return ext4_group_first_block_no(sb, fex->fe_group) +
 		(fex->fe_start << EXT4_SB(sb)->s_cluster_bits);
 }
+
+typedef int (*ext4_mballoc_query_range_fn)(
+	struct super_block		*sb,
+	ext4_group_t			agno,
+	ext4_grpblk_t			start,
+	ext4_grpblk_t			len,
+	void				*priv);
+
+int
+ext4_mballoc_query_range(
+	struct super_block		*sb,
+	ext4_group_t			agno,
+	ext4_grpblk_t			start,
+	ext4_grpblk_t			end,
+	ext4_mballoc_query_range_fn	formatter,
+	void				*priv);
+
 #endif
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 7b82487401d2..9ec8963ba604 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -49,6 +49,7 @@
 #include "xattr.h"
 #include "acl.h"
 #include "mballoc.h"
+#include "fsmap.h"
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/ext4.h>
diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h
index 09c71e9aaebf..dfae175ddebc 100644
--- a/include/trace/events/ext4.h
+++ b/include/trace/events/ext4.h
@@ -15,6 +15,7 @@ struct ext4_inode_info;
 struct mpage_da_data;
 struct ext4_map_blocks;
 struct extent_status;
+struct ext4_fsmap;
 
 #define EXT4_I(inode) (container_of(inode, struct ext4_inode_info, vfs_inode))
 
@@ -2529,6 +2530,79 @@ TRACE_EVENT(ext4_es_shrink,
 		  __entry->scan_time, __entry->nr_skipped, __entry->retried)
 );
 
+/* fsmap traces */
+DECLARE_EVENT_CLASS(ext4_fsmap_class,
+	TP_PROTO(struct super_block *sb, u32 keydev, u32 agno, u64 bno, u64 len,
+		 u64 owner),
+	TP_ARGS(sb, keydev, agno, bno, len, owner),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(dev_t, keydev)
+		__field(u32, agno)
+		__field(u64, bno)
+		__field(u64, len)
+		__field(u64, owner)
+	),
+	TP_fast_assign(
+		__entry->dev = sb->s_bdev->bd_dev;
+		__entry->keydev = new_decode_dev(keydev);
+		__entry->agno = agno;
+		__entry->bno = bno;
+		__entry->len = len;
+		__entry->owner = owner;
+	),
+	TP_printk("dev %d:%d keydev %d:%d agno %u bno %llu len %llu owner %lld\n",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  MAJOR(__entry->keydev), MINOR(__entry->keydev),
+		  __entry->agno,
+		  __entry->bno,
+		  __entry->len,
+		  __entry->owner)
+)
+#define DEFINE_FSMAP_EVENT(name) \
+DEFINE_EVENT(ext4_fsmap_class, name, \
+	TP_PROTO(struct super_block *sb, u32 keydev, u32 agno, u64 bno, u64 len, \
+		 u64 owner), \
+	TP_ARGS(sb, keydev, agno, bno, len, owner))
+DEFINE_FSMAP_EVENT(ext4_fsmap_low_key);
+DEFINE_FSMAP_EVENT(ext4_fsmap_high_key);
+DEFINE_FSMAP_EVENT(ext4_fsmap_mapping);
+
+DECLARE_EVENT_CLASS(ext4_getfsmap_class,
+	TP_PROTO(struct super_block *sb, struct ext4_fsmap *fsmap),
+	TP_ARGS(sb, fsmap),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(dev_t, keydev)
+		__field(u64, block)
+		__field(u64, len)
+		__field(u64, owner)
+		__field(u64, flags)
+	),
+	TP_fast_assign(
+		__entry->dev = sb->s_bdev->bd_dev;
+		__entry->keydev = new_decode_dev(fsmap->fmr_device);
+		__entry->block = fsmap->fmr_physical;
+		__entry->len = fsmap->fmr_length;
+		__entry->owner = fsmap->fmr_owner;
+		__entry->flags = fsmap->fmr_flags;
+	),
+	TP_printk("dev %d:%d keydev %d:%d block %llu len %llu owner %lld flags 0x%llx\n",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  MAJOR(__entry->keydev), MINOR(__entry->keydev),
+		  __entry->block,
+		  __entry->len,
+		  __entry->owner,
+		  __entry->flags)
+)
+#define DEFINE_GETFSMAP_EVENT(name) \
+DEFINE_EVENT(ext4_getfsmap_class, name, \
+	TP_PROTO(struct super_block *sb, struct ext4_fsmap *fsmap), \
+	TP_ARGS(sb, fsmap))
+DEFINE_GETFSMAP_EVENT(ext4_getfsmap_low_key);
+DEFINE_GETFSMAP_EVENT(ext4_getfsmap_high_key);
+DEFINE_GETFSMAP_EVENT(ext4_getfsmap_mapping);
+
 #endif /* _TRACE_EXT4_H */
 
 /* This part must be outside protection */
-- 
cgit v1.2.3


From 1a20a63084be8fc5b0c1191231dfd311f7fc2afa Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Sun, 30 Apr 2017 00:40:44 -0400
Subject: ext4: make ext4_shutdown() static

Make the ext4_shutdown() function static, as suggested by running sparse
('make C=2 fs/ext4/').  This was the only such warning in fs/ext4/.

Signed-off-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/ioctl.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index dfcb815275f1..2779d5f291a4 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -446,7 +446,7 @@ static inline unsigned long ext4_xflags_to_iflags(__u32 xflags)
 	return iflags;
 }
 
-int ext4_shutdown(struct super_block *sb, unsigned long arg)
+static int ext4_shutdown(struct super_block *sb, unsigned long arg)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 	__u32 flags;
-- 
cgit v1.2.3


From 85c8f176a6111ecde9c158109989dbd445a0e59a Mon Sep 17 00:00:00 2001
From: Andrew Perepechko <andrew.perepechko@seagate.com>
Date: Sun, 30 Apr 2017 00:46:35 -0400
Subject: ext4: preload block group descriptors

With enabled meta_bg option block group descriptors
reading IO is not sequential and requires optimization.

Signed-off-by: Andrew Perepechko <andrew.perepechko@seagate.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/super.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'fs')

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 9ec8963ba604..9a40e0c16dcc 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -3879,6 +3879,12 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 
 	bgl_lock_init(sbi->s_blockgroup_lock);
 
+	/* Pre-read the descriptors into the buffer cache */
+	for (i = 0; i < db_count; i++) {
+		block = descriptor_loc(sb, logical_sb_block, i);
+		sb_breadahead(sb, block);
+	}
+
 	for (i = 0; i < db_count; i++) {
 		block = descriptor_loc(sb, logical_sb_block, i);
 		sbi->s_group_desc[i] = sb_bread_unmovable(sb, block);
-- 
cgit v1.2.3


From dddbd6ac8f25b2d61e3e1c7134e3b3e1b9cd8670 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Sun, 30 Apr 2017 18:29:10 -0400
Subject: ext4: avoid unnecessary transaction stalls during writeback

Currently ext4_writepages() submits all pages with transaction started.
When no page needs block allocation or extent conversion we can submit
all dirty pages in the inode while holding a single transaction handle
and when device is congested this can take significant amount of time.
Thus ext4_writepages() can block transaction commits for extended
periods of time.

Take for example a simple benchmark simulating PostgreSQL database
(pgioperf in mmtest). The benchmark runs 16 processes doing random reads
from a huge file, one process doing random writes to the huge file, and
one process doing sequential writes to a small files and frequently
running fsync. With unpatched kernel transaction commits take on average
~18s with standard deviation of ~41s, top 5 commit times are:

274.466639s, 126.467347s, 86.992429s, 34.351563s, 31.517653s.

After this patch transaction commits take on average 0.1s with standard
deviation of 0.15s, top 5 commit times are:

0.563792s, 0.519980s, 0.509841s, 0.471700s, 0.469899s

[ Modified so we use an explicit do_map flag instead of relying on
  io_end not being allocated, the since io_end->inode is needed for I/O
  error handling. -- tytso ]

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/inode.c | 35 +++++++++++++++++++++++++++++++++++
 1 file changed, 35 insertions(+)

(limited to 'fs')

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 88203ae5b154..2a4507deb925 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1643,6 +1643,7 @@ struct mpage_da_data {
 	 */
 	struct ext4_map_blocks map;
 	struct ext4_io_submit io_submit;	/* IO submission data */
+	unsigned int do_map:1;
 };
 
 static void mpage_release_unused_pages(struct mpage_da_data *mpd,
@@ -2179,6 +2180,9 @@ static bool mpage_add_bh_to_extent(struct mpage_da_data *mpd, ext4_lblk_t lblk,
 
 	/* First block in the extent? */
 	if (map->m_len == 0) {
+		/* We cannot map unless handle is started... */
+		if (!mpd->do_map)
+			return false;
 		map->m_lblk = lblk;
 		map->m_len = 1;
 		map->m_flags = bh->b_state & BH_FLAGS;
@@ -2231,6 +2235,9 @@ static int mpage_process_page_bufs(struct mpage_da_data *mpd,
 			/* Found extent to map? */
 			if (mpd->map.m_len)
 				return 0;
+			/* Buffer needs mapping and handle is not started? */
+			if (!mpd->do_map)
+				return 0;
 			/* Everything mapped so far and we hit EOF */
 			break;
 		}
@@ -2747,6 +2754,29 @@ retry:
 		tag_pages_for_writeback(mapping, mpd.first_page, mpd.last_page);
 	done = false;
 	blk_start_plug(&plug);
+
+	/*
+	 * First writeback pages that don't need mapping - we can avoid
+	 * starting a transaction unnecessarily and also avoid being blocked
+	 * in the block layer on device congestion while having transaction
+	 * started.
+	 */
+	mpd.do_map = 0;
+	mpd.io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL);
+	if (!mpd.io_submit.io_end) {
+		ret = -ENOMEM;
+		goto unplug;
+	}
+	ret = mpage_prepare_extent_to_map(&mpd);
+	/* Submit prepared bio */
+	ext4_io_submit(&mpd.io_submit);
+	ext4_put_io_end_defer(mpd.io_submit.io_end);
+	mpd.io_submit.io_end = NULL;
+	/* Unlock pages we didn't use */
+	mpage_release_unused_pages(&mpd, false);
+	if (ret < 0)
+		goto unplug;
+
 	while (!done && mpd.first_page <= mpd.last_page) {
 		/* For each extent of pages we use new io_end */
 		mpd.io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL);
@@ -2775,8 +2805,10 @@ retry:
 				wbc->nr_to_write, inode->i_ino, ret);
 			/* Release allocated io_end */
 			ext4_put_io_end(mpd.io_submit.io_end);
+			mpd.io_submit.io_end = NULL;
 			break;
 		}
+		mpd.do_map = 1;
 
 		trace_ext4_da_write_pages(inode, mpd.first_page, mpd.wbc);
 		ret = mpage_prepare_extent_to_map(&mpd);
@@ -2807,6 +2839,7 @@ retry:
 		if (!ext4_handle_valid(handle) || handle->h_sync == 0) {
 			ext4_journal_stop(handle);
 			handle = NULL;
+			mpd.do_map = 0;
 		}
 		/* Submit prepared bio */
 		ext4_io_submit(&mpd.io_submit);
@@ -2824,6 +2857,7 @@ retry:
 			ext4_journal_stop(handle);
 		} else
 			ext4_put_io_end(mpd.io_submit.io_end);
+		mpd.io_submit.io_end = NULL;
 
 		if (ret == -ENOSPC && sbi->s_journal) {
 			/*
@@ -2839,6 +2873,7 @@ retry:
 		if (ret)
 			break;
 	}
+unplug:
 	blk_finish_plug(&plug);
 	if (!ret && !cycled && wbc->nr_to_write > 0) {
 		cycled = 1;
-- 
cgit v1.2.3


From 72d622b42258a0ed0b6e8c0f40d7628de935d058 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Sun, 30 Apr 2017 20:08:05 -0400
Subject: ext4: replace BUG_ON with WARN_ONCE in ext4_end_bio()

Add fallback code and a WARN_ONCE() call instead of a BUG_ON() in
the ext4_end_bio() function.

Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/page-io.c | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 208241b06662..1a82138ba739 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -297,8 +297,17 @@ static void ext4_end_bio(struct bio *bio)
 {
 	ext4_io_end_t *io_end = bio->bi_private;
 	sector_t bi_sector = bio->bi_iter.bi_sector;
+	char b[BDEVNAME_SIZE];
 
-	BUG_ON(!io_end);
+	if (WARN_ONCE(!io_end, "io_end is NULL: %s: sector %Lu len %u err %d\n",
+		      bdevname(bio->bi_bdev, b),
+		      (long long) bio->bi_iter.bi_sector,
+		      (unsigned) bio_sectors(bio),
+		      bio->bi_error)) {
+		ext4_finish_bio(bio);
+		bio_put(bio);
+		return;
+	}
 	bio->bi_end_io = NULL;
 
 	if (bio->bi_error) {
-- 
cgit v1.2.3


From aa1dca3bd96bfd0ddf6871fc1844bd12ccce50f1 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Tue, 2 May 2017 00:49:54 -0400
Subject: ext4: inherit encryption xattr before other xattrs

When using both encryption and SELinux (or another feature that requires
an xattr per file) on a filesystem with 256-byte inodes, each file's
xattrs usually spill into an external xattr block.  Currently, the
xattrs are inherited in the order ACL, security, then encryption.
Therefore, if spillage occurs, the encryption xattr will always end up
in the external block.  This is not ideal because the encryption xattrs
contain a nonce, so they will always be unique and will prevent the
external xattr blocks from being deduplicated.

To improve the situation, change the inheritance order to encryption,
ACL, then security.  This gives the encryption xattr a better chance to
be stored in-inode, allowing the other xattr(s) to be deduplicated.

Note that it may be better for userspace to format the filesystem with
512-byte inodes in this case.  However, it's not the default.

Signed-off-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/ialloc.c | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 17bc043308f3..98ac2f1f23b3 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -1098,6 +1098,17 @@ got:
 	if (err)
 		goto fail_drop;
 
+	/*
+	 * Since the encryption xattr will always be unique, create it first so
+	 * that it's less likely to end up in an external xattr block and
+	 * prevent its deduplication.
+	 */
+	if (encrypt) {
+		err = fscrypt_inherit_context(dir, inode, handle, true);
+		if (err)
+			goto fail_free_drop;
+	}
+
 	err = ext4_init_acl(handle, inode, dir);
 	if (err)
 		goto fail_free_drop;
@@ -1119,12 +1130,6 @@ got:
 		ei->i_datasync_tid = handle->h_transaction->t_tid;
 	}
 
-	if (encrypt) {
-		err = fscrypt_inherit_context(dir, inode, handle, true);
-		if (err)
-			goto fail_free_drop;
-	}
-
 	err = ext4_mark_inode_dirty(handle, inode);
 	if (err) {
 		ext4_std_error(sb, err);
-- 
cgit v1.2.3


From 00473374b72a6e5aba01af083f0d5100f3151210 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Thu, 4 May 2017 10:58:03 -0400
Subject: ext4: mark superblock writes synchronous for nobarrier mounts

Commit b685d3d65ac7 "block: treat REQ_FUA and REQ_PREFLUSH as
synchronous" removed REQ_SYNC flag from WRITE_FUA implementation.
generic_make_request_checks() however strips REQ_FUA flag from a bio
when the storage doesn't report volatile write cache and thus write
effectively becomes asynchronous which can lead to performance
regressions. This affects superblock writes for ext4. Fix the problem
by marking superblock writes always as synchronous.

Fixes: b685d3d65ac791406e0dfd8779cc9b3707fea5a3
CC: linux-ext4@vger.kernel.org
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/super.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 9a40e0c16dcc..931053cf20d6 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -4637,7 +4637,7 @@ static int ext4_commit_super(struct super_block *sb, int sync)
 	if (sync) {
 		unlock_buffer(sbh);
 		error = __sync_dirty_buffer(sbh,
-			test_opt(sb, BARRIER) ? REQ_FUA : REQ_SYNC);
+			REQ_SYNC | (test_opt(sb, BARRIER) ? REQ_FUA : 0));
 		if (error)
 			return error;
 
-- 
cgit v1.2.3


From 17f423b5160767a8ec43b0602767e5f4d3ecd083 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Thu, 4 May 2017 11:01:31 -0400
Subject: jbd2: cleanup write flags handling from jbd2_write_superblock()

Currently jbd2_write_superblock() silently adds REQ_SYNC to flags with
which journal superblock is written. Make this explicit by making flags
passed down to jbd2_write_superblock() contain REQ_SYNC.

CC: linux-ext4@vger.kernel.org
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/jbd2/journal.c | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index f1906fa54321..1ce13479b10d 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -926,7 +926,8 @@ int __jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block)
 	 * space and if we lose sb update during power failure we'd replay
 	 * old transaction with possibly newly overwritten data.
 	 */
-	ret = jbd2_journal_update_sb_log_tail(journal, tid, block, REQ_FUA);
+	ret = jbd2_journal_update_sb_log_tail(journal, tid, block,
+					      REQ_SYNC | REQ_FUA);
 	if (ret)
 		goto out;
 
@@ -1327,7 +1328,7 @@ static int journal_reset(journal_t *journal)
 		jbd2_journal_update_sb_log_tail(journal,
 						journal->j_tail_sequence,
 						journal->j_tail,
-						REQ_FUA);
+						REQ_SYNC | REQ_FUA);
 		mutex_unlock(&journal->j_checkpoint_mutex);
 	}
 	return jbd2_journal_start_thread(journal);
@@ -1361,7 +1362,7 @@ static int jbd2_write_superblock(journal_t *journal, int write_flags)
 	jbd2_superblock_csum_set(journal, sb);
 	get_bh(bh);
 	bh->b_end_io = end_buffer_write_sync;
-	ret = submit_bh(REQ_OP_WRITE, write_flags | REQ_SYNC, bh);
+	ret = submit_bh(REQ_OP_WRITE, write_flags, bh);
 	wait_on_buffer(bh);
 	if (buffer_write_io_error(bh)) {
 		clear_buffer_write_io_error(bh);
@@ -1467,7 +1468,7 @@ void jbd2_journal_update_sb_errno(journal_t *journal)
 	sb->s_errno    = cpu_to_be32(journal->j_errno);
 	read_unlock(&journal->j_state_lock);
 
-	jbd2_write_superblock(journal, REQ_FUA);
+	jbd2_write_superblock(journal, REQ_SYNC | REQ_FUA);
 }
 EXPORT_SYMBOL(jbd2_journal_update_sb_errno);
 
@@ -1734,7 +1735,7 @@ int jbd2_journal_destroy(journal_t *journal)
 			write_unlock(&journal->j_state_lock);
 
 			jbd2_mark_journal_empty(journal,
-					REQ_PREFLUSH | REQ_FUA);
+					REQ_SYNC | REQ_PREFLUSH | REQ_FUA);
 			mutex_unlock(&journal->j_checkpoint_mutex);
 		} else
 			err = -EIO;
@@ -1993,7 +1994,7 @@ int jbd2_journal_flush(journal_t *journal)
 	 * the magic code for a fully-recovered superblock.  Any future
 	 * commits of data to the journal will restore the current
 	 * s_start value. */
-	jbd2_mark_journal_empty(journal, REQ_FUA);
+	jbd2_mark_journal_empty(journal, REQ_SYNC | REQ_FUA);
 	mutex_unlock(&journal->j_checkpoint_mutex);
 	write_lock(&journal->j_state_lock);
 	J_ASSERT(!journal->j_running_transaction);
@@ -2039,7 +2040,7 @@ int jbd2_journal_wipe(journal_t *journal, int write)
 	if (write) {
 		/* Lock to make assertions happy... */
 		mutex_lock(&journal->j_checkpoint_mutex);
-		jbd2_mark_journal_empty(journal, REQ_FUA);
+		jbd2_mark_journal_empty(journal, REQ_SYNC | REQ_FUA);
 		mutex_unlock(&journal->j_checkpoint_mutex);
 	}
 
-- 
cgit v1.2.3