From ba06208a1315ab2d2217e09c79582b886c9f629e Mon Sep 17 00:00:00 2001
From: Allison Henderson <achender@linux.vnet.ibm.com>
Date: Sat, 3 Sep 2011 11:55:59 -0400
Subject: ext4: fix xfstests 75, 112, 127 punch hole failure

This patch addresses a bug found by xfstests 75, 112, 127
when blocksize = 1k

This bug happens because the punch hole code only zeros
out non block aligned regions of the page.  This means that if the
blocks are smaller than a page, then the block aligned regions of
the page inside the hole are left un-zeroed, and their buffer heads
are still mapped.  This bug is corrected by using
ext4_discard_partial_page_buffers to properly zero the partial page
at the head and tail of the hole, and unmap the corresponding buffer
heads

This patch also addresses a bug reported by Lukas while working on a
new patch to add discard support for loop devices using punch hole.
The bug happened because of the first and last block number
needed to be cast to a larger data type before calculating the
byte offset, but since now we only need the byte offsets of the
pages, we no longer even need to be calculating the byte offsets
of the blocks.  The code to do the block offset calculations is
removed in this patch.

Signed-off-by: Allison Henderson <achender@linux.vnet.ibm.com>
---
 fs/ext4/extents.c | 61 +++++++++++++++++++++++++++++++++++--------------------
 1 file changed, 39 insertions(+), 22 deletions(-)

(limited to 'fs/ext4/extents.c')

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 57cf568a98ab..18f7e04a4fa3 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -4162,17 +4162,14 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
 	struct address_space *mapping = inode->i_mapping;
 	struct ext4_map_blocks map;
 	handle_t *handle;
-	loff_t first_block_offset, last_block_offset, block_len;
-	loff_t first_page, last_page, first_page_offset, last_page_offset;
+	loff_t first_page, last_page, page_len;
+	loff_t first_page_offset, last_page_offset;
 	int ret, credits, blocks_released, err = 0;
 
 	first_block = (offset + sb->s_blocksize - 1) >>
 		EXT4_BLOCK_SIZE_BITS(sb);
 	last_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb);
 
-	first_block_offset = first_block << EXT4_BLOCK_SIZE_BITS(sb);
-	last_block_offset = last_block << EXT4_BLOCK_SIZE_BITS(sb);
-
 	first_page = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
 	last_page = (offset + length) >> PAGE_CACHE_SHIFT;
 
@@ -4211,24 +4208,44 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
 		goto out;
 
 	/*
-	 * Now we need to zero out the un block aligned data.
-	 * If the file is smaller than a block, just
-	 * zero out the middle
+	 * Now we need to zero out the non-page-aligned data in the
+	 * pages at the start and tail of the hole, and unmap the buffer
+	 * heads for the block aligned regions of the page that were
+	 * completely zeroed.
 	 */
-	if (first_block > last_block)
-		ext4_block_zero_page_range(handle, mapping, offset, length);
-	else {
-		/* zero out the head of the hole before the first block */
-		block_len  = first_block_offset - offset;
-		if (block_len > 0)
-			ext4_block_zero_page_range(handle, mapping,
-						   offset, block_len);
-
-		/* zero out the tail of the hole after the last block */
-		block_len = offset + length - last_block_offset;
-		if (block_len > 0) {
-			ext4_block_zero_page_range(handle, mapping,
-					last_block_offset, block_len);
+	if (first_page > last_page) {
+		/*
+		 * If the file space being truncated is contained within a page
+		 * just zero out and unmap the middle of that page
+		 */
+		err = ext4_discard_partial_page_buffers(handle,
+			mapping, offset, length, 0);
+
+		if (err)
+			goto out;
+	} else {
+		/*
+		 * zero out and unmap the partial page that contains
+		 * the start of the hole
+		 */
+		page_len  = first_page_offset - offset;
+		if (page_len > 0) {
+			err = ext4_discard_partial_page_buffers(handle, mapping,
+						   offset, page_len, 0);
+			if (err)
+				goto out;
+		}
+
+		/*
+		 * zero out and unmap the partial page that contains
+		 * the end of the hole
+		 */
+		page_len = offset + length - last_page_offset;
+		if (page_len > 0) {
+			err = ext4_discard_partial_page_buffers(handle, mapping,
+					last_page_offset, page_len, 0);
+			if (err)
+				goto out;
 		}
 	}
 
-- 
cgit v1.2.3


From 2be4751b21ae1cacb002da48cfc5bf6743fee8c1 Mon Sep 17 00:00:00 2001
From: Allison Henderson <achender@linux.vnet.ibm.com>
Date: Sat, 3 Sep 2011 11:56:52 -0400
Subject: ext4: fix 2nd xfstests 127 punch hole failure

This patch fixes a second punch hole bug found by xfstests 127.

This bug happens because punch hole needs to flush the pages
of the hole to avoid race conditions.  But if the end of the
hole is in the same page as i_size, the buffer heads beyond
i_size need to be unmapped and the page needs to be zeroed
after it is flushed.

To correct this, the new ext4_discard_partial_page_buffers
routine is used to zero and unmap the partial page
beyond i_size if the end of the hole appears in the same
page as i_size.

The code has also been optimized to set the end of the hole
to the page after i_size if the specified hole exceeds i_size,
and the code that flushes the pages has been simplified.

Signed-off-by: Allison Henderson <achender@linux.vnet.ibm.com>
---
 fs/ext4/extents.c | 41 +++++++++++++++++++++++++++++++++++++----
 1 file changed, 37 insertions(+), 4 deletions(-)

(limited to 'fs/ext4/extents.c')

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 18f7e04a4fa3..9124cd24e093 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -4166,6 +4166,20 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
 	loff_t first_page_offset, last_page_offset;
 	int ret, credits, blocks_released, err = 0;
 
+	/* No need to punch hole beyond i_size */
+	if (offset >= inode->i_size)
+		return 0;
+
+	/*
+	 * If the hole extends beyond i_size, set the hole
+	 * to end after the page that contains i_size
+	 */
+	if (offset + length > inode->i_size) {
+		length = inode->i_size +
+		   PAGE_CACHE_SIZE - (inode->i_size & (PAGE_CACHE_SIZE - 1)) -
+		   offset;
+	}
+
 	first_block = (offset + sb->s_blocksize - 1) >>
 		EXT4_BLOCK_SIZE_BITS(sb);
 	last_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb);
@@ -4182,11 +4196,10 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
 	 */
 	if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
 		err = filemap_write_and_wait_range(mapping,
-			first_page_offset == 0 ? 0 : first_page_offset-1,
-			last_page_offset);
+			offset, offset + length - 1);
 
-			if (err)
-				return err;
+		if (err)
+			return err;
 	}
 
 	/* Now release the pages */
@@ -4249,6 +4262,26 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
 		}
 	}
 
+
+	/*
+	 * If i_size is contained in the last page, we need to
+	 * unmap and zero the partial page after i_size
+	 */
+	if (inode->i_size >> PAGE_CACHE_SHIFT == last_page &&
+	   inode->i_size % PAGE_CACHE_SIZE != 0) {
+
+		page_len = PAGE_CACHE_SIZE -
+			(inode->i_size & (PAGE_CACHE_SIZE - 1));
+
+		if (page_len > 0) {
+			err = ext4_discard_partial_page_buffers(handle,
+			  mapping, inode->i_size, page_len, 0);
+
+			if (err)
+				goto out;
+		}
+	}
+
 	/* If there are no blocks to remove, return now */
 	if (first_block >= last_block)
 		goto out;
-- 
cgit v1.2.3


From 9ea7a0df63630ad8197716cd313ea66e28906fc0 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Sun, 4 Sep 2011 10:18:14 -0400
Subject: jbd2: add debugging information to jbd2_journal_dirty_metadata()

Add debugging information in case jbd2_journal_dirty_metadata() is
called with a buffer_head which didn't have
jbd2_journal_get_write_access() called on it, or if the journal_head
has the wrong transaction in it.  In addition, return an error code.
This won't change anything for ocfs2, which will BUG_ON() the non-zero
exit code.

For ext4, the caller of this function is ext4_handle_dirty_metadata(),
and on seeing a non-zero return code, will call __ext4_journal_stop(),
which will print the function and line number of the (buggy) calling
function and abort the journal.  This will allow us to recover instead
of bug halting, which is better from a robustness and reliability
point of view.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/ext4_jbd2.c   |  8 ++++---
 fs/ext4/extents.c     | 10 ++++++---
 fs/jbd2/transaction.c | 58 +++++++++++++++++++++++++++++++++++++++++++++------
 3 files changed, 64 insertions(+), 12 deletions(-)

(limited to 'fs/ext4/extents.c')

diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index f5240aa15601..aca179017582 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -109,9 +109,11 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line,
 
 	if (ext4_handle_valid(handle)) {
 		err = jbd2_journal_dirty_metadata(handle, bh);
-		if (err)
-			ext4_journal_abort_handle(where, line, __func__,
-						  bh, handle, err);
+		if (err) {
+			/* Errors can only happen if there is a bug */
+			handle->h_err = err;
+			__ext4_journal_stop(where, line, handle);
+		}
 	} else {
 		if (inode)
 			mark_buffer_dirty_inode(bh, inode);
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 9124cd24e093..2c5216a8d03b 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -96,13 +96,17 @@ static int ext4_ext_get_access(handle_t *handle, struct inode *inode,
  *  - ENOMEM
  *  - EIO
  */
-static int ext4_ext_dirty(handle_t *handle, struct inode *inode,
-				struct ext4_ext_path *path)
+#define ext4_ext_dirty(handle, inode, path) \
+		__ext4_ext_dirty(__func__, __LINE__, (handle), (inode), (path))
+static int __ext4_ext_dirty(const char *where, unsigned int line,
+			    handle_t *handle, struct inode *inode,
+			    struct ext4_ext_path *path)
 {
 	int err;
 	if (path->p_bh) {
 		/* path points to block */
-		err = ext4_handle_dirty_metadata(handle, inode, path->p_bh);
+		err = __ext4_handle_dirty_metadata(where, line, handle,
+						   inode, path->p_bh);
 	} else {
 		/* path points to leaf/index in inode body */
 		err = ext4_mark_inode_dirty(handle, inode);
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 2d7109414cdd..cb56fe9aaabb 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -1049,6 +1049,10 @@ void jbd2_buffer_abort_trigger(struct journal_head *jh,
  * mark dirty metadata which needs to be journaled as part of the current
  * transaction.
  *
+ * The buffer must have previously had jbd2_journal_get_write_access()
+ * called so that it has a valid journal_head attached to the buffer
+ * head.
+ *
  * The buffer is placed on the transaction's metadata list and is marked
  * as belonging to the transaction.
  *
@@ -1065,11 +1069,16 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
 	transaction_t *transaction = handle->h_transaction;
 	journal_t *journal = transaction->t_journal;
 	struct journal_head *jh = bh2jh(bh);
+	int ret = 0;
 
 	jbd_debug(5, "journal_head %p\n", jh);
 	JBUFFER_TRACE(jh, "entry");
 	if (is_handle_aborted(handle))
 		goto out;
+	if (!buffer_jbd(bh)) {
+		ret = -EUCLEAN;
+		goto out;
+	}
 
 	jbd_lock_bh_state(bh);
 
@@ -1093,8 +1102,20 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
 	 */
 	if (jh->b_transaction == transaction && jh->b_jlist == BJ_Metadata) {
 		JBUFFER_TRACE(jh, "fastpath");
-		J_ASSERT_JH(jh, jh->b_transaction ==
-					journal->j_running_transaction);
+		if (unlikely(jh->b_transaction !=
+			     journal->j_running_transaction)) {
+			printk(KERN_EMERG "JBD: %s: "
+			       "jh->b_transaction (%llu, %p, %u) != "
+			       "journal->j_running_transaction (%p, %u)",
+			       journal->j_devname,
+			       (unsigned long long) bh->b_blocknr,
+			       jh->b_transaction,
+			       jh->b_transaction ? jh->b_transaction->t_tid : 0,
+			       journal->j_running_transaction,
+			       journal->j_running_transaction ?
+			       journal->j_running_transaction->t_tid : 0);
+			ret = -EINVAL;
+		}
 		goto out_unlock_bh;
 	}
 
@@ -1108,9 +1129,32 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
 	 */
 	if (jh->b_transaction != transaction) {
 		JBUFFER_TRACE(jh, "already on other transaction");
-		J_ASSERT_JH(jh, jh->b_transaction ==
-					journal->j_committing_transaction);
-		J_ASSERT_JH(jh, jh->b_next_transaction == transaction);
+		if (unlikely(jh->b_transaction !=
+			     journal->j_committing_transaction)) {
+			printk(KERN_EMERG "JBD: %s: "
+			       "jh->b_transaction (%llu, %p, %u) != "
+			       "journal->j_committing_transaction (%p, %u)",
+			       journal->j_devname,
+			       (unsigned long long) bh->b_blocknr,
+			       jh->b_transaction,
+			       jh->b_transaction ? jh->b_transaction->t_tid : 0,
+			       journal->j_committing_transaction,
+			       journal->j_committing_transaction ?
+			       journal->j_committing_transaction->t_tid : 0);
+			ret = -EINVAL;
+		}
+		if (unlikely(jh->b_next_transaction != transaction)) {
+			printk(KERN_EMERG "JBD: %s: "
+			       "jh->b_next_transaction (%llu, %p, %u) != "
+			       "transaction (%p, %u)",
+			       journal->j_devname,
+			       (unsigned long long) bh->b_blocknr,
+			       jh->b_next_transaction,
+			       jh->b_next_transaction ?
+			       jh->b_next_transaction->t_tid : 0,
+			       transaction, transaction->t_tid);
+			ret = -EINVAL;
+		}
 		/* And this case is illegal: we can't reuse another
 		 * transaction's data buffer, ever. */
 		goto out_unlock_bh;
@@ -1127,7 +1171,9 @@ out_unlock_bh:
 	jbd_unlock_bh_state(bh);
 out:
 	JBUFFER_TRACE(jh, "exit");
-	return 0;
+	if (ret)
+		__WARN();	/* All errors are bugs, so dump the stack */
+	return ret;
 }
 
 /*
-- 
cgit v1.2.3


From 189e868fa8fdca702eb9db9d8afc46b5cb9144c9 Mon Sep 17 00:00:00 2001
From: Allison Henderson <achender@linux.vnet.ibm.com>
Date: Tue, 6 Sep 2011 21:49:44 -0400
Subject: ext4: fix fsx truncate failure

While running extended fsx tests to verify the first
two patches, a similar bug was also found in the
truncate operation.

This bug happens because the truncate routine only zeros
the unblock aligned portion of the last page.  This means
that the block aligned portions of the page appearing after
i_size are left unzeroed, and the buffer heads still mapped.

This bug is corrected by using ext4_discard_partial_page_buffers
in the truncate routine to zero the partial page and unmap
the buffer headers.

Signed-off-by: Allison Henderson <achender@linux.vnet.ibm.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/extents.c  | 13 +++++++++++--
 fs/ext4/indirect.c | 13 +++++++++++--
 2 files changed, 22 insertions(+), 4 deletions(-)

(limited to 'fs/ext4/extents.c')

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 2c5216a8d03b..ba7bd5a176ce 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -3653,6 +3653,7 @@ void ext4_ext_truncate(struct inode *inode)
 	struct super_block *sb = inode->i_sb;
 	ext4_lblk_t last_block;
 	handle_t *handle;
+	loff_t page_len;
 	int err = 0;
 
 	/*
@@ -3669,8 +3670,16 @@ void ext4_ext_truncate(struct inode *inode)
 	if (IS_ERR(handle))
 		return;
 
-	if (inode->i_size & (sb->s_blocksize - 1))
-		ext4_block_truncate_page(handle, mapping, inode->i_size);
+	if (inode->i_size % PAGE_CACHE_SIZE != 0) {
+		page_len = PAGE_CACHE_SIZE -
+			(inode->i_size & (PAGE_CACHE_SIZE - 1));
+
+		err = ext4_discard_partial_page_buffers(handle,
+			mapping, inode->i_size, page_len, 0);
+
+		if (err)
+			goto out_stop;
+	}
 
 	if (ext4_orphan_add(handle, inode))
 		goto out_stop;
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index 0962642119c0..ea704dff8669 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -1343,7 +1343,9 @@ void ext4_ind_truncate(struct inode *inode)
 	__le32 nr = 0;
 	int n = 0;
 	ext4_lblk_t last_block, max_block;
+	loff_t page_len;
 	unsigned blocksize = inode->i_sb->s_blocksize;
+	int err;
 
 	handle = start_transaction(inode);
 	if (IS_ERR(handle))
@@ -1354,9 +1356,16 @@ void ext4_ind_truncate(struct inode *inode)
 	max_block = (EXT4_SB(inode->i_sb)->s_bitmap_maxbytes + blocksize-1)
 					>> EXT4_BLOCK_SIZE_BITS(inode->i_sb);
 
-	if (inode->i_size & (blocksize - 1))
-		if (ext4_block_truncate_page(handle, mapping, inode->i_size))
+	if (inode->i_size % PAGE_CACHE_SIZE != 0) {
+		page_len = PAGE_CACHE_SIZE -
+			(inode->i_size & (PAGE_CACHE_SIZE - 1));
+
+		err = ext4_discard_partial_page_buffers(handle,
+			mapping, inode->i_size, page_len, 0);
+
+		if (err)
 			goto out_stop;
+	}
 
 	if (last_block != max_block) {
 		n = ext4_block_to_path(inode, last_block, offsets, NULL);
-- 
cgit v1.2.3


From 4d33b1ef10995d7ba6191d67456202c697a92a32 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Fri, 9 Sep 2011 18:52:51 -0400
Subject: ext4: teach ext4_ext_map_blocks() about the bigalloc feature

If we need to allocate a new block in ext4_ext_map_blocks(), the
function needs to see if the cluster has already been allocated.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/extents.c | 181 ++++++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 162 insertions(+), 19 deletions(-)

(limited to 'fs/ext4/extents.c')

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index ba7bd5a176ce..bd42ab29efec 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -1270,7 +1270,8 @@ static int ext4_ext_search_left(struct inode *inode,
  */
 static int ext4_ext_search_right(struct inode *inode,
 				 struct ext4_ext_path *path,
-				 ext4_lblk_t *logical, ext4_fsblk_t *phys)
+				 ext4_lblk_t *logical, ext4_fsblk_t *phys,
+				 struct ext4_extent **ret_ex)
 {
 	struct buffer_head *bh = NULL;
 	struct ext4_extent_header *eh;
@@ -1312,9 +1313,7 @@ static int ext4_ext_search_right(struct inode *inode,
 				return -EIO;
 			}
 		}
-		*logical = le32_to_cpu(ex->ee_block);
-		*phys = ext4_ext_pblock(ex);
-		return 0;
+		goto found_extent;
 	}
 
 	if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) {
@@ -1327,9 +1326,7 @@ static int ext4_ext_search_right(struct inode *inode,
 	if (ex != EXT_LAST_EXTENT(path[depth].p_hdr)) {
 		/* next allocated block in this leaf */
 		ex++;
-		*logical = le32_to_cpu(ex->ee_block);
-		*phys = ext4_ext_pblock(ex);
-		return 0;
+		goto found_extent;
 	}
 
 	/* go up and search for index to the right */
@@ -1372,9 +1369,12 @@ got_index:
 		return -EIO;
 	}
 	ex = EXT_FIRST_EXTENT(eh);
+found_extent:
 	*logical = le32_to_cpu(ex->ee_block);
 	*phys = ext4_ext_pblock(ex);
-	put_bh(bh);
+	*ret_ex = ex;
+	if (bh)
+		put_bh(bh);
 	return 0;
 }
 
@@ -1627,7 +1627,8 @@ static int ext4_ext_try_to_merge(struct inode *inode,
  * such that there will be no overlap, and then returns 1.
  * If there is no overlap found, it returns 0.
  */
-static unsigned int ext4_ext_check_overlap(struct inode *inode,
+static unsigned int ext4_ext_check_overlap(struct ext4_sb_info *sbi,
+					   struct inode *inode,
 					   struct ext4_extent *newext,
 					   struct ext4_ext_path *path)
 {
@@ -1641,6 +1642,7 @@ static unsigned int ext4_ext_check_overlap(struct inode *inode,
 	if (!path[depth].p_ext)
 		goto out;
 	b2 = le32_to_cpu(path[depth].p_ext->ee_block);
+	b2 &= ~(sbi->s_cluster_ratio - 1);
 
 	/*
 	 * get the next allocated block if the extent in the path
@@ -1650,6 +1652,7 @@ static unsigned int ext4_ext_check_overlap(struct inode *inode,
 		b2 = ext4_ext_next_allocated_block(path);
 		if (b2 == EXT_MAX_BLOCKS)
 			goto out;
+		b2 &= ~(sbi->s_cluster_ratio - 1);
 	}
 
 	/* check for wrap through zero on extent logical start block*/
@@ -3293,6 +3296,106 @@ out2:
 	return err ? err : allocated;
 }
 
+/*
+ * get_implied_cluster_alloc - check to see if the requested
+ * allocation (in the map structure) overlaps with a cluster already
+ * allocated in an extent.
+ *	@sbi	The ext4-specific superblock structure
+ *	@map	The requested lblk->pblk mapping
+ *	@ex	The extent structure which might contain an implied
+ *			cluster allocation
+ *
+ * This function is called by ext4_ext_map_blocks() after we failed to
+ * find blocks that were already in the inode's extent tree.  Hence,
+ * we know that the beginning of the requested region cannot overlap
+ * the extent from the inode's extent tree.  There are three cases we
+ * want to catch.  The first is this case:
+ *
+ *		 |--- cluster # N--|
+ *    |--- extent ---|	|---- requested region ---|
+ *			|==========|
+ *
+ * The second case that we need to test for is this one:
+ *
+ *   |--------- cluster # N ----------------|
+ *	   |--- requested region --|   |------- extent ----|
+ *	   |=======================|
+ *
+ * The third case is when the requested region lies between two extents
+ * within the same cluster:
+ *          |------------- cluster # N-------------|
+ * |----- ex -----|                  |---- ex_right ----|
+ *                  |------ requested region ------|
+ *                  |================|
+ *
+ * In each of the above cases, we need to set the map->m_pblk and
+ * map->m_len so it corresponds to the return the extent labelled as
+ * "|====|" from cluster #N, since it is already in use for data in
+ * cluster EXT4_B2C(sbi, map->m_lblk).	We will then return 1 to
+ * signal to ext4_ext_map_blocks() that map->m_pblk should be treated
+ * as a new "allocated" block region.  Otherwise, we will return 0 and
+ * ext4_ext_map_blocks() will then allocate one or more new clusters
+ * by calling ext4_mb_new_blocks().
+ */
+static int get_implied_cluster_alloc(struct ext4_sb_info *sbi,
+				     struct ext4_map_blocks *map,
+				     struct ext4_extent *ex,
+				     struct ext4_ext_path *path)
+{
+	ext4_lblk_t c_offset = map->m_lblk & (sbi->s_cluster_ratio-1);
+	ext4_lblk_t ex_cluster_start, ex_cluster_end;
+	ext4_lblk_t rr_cluster_start, rr_cluster_end;
+	ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block);
+	ext4_fsblk_t ee_start = ext4_ext_pblock(ex);
+	unsigned short ee_len = ext4_ext_get_actual_len(ex);
+
+	/* The extent passed in that we are trying to match */
+	ex_cluster_start = EXT4_B2C(sbi, ee_block);
+	ex_cluster_end = EXT4_B2C(sbi, ee_block + ee_len - 1);
+
+	/* The requested region passed into ext4_map_blocks() */
+	rr_cluster_start = EXT4_B2C(sbi, map->m_lblk);
+	rr_cluster_end = EXT4_B2C(sbi, map->m_lblk + map->m_len - 1);
+
+	if ((rr_cluster_start == ex_cluster_end) ||
+	    (rr_cluster_start == ex_cluster_start)) {
+		if (rr_cluster_start == ex_cluster_end)
+			ee_start += ee_len - 1;
+		map->m_pblk = (ee_start & ~(sbi->s_cluster_ratio - 1)) +
+			c_offset;
+		map->m_len = min(map->m_len,
+				 (unsigned) sbi->s_cluster_ratio - c_offset);
+		/*
+		 * Check for and handle this case:
+		 *
+		 *   |--------- cluster # N-------------|
+		 *		       |------- extent ----|
+		 *	   |--- requested region ---|
+		 *	   |===========|
+		 */
+
+		if (map->m_lblk < ee_block)
+			map->m_len = min(map->m_len, ee_block - map->m_lblk);
+
+		/*
+		 * Check for the case where there is already another allocated
+		 * block to the right of 'ex' but before the end of the cluster.
+		 *
+		 *          |------------- cluster # N-------------|
+		 * |----- ex -----|                  |---- ex_right ----|
+		 *                  |------ requested region ------|
+		 *                  |================|
+		 */
+		if (map->m_lblk > ee_block) {
+			ext4_lblk_t next = ext4_ext_next_allocated_block(path);
+			map->m_len = min(map->m_len, next - map->m_lblk);
+		}
+		return 1;
+	}
+	return 0;
+}
+
+
 /*
  * Block allocation/map/preallocation routine for extents based files
  *
@@ -3315,14 +3418,16 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 			struct ext4_map_blocks *map, int flags)
 {
 	struct ext4_ext_path *path = NULL;
-	struct ext4_extent newex, *ex;
+	struct ext4_extent newex, *ex, *ex2;
+	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 	ext4_fsblk_t newblock = 0;
-	int err = 0, depth, ret;
-	unsigned int allocated = 0;
+	int free_on_err = 0, err = 0, depth, ret;
+	unsigned int allocated = 0, offset = 0;
 	unsigned int punched_out = 0;
 	unsigned int result = 0;
 	struct ext4_allocation_request ar;
 	ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
+	ext4_lblk_t cluster_offset;
 	struct ext4_map_blocks punch_map;
 
 	ext_debug("blocks %u/%u requested for inode %lu\n",
@@ -3508,9 +3613,23 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 		ext4_ext_put_gap_in_cache(inode, path, map->m_lblk);
 		goto out2;
 	}
+
 	/*
 	 * Okay, we need to do block allocation.
 	 */
+	newex.ee_block = cpu_to_le32(map->m_lblk);
+	cluster_offset = map->m_lblk & (sbi->s_cluster_ratio-1);
+
+	/*
+	 * If we are doing bigalloc, check to see if the extent returned
+	 * by ext4_ext_find_extent() implies a cluster we can use.
+	 */
+	if (cluster_offset && ex &&
+	    get_implied_cluster_alloc(sbi, map, ex, path)) {
+		ar.len = allocated = map->m_len;
+		newblock = map->m_pblk;
+		goto got_allocated_blocks;
+	}
 
 	/* find neighbour allocated blocks */
 	ar.lleft = map->m_lblk;
@@ -3518,10 +3637,20 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 	if (err)
 		goto out2;
 	ar.lright = map->m_lblk;
-	err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright);
+	ex2 = NULL;
+	err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright, &ex2);
 	if (err)
 		goto out2;
 
+	/* Check if the extent after searching to the right implies a
+	 * cluster we can use. */
+	if ((sbi->s_cluster_ratio > 1) && ex2 &&
+	    get_implied_cluster_alloc(sbi, map, ex2, path)) {
+		ar.len = allocated = map->m_len;
+		newblock = map->m_pblk;
+		goto got_allocated_blocks;
+	}
+
 	/*
 	 * See if request is beyond maximum number of blocks we can have in
 	 * a single extent. For an initialized extent this limit is
@@ -3536,9 +3665,8 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 		map->m_len = EXT_UNINIT_MAX_LEN;
 
 	/* Check if we can really insert (m_lblk)::(m_lblk + m_len) extent */
-	newex.ee_block = cpu_to_le32(map->m_lblk);
 	newex.ee_len = cpu_to_le16(map->m_len);
-	err = ext4_ext_check_overlap(inode, &newex, path);
+	err = ext4_ext_check_overlap(sbi, inode, &newex, path);
 	if (err)
 		allocated = ext4_ext_get_actual_len(&newex);
 	else
@@ -3548,7 +3676,18 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 	ar.inode = inode;
 	ar.goal = ext4_ext_find_goal(inode, path, map->m_lblk);
 	ar.logical = map->m_lblk;
-	ar.len = allocated;
+	/*
+	 * We calculate the offset from the beginning of the cluster
+	 * for the logical block number, since when we allocate a
+	 * physical cluster, the physical block should start at the
+	 * same offset from the beginning of the cluster.  This is
+	 * needed so that future calls to get_implied_cluster_alloc()
+	 * work correctly.
+	 */
+	offset = map->m_lblk & (sbi->s_cluster_ratio - 1);
+	ar.len = EXT4_NUM_B2C(sbi, offset+allocated);
+	ar.goal -= offset;
+	ar.logical -= offset;
 	if (S_ISREG(inode->i_mode))
 		ar.flags = EXT4_MB_HINT_DATA;
 	else
@@ -3561,9 +3700,14 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 		goto out2;
 	ext_debug("allocate new block: goal %llu, found %llu/%u\n",
 		  ar.goal, newblock, allocated);
+	free_on_err = 1;
+	ar.len = EXT4_C2B(sbi, ar.len) - offset;
+	if (ar.len > allocated)
+		ar.len = allocated;
 
+got_allocated_blocks:
 	/* try to insert new extent into found leaf and return */
-	ext4_ext_store_pblock(&newex, newblock);
+	ext4_ext_store_pblock(&newex, newblock + offset);
 	newex.ee_len = cpu_to_le16(ar.len);
 	/* Mark uninitialized */
 	if (flags & EXT4_GET_BLOCKS_UNINIT_EXT){
@@ -3591,7 +3735,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 	if (!err)
 		err = ext4_ext_insert_extent(handle, inode, path,
 					     &newex, flags);
-	if (err) {
+	if (err && free_on_err) {
 		int fb_flags = flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE ?
 			EXT4_FREE_BLOCKS_NO_QUOT_UPDATE : 0;
 		/* free data blocks we just allocated */
@@ -4115,7 +4259,6 @@ found_delayed_extent:
 		return EXT_BREAK;
 	return EXT_CONTINUE;
 }
-
 /* fiemap flags we can handle specified here */
 #define EXT4_FIEMAP_FLAGS	(FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR)
 
-- 
cgit v1.2.3


From 0aa060000e83ca3d09ddc446a7174fb0820d99bc Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Fri, 9 Sep 2011 18:54:51 -0400
Subject: ext4: teach ext4_ext_truncate() about the bigalloc feature

When we are truncating (as opposed unlinking) a file, we need to worry
about partial truncates of a file, especially in the light of sparse
files.  The changes here make sure that arbitrary truncates of sparse
files works correctly.  Yeah, it's messy.

Note that these functions will need to be revisted when the punch
ioctl is integrated --- in fact this commit will probably have merge
conflicts with the punch changes which Allison Henders and the IBM LTC
have been working on.  I will need to fix this up when either patch
hits mainline.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/extents.c | 88 +++++++++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 76 insertions(+), 12 deletions(-)

(limited to 'fs/ext4/extents.c')

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index bd42ab29efec..cd4479c08031 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -2202,14 +2202,39 @@ int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
 }
 
 static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
-				struct ext4_extent *ex,
-				ext4_lblk_t from, ext4_lblk_t to)
+			      struct ext4_extent *ex,
+			      ext4_fsblk_t *partial_cluster,
+			      ext4_lblk_t from, ext4_lblk_t to)
 {
+	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 	unsigned short ee_len =  ext4_ext_get_actual_len(ex);
+	ext4_fsblk_t pblk;
 	int flags = EXT4_FREE_BLOCKS_FORGET;
 
 	if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
 		flags |= EXT4_FREE_BLOCKS_METADATA;
+	/*
+	 * For bigalloc file systems, we never free a partial cluster
+	 * at the beginning of the extent.  Instead, we make a note
+	 * that we tried freeing the cluster, and check to see if we
+	 * need to free it on a subsequent call to ext4_remove_blocks,
+	 * or at the end of the ext4_truncate() operation.
+	 */
+	flags |= EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER;
+
+	/*
+	 * If we have a partial cluster, and it's different from the
+	 * cluster of the last block, we need to explicitly free the
+	 * partial cluster here.
+	 */
+	pblk = ext4_ext_pblock(ex) + ee_len - 1;
+	if (*partial_cluster && (EXT4_B2C(sbi, pblk) != *partial_cluster)) {
+		ext4_free_blocks(handle, inode, NULL,
+				 EXT4_C2B(sbi, *partial_cluster),
+				 sbi->s_cluster_ratio, flags);
+		*partial_cluster = 0;
+	}
+
 #ifdef EXTENTS_STATS
 	{
 		struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
@@ -2229,12 +2254,24 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
 	    && to == le32_to_cpu(ex->ee_block) + ee_len - 1) {
 		/* tail removal */
 		ext4_lblk_t num;
-		ext4_fsblk_t start;
 
 		num = le32_to_cpu(ex->ee_block) + ee_len - from;
-		start = ext4_ext_pblock(ex) + ee_len - num;
-		ext_debug("free last %u blocks starting %llu\n", num, start);
-		ext4_free_blocks(handle, inode, NULL, start, num, flags);
+		pblk = ext4_ext_pblock(ex) + ee_len - num;
+		ext_debug("free last %u blocks starting %llu\n", num, pblk);
+		ext4_free_blocks(handle, inode, NULL, pblk, num, flags);
+		/*
+		 * If the block range to be freed didn't start at the
+		 * beginning of a cluster, and we removed the entire
+		 * extent, save the partial cluster here, since we
+		 * might need to delete if we determine that the
+		 * truncate operation has removed all of the blocks in
+		 * the cluster.
+		 */
+		if (pblk & (sbi->s_cluster_ratio - 1) &&
+		    (ee_len == num))
+			*partial_cluster = EXT4_B2C(sbi, pblk);
+		else
+			*partial_cluster = 0;
 	} else if (from == le32_to_cpu(ex->ee_block)
 		   && to <= le32_to_cpu(ex->ee_block) + ee_len - 1) {
 		/* head removal */
@@ -2269,9 +2306,10 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
  */
 static int
 ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
-		struct ext4_ext_path *path, ext4_lblk_t start,
-		ext4_lblk_t end)
+		 struct ext4_ext_path *path, ext4_fsblk_t *partial_cluster,
+		 ext4_lblk_t start, ext4_lblk_t end)
 {
+	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 	int err = 0, correct_index = 0;
 	int depth = ext_depth(inode), credits;
 	struct ext4_extent_header *eh;
@@ -2423,7 +2461,8 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
 		if (err)
 			goto out;
 
-		err = ext4_remove_blocks(handle, inode, ex, a, b);
+		err = ext4_remove_blocks(handle, inode, ex, partial_cluster,
+					 a, b);
 		if (err)
 			goto out;
 
@@ -2471,7 +2510,8 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
 					sizeof(struct ext4_extent));
 			}
 			le16_add_cpu(&eh->eh_entries, -1);
-		}
+		} else
+			*partial_cluster = 0;
 
 		ext_debug("new extent: %u:%u:%llu\n", block, num,
 				ext4_ext_pblock(ex));
@@ -2483,6 +2523,25 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
 	if (correct_index && eh->eh_entries)
 		err = ext4_ext_correct_indexes(handle, inode, path);
 
+	/*
+	 * If there is still a entry in the leaf node, check to see if
+	 * it references the partial cluster.  This is the only place
+	 * where it could; if it doesn't, we can free the cluster.
+	 */
+	if (*partial_cluster && ex >= EXT_FIRST_EXTENT(eh) &&
+	    (EXT4_B2C(sbi, ext4_ext_pblock(ex) + ex_ee_len - 1) !=
+	     *partial_cluster)) {
+		int flags = EXT4_FREE_BLOCKS_FORGET;
+
+		if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
+			flags |= EXT4_FREE_BLOCKS_METADATA;
+
+		ext4_free_blocks(handle, inode, NULL,
+				 EXT4_C2B(sbi, *partial_cluster),
+				 sbi->s_cluster_ratio, flags);
+		*partial_cluster = 0;
+	}
+
 	/* if this leaf is free, then we should
 	 * remove it from index block above */
 	if (err == 0 && eh->eh_entries == 0 && path[depth].p_bh != NULL)
@@ -2518,6 +2577,7 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
 	struct super_block *sb = inode->i_sb;
 	int depth = ext_depth(inode);
 	struct ext4_ext_path *path;
+	ext4_fsblk_t partial_cluster = 0;
 	handle_t *handle;
 	int i, err;
 
@@ -2553,7 +2613,8 @@ again:
 		if (i == depth) {
 			/* this is leaf block */
 			err = ext4_ext_rm_leaf(handle, inode, path,
-					start, EXT_MAX_BLOCKS - 1);
+					       &partial_cluster, start,
+					       EXT_MAX_BLOCKS - 1);
 			/* root level has p_bh == NULL, brelse() eats this */
 			brelse(path[i].p_bh);
 			path[i].p_bh = NULL;
@@ -3495,6 +3556,8 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 		ee_len = ext4_ext_get_actual_len(ex);
 		/* if found extent covers block, simply return it */
 		if (in_range(map->m_lblk, ee_block, ee_len)) {
+			ext4_fsblk_t partial_cluster = 0;
+
 			newblock = map->m_lblk - ee_block + ee_start;
 			/* number of remaining blocks in the extent */
 			allocated = ee_len - (map->m_lblk - ee_block);
@@ -3578,7 +3641,8 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 			ext4_ext_invalidate_cache(inode);
 
 			err = ext4_ext_rm_leaf(handle, inode, path,
-				map->m_lblk, map->m_lblk + punched_out);
+					       &partial_cluster, map->m_lblk,
+					       map->m_lblk + punched_out);
 
 			if (!err && path->p_hdr->eh_entries == 0) {
 				/*
-- 
cgit v1.2.3


From 7b415bf60f6afb0499fd3dc0ee33444f54e28567 Mon Sep 17 00:00:00 2001
From: Aditya Kali <adityakali@google.com>
Date: Fri, 9 Sep 2011 19:04:51 -0400
Subject: ext4: Fix bigalloc quota accounting and i_blocks value

With bigalloc changes, the i_blocks value was not correctly set (it was still
set to number of blocks being used, but in case of bigalloc, we want i_blocks
to represent the number of clusters being used). Since the quota subsystem sets
the i_blocks value, this patch fixes the quota accounting and makes sure that
the i_blocks value is set correctly.

Signed-off-by: Aditya Kali <adityakali@google.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/balloc.c       |   5 +-
 fs/ext4/ext4.h         |  16 ++-
 fs/ext4/ext4_extents.h |   2 +
 fs/ext4/extents.c      | 306 ++++++++++++++++++++++++++++++++++++++++++++++++-
 fs/ext4/inode.c        |  54 ++++++---
 fs/ext4/mballoc.c      |   5 +-
 fs/ext4/super.c        |   3 +-
 7 files changed, 366 insertions(+), 25 deletions(-)

(limited to 'fs/ext4/extents.c')

diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 9080a857cda9..bf42b3219e3c 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -485,7 +485,7 @@ int ext4_should_retry_alloc(struct super_block *sb, int *retries)
  * @handle:             handle to this transaction
  * @inode:              file inode
  * @goal:               given target block(filesystem wide)
- * @count:		pointer to total number of blocks needed
+ * @count:		pointer to total number of clusters needed
  * @errp:               error code
  *
  * Return 1st allocated block number on success, *count stores total account
@@ -517,7 +517,8 @@ ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
 		spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
 		EXT4_I(inode)->i_allocated_meta_blocks += ar.len;
 		spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
-		dquot_alloc_block_nofail(inode, ar.len);
+		dquot_alloc_block_nofail(inode,
+				EXT4_C2B(EXT4_SB(inode->i_sb), ar.len));
 	}
 	return ret;
 }
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index d2584224c89a..a6307f7c9807 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -144,9 +144,17 @@ struct ext4_allocation_request {
 #define EXT4_MAP_UNWRITTEN	(1 << BH_Unwritten)
 #define EXT4_MAP_BOUNDARY	(1 << BH_Boundary)
 #define EXT4_MAP_UNINIT		(1 << BH_Uninit)
+/* Sometimes (in the bigalloc case, from ext4_da_get_block_prep) the caller of
+ * ext4_map_blocks wants to know whether or not the underlying cluster has
+ * already been accounted for. EXT4_MAP_FROM_CLUSTER conveys to the caller that
+ * the requested mapping was from previously mapped (or delayed allocated)
+ * cluster. We use BH_AllocFromCluster only for this flag. BH_AllocFromCluster
+ * should never appear on buffer_head's state flags.
+ */
+#define EXT4_MAP_FROM_CLUSTER	(1 << BH_AllocFromCluster)
 #define EXT4_MAP_FLAGS		(EXT4_MAP_NEW | EXT4_MAP_MAPPED |\
 				 EXT4_MAP_UNWRITTEN | EXT4_MAP_BOUNDARY |\
-				 EXT4_MAP_UNINIT)
+				 EXT4_MAP_UNINIT | EXT4_MAP_FROM_CLUSTER)
 
 struct ext4_map_blocks {
 	ext4_fsblk_t m_pblk;
@@ -1884,6 +1892,7 @@ extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
 extern qsize_t *ext4_get_reserved_space(struct inode *inode);
 extern void ext4_da_update_reserve_space(struct inode *inode,
 					int used, int quota_claim);
+extern int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock);
 
 /* indirect.c */
 extern int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
@@ -2284,6 +2293,11 @@ extern int ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t);
 enum ext4_state_bits {
 	BH_Uninit	/* blocks are allocated but uninitialized on disk */
 	  = BH_JBDPrivateStart,
+	BH_AllocFromCluster,	/* allocated blocks were part of already
+				 * allocated cluster. Note that this flag will
+				 * never, ever appear in a buffer_head's state
+				 * flag. See EXT4_MAP_FROM_CLUSTER to see where
+				 * this is used. */
 };
 
 BUFFER_FNS(Uninit, uninit)
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index 095c36f3b612..a52db3a69a30 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -290,5 +290,7 @@ extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t,
 							struct ext4_ext_path *);
 extern void ext4_ext_drop_refs(struct ext4_ext_path *);
 extern int ext4_ext_check_inode(struct inode *inode);
+extern int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk,
+				      int search_hint_reverse);
 #endif /* _EXT4_EXTENTS */
 
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index cd4479c08031..c4e005864534 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -2686,6 +2686,21 @@ again:
 		}
 	}
 
+	/* If we still have something in the partial cluster and we have removed
+	 * even the first extent, then we should free the blocks in the partial
+	 * cluster as well. */
+	if (partial_cluster && path->p_hdr->eh_entries == 0) {
+		int flags = EXT4_FREE_BLOCKS_FORGET;
+
+		if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
+			flags |= EXT4_FREE_BLOCKS_METADATA;
+
+		ext4_free_blocks(handle, inode, NULL,
+				 EXT4_C2B(EXT4_SB(sb), partial_cluster),
+				 EXT4_SB(sb)->s_cluster_ratio, flags);
+		partial_cluster = 0;
+	}
+
 	/* TODO: flexible tree reduction should be here */
 	if (path->p_hdr->eh_entries == 0) {
 		/*
@@ -3233,6 +3248,195 @@ static int check_eofblocks_fl(handle_t *handle, struct inode *inode,
 	return ext4_mark_inode_dirty(handle, inode);
 }
 
+/**
+ * ext4_find_delalloc_range: find delayed allocated block in the given range.
+ *
+ * Goes through the buffer heads in the range [lblk_start, lblk_end] and returns
+ * whether there are any buffers marked for delayed allocation. It returns '1'
+ * on the first delalloc'ed buffer head found. If no buffer head in the given
+ * range is marked for delalloc, it returns 0.
+ * lblk_start should always be <= lblk_end.
+ * search_hint_reverse is to indicate that searching in reverse from lblk_end to
+ * lblk_start might be more efficient (i.e., we will likely hit the delalloc'ed
+ * block sooner). This is useful when blocks are truncated sequentially from
+ * lblk_start towards lblk_end.
+ */
+static int ext4_find_delalloc_range(struct inode *inode,
+				    ext4_lblk_t lblk_start,
+				    ext4_lblk_t lblk_end,
+				    int search_hint_reverse)
+{
+	struct address_space *mapping = inode->i_mapping;
+	struct buffer_head *head, *bh = NULL;
+	struct page *page;
+	ext4_lblk_t i, pg_lblk;
+	pgoff_t index;
+
+	/* reverse search wont work if fs block size is less than page size */
+	if (inode->i_blkbits < PAGE_CACHE_SHIFT)
+		search_hint_reverse = 0;
+
+	if (search_hint_reverse)
+		i = lblk_end;
+	else
+		i = lblk_start;
+
+	index = i >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
+
+	while ((i >= lblk_start) && (i <= lblk_end)) {
+		page = find_get_page(mapping, index);
+		if (!page || !PageDirty(page))
+			goto nextpage;
+
+		if (PageWriteback(page)) {
+			/*
+			 * This might be a race with allocation and writeout. In
+			 * this case we just assume that the rest of the range
+			 * will eventually be written and there wont be any
+			 * delalloc blocks left.
+			 * TODO: the above assumption is troublesome, but might
+			 * work better in practice. other option could be note
+			 * somewhere that the cluster is getting written out and
+			 * detect that here.
+			 */
+			page_cache_release(page);
+			return 0;
+		}
+
+		if (!page_has_buffers(page))
+			goto nextpage;
+
+		head = page_buffers(page);
+		if (!head)
+			goto nextpage;
+
+		bh = head;
+		pg_lblk = index << (PAGE_CACHE_SHIFT -
+						inode->i_blkbits);
+		do {
+			if (unlikely(pg_lblk < lblk_start)) {
+				/*
+				 * This is possible when fs block size is less
+				 * than page size and our cluster starts/ends in
+				 * middle of the page. So we need to skip the
+				 * initial few blocks till we reach the 'lblk'
+				 */
+				pg_lblk++;
+				continue;
+			}
+
+			if (buffer_delay(bh)) {
+				page_cache_release(page);
+				return 1;
+			}
+			if (search_hint_reverse)
+				i--;
+			else
+				i++;
+		} while ((i >= lblk_start) && (i <= lblk_end) &&
+				((bh = bh->b_this_page) != head));
+nextpage:
+		if (page)
+			page_cache_release(page);
+		/*
+		 * Move to next page. 'i' will be the first lblk in the next
+		 * page.
+		 */
+		if (search_hint_reverse)
+			index--;
+		else
+			index++;
+		i = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+	}
+
+	return 0;
+}
+
+int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk,
+			       int search_hint_reverse)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+	ext4_lblk_t lblk_start, lblk_end;
+	lblk_start = lblk & (~(sbi->s_cluster_ratio - 1));
+	lblk_end = lblk_start + sbi->s_cluster_ratio - 1;
+
+	return ext4_find_delalloc_range(inode, lblk_start, lblk_end,
+					search_hint_reverse);
+}
+
+/**
+ * Determines how many complete clusters (out of those specified by the 'map')
+ * are under delalloc and were reserved quota for.
+ * This function is called when we are writing out the blocks that were
+ * originally written with their allocation delayed, but then the space was
+ * allocated using fallocate() before the delayed allocation could be resolved.
+ * The cases to look for are:
+ * ('=' indicated delayed allocated blocks
+ *  '-' indicates non-delayed allocated blocks)
+ * (a) partial clusters towards beginning and/or end outside of allocated range
+ *     are not delalloc'ed.
+ *	Ex:
+ *	|----c---=|====c====|====c====|===-c----|
+ *	         |++++++ allocated ++++++|
+ *	==> 4 complete clusters in above example
+ *
+ * (b) partial cluster (outside of allocated range) towards either end is
+ *     marked for delayed allocation. In this case, we will exclude that
+ *     cluster.
+ *	Ex:
+ *	|----====c========|========c========|
+ *	     |++++++ allocated ++++++|
+ *	==> 1 complete clusters in above example
+ *
+ *	Ex:
+ *	|================c================|
+ *            |++++++ allocated ++++++|
+ *	==> 0 complete clusters in above example
+ *
+ * The ext4_da_update_reserve_space will be called only if we
+ * determine here that there were some "entire" clusters that span
+ * this 'allocated' range.
+ * In the non-bigalloc case, this function will just end up returning num_blks
+ * without ever calling ext4_find_delalloc_range.
+ */
+static unsigned int
+get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start,
+			   unsigned int num_blks)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+	ext4_lblk_t alloc_cluster_start, alloc_cluster_end;
+	ext4_lblk_t lblk_from, lblk_to, c_offset;
+	unsigned int allocated_clusters = 0;
+
+	alloc_cluster_start = EXT4_B2C(sbi, lblk_start);
+	alloc_cluster_end = EXT4_B2C(sbi, lblk_start + num_blks - 1);
+
+	/* max possible clusters for this allocation */
+	allocated_clusters = alloc_cluster_end - alloc_cluster_start + 1;
+
+	/* Check towards left side */
+	c_offset = lblk_start & (sbi->s_cluster_ratio - 1);
+	if (c_offset) {
+		lblk_from = lblk_start & (~(sbi->s_cluster_ratio - 1));
+		lblk_to = lblk_from + c_offset - 1;
+
+		if (ext4_find_delalloc_range(inode, lblk_from, lblk_to, 0))
+			allocated_clusters--;
+	}
+
+	/* Now check towards right. */
+	c_offset = (lblk_start + num_blks) & (sbi->s_cluster_ratio - 1);
+	if (allocated_clusters && c_offset) {
+		lblk_from = lblk_start + num_blks;
+		lblk_to = lblk_from + (sbi->s_cluster_ratio - c_offset) - 1;
+
+		if (ext4_find_delalloc_range(inode, lblk_from, lblk_to, 0))
+			allocated_clusters--;
+	}
+
+	return allocated_clusters;
+}
+
 static int
 ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
 			struct ext4_map_blocks *map,
@@ -3338,8 +3542,15 @@ out:
 	 * But fallocate would have already updated quota and block
 	 * count for this offset. So cancel these reservation
 	 */
-	if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
-		ext4_da_update_reserve_space(inode, allocated, 0);
+	if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
+		unsigned int reserved_clusters;
+		reserved_clusters = get_reserved_cluster_alloc(inode,
+				map->m_lblk, map->m_len);
+		if (reserved_clusters)
+			ext4_da_update_reserve_space(inode,
+						     reserved_clusters,
+						     0);
+	}
 
 map_out:
 	map->m_flags |= EXT4_MAP_MAPPED;
@@ -3484,6 +3695,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 	ext4_fsblk_t newblock = 0;
 	int free_on_err = 0, err = 0, depth, ret;
 	unsigned int allocated = 0, offset = 0;
+	unsigned int allocated_clusters = 0, reserved_clusters = 0;
 	unsigned int punched_out = 0;
 	unsigned int result = 0;
 	struct ext4_allocation_request ar;
@@ -3499,6 +3711,10 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 	if (!(flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) &&
 		ext4_ext_in_cache(inode, map->m_lblk, &newex)) {
 		if (!newex.ee_start_lo && !newex.ee_start_hi) {
+			if ((sbi->s_cluster_ratio > 1) &&
+			    ext4_find_delalloc_cluster(inode, map->m_lblk, 0))
+				map->m_flags |= EXT4_MAP_FROM_CLUSTER;
+
 			if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
 				/*
 				 * block isn't allocated yet and
@@ -3509,6 +3725,8 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 			/* we should allocate requested block */
 		} else {
 			/* block is already allocated */
+			if (sbi->s_cluster_ratio > 1)
+				map->m_flags |= EXT4_MAP_FROM_CLUSTER;
 			newblock = map->m_lblk
 				   - le32_to_cpu(newex.ee_block)
 				   + ext4_ext_pblock(&newex);
@@ -3665,6 +3883,10 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 		}
 	}
 
+	if ((sbi->s_cluster_ratio > 1) &&
+	    ext4_find_delalloc_cluster(inode, map->m_lblk, 0))
+		map->m_flags |= EXT4_MAP_FROM_CLUSTER;
+
 	/*
 	 * requested block isn't allocated yet;
 	 * we couldn't try to create block if create flag is zero
@@ -3681,6 +3903,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 	/*
 	 * Okay, we need to do block allocation.
 	 */
+	map->m_flags &= ~EXT4_MAP_FROM_CLUSTER;
 	newex.ee_block = cpu_to_le32(map->m_lblk);
 	cluster_offset = map->m_lblk & (sbi->s_cluster_ratio-1);
 
@@ -3692,6 +3915,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 	    get_implied_cluster_alloc(sbi, map, ex, path)) {
 		ar.len = allocated = map->m_len;
 		newblock = map->m_pblk;
+		map->m_flags |= EXT4_MAP_FROM_CLUSTER;
 		goto got_allocated_blocks;
 	}
 
@@ -3712,6 +3936,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 	    get_implied_cluster_alloc(sbi, map, ex2, path)) {
 		ar.len = allocated = map->m_len;
 		newblock = map->m_pblk;
+		map->m_flags |= EXT4_MAP_FROM_CLUSTER;
 		goto got_allocated_blocks;
 	}
 
@@ -3765,6 +3990,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 	ext_debug("allocate new block: goal %llu, found %llu/%u\n",
 		  ar.goal, newblock, allocated);
 	free_on_err = 1;
+	allocated_clusters = ar.len;
 	ar.len = EXT4_C2B(sbi, ar.len) - offset;
 	if (ar.len > allocated)
 		ar.len = allocated;
@@ -3822,8 +4048,80 @@ got_allocated_blocks:
 	 * Update reserved blocks/metadata blocks after successful
 	 * block allocation which had been deferred till now.
 	 */
-	if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
-		ext4_da_update_reserve_space(inode, allocated, 1);
+	if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
+		/*
+		 * Check how many clusters we had reserved this allocted range.
+		 */
+		reserved_clusters = get_reserved_cluster_alloc(inode,
+						map->m_lblk, allocated);
+		if (map->m_flags & EXT4_MAP_FROM_CLUSTER) {
+			if (reserved_clusters) {
+				/*
+				 * We have clusters reserved for this range.
+				 * But since we are not doing actual allocation
+				 * and are simply using blocks from previously
+				 * allocated cluster, we should release the
+				 * reservation and not claim quota.
+				 */
+				ext4_da_update_reserve_space(inode,
+						reserved_clusters, 0);
+			}
+		} else {
+			BUG_ON(allocated_clusters < reserved_clusters);
+			/* We will claim quota for all newly allocated blocks.*/
+			ext4_da_update_reserve_space(inode, allocated_clusters,
+							1);
+			if (reserved_clusters < allocated_clusters) {
+				int reservation = allocated_clusters -
+						  reserved_clusters;
+				/*
+				 * It seems we claimed few clusters outside of
+				 * the range of this allocation. We should give
+				 * it back to the reservation pool. This can
+				 * happen in the following case:
+				 *
+				 * * Suppose s_cluster_ratio is 4 (i.e., each
+				 *   cluster has 4 blocks. Thus, the clusters
+				 *   are [0-3],[4-7],[8-11]...
+				 * * First comes delayed allocation write for
+				 *   logical blocks 10 & 11. Since there were no
+				 *   previous delayed allocated blocks in the
+				 *   range [8-11], we would reserve 1 cluster
+				 *   for this write.
+				 * * Next comes write for logical blocks 3 to 8.
+				 *   In this case, we will reserve 2 clusters
+				 *   (for [0-3] and [4-7]; and not for [8-11] as
+				 *   that range has a delayed allocated blocks.
+				 *   Thus total reserved clusters now becomes 3.
+				 * * Now, during the delayed allocation writeout
+				 *   time, we will first write blocks [3-8] and
+				 *   allocate 3 clusters for writing these
+				 *   blocks. Also, we would claim all these
+				 *   three clusters above.
+				 * * Now when we come here to writeout the
+				 *   blocks [10-11], we would expect to claim
+				 *   the reservation of 1 cluster we had made
+				 *   (and we would claim it since there are no
+				 *   more delayed allocated blocks in the range
+				 *   [8-11]. But our reserved cluster count had
+				 *   already gone to 0.
+				 *
+				 *   Thus, at the step 4 above when we determine
+				 *   that there are still some unwritten delayed
+				 *   allocated blocks outside of our current
+				 *   block range, we should increment the
+				 *   reserved clusters count so that when the
+				 *   remaining blocks finally gets written, we
+				 *   could claim them.
+				 */
+				while (reservation) {
+					ext4_da_reserve_space(inode,
+							      map->m_lblk);
+					reservation--;
+				}
+			}
+		}
+	}
 
 	/*
 	 * Cache the extent and update transaction to commit on fdatasync only
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 40f51aae42fe..d1c17e47c1c6 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -300,14 +300,14 @@ void ext4_da_update_reserve_space(struct inode *inode,
 
 	/* Update quota subsystem for data blocks */
 	if (quota_claim)
-		dquot_claim_block(inode, used);
+		dquot_claim_block(inode, EXT4_C2B(sbi, used));
 	else {
 		/*
 		 * We did fallocate with an offset that is already delayed
 		 * allocated. So on delayed allocated writeback we should
 		 * not re-claim the quota for fallocated blocks.
 		 */
-		dquot_release_reservation_block(inode, used);
+		dquot_release_reservation_block(inode, EXT4_C2B(sbi, used));
 	}
 
 	/*
@@ -1037,14 +1037,14 @@ static int ext4_journalled_write_end(struct file *file,
 }
 
 /*
- * Reserve a single block located at lblock
+ * Reserve a single cluster located at lblock
  */
-static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock)
+int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock)
 {
 	int retries = 0;
 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 	struct ext4_inode_info *ei = EXT4_I(inode);
-	unsigned long md_needed;
+	unsigned int md_needed;
 	int ret;
 
 	/*
@@ -1054,7 +1054,8 @@ static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock)
 	 */
 repeat:
 	spin_lock(&ei->i_block_reservation_lock);
-	md_needed = ext4_calc_metadata_amount(inode, lblock);
+	md_needed = EXT4_NUM_B2C(sbi,
+				 ext4_calc_metadata_amount(inode, lblock));
 	trace_ext4_da_reserve_space(inode, md_needed);
 	spin_unlock(&ei->i_block_reservation_lock);
 
@@ -1063,7 +1064,7 @@ repeat:
 	 * us from metadata over-estimation, though we may go over by
 	 * a small amount in the end.  Here we just reserve for data.
 	 */
-	ret = dquot_reserve_block(inode, 1);
+	ret = dquot_reserve_block(inode, EXT4_C2B(sbi, 1));
 	if (ret)
 		return ret;
 	/*
@@ -1071,7 +1072,7 @@ repeat:
 	 * we cannot afford to run out of free blocks.
 	 */
 	if (ext4_claim_free_blocks(sbi, md_needed + 1, 0)) {
-		dquot_release_reservation_block(inode, 1);
+		dquot_release_reservation_block(inode, EXT4_C2B(sbi, 1));
 		if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
 			yield();
 			goto repeat;
@@ -1118,6 +1119,8 @@ static void ext4_da_release_space(struct inode *inode, int to_free)
 		 * We can release all of the reserved metadata blocks
 		 * only when we have written all of the delayed
 		 * allocation blocks.
+		 * Note that in case of bigalloc, i_reserved_meta_blocks,
+		 * i_reserved_data_blocks, etc. refer to number of clusters.
 		 */
 		percpu_counter_sub(&sbi->s_dirtyclusters_counter,
 				   ei->i_reserved_meta_blocks);
@@ -1130,7 +1133,7 @@ static void ext4_da_release_space(struct inode *inode, int to_free)
 
 	spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
 
-	dquot_release_reservation_block(inode, to_free);
+	dquot_release_reservation_block(inode, EXT4_C2B(sbi, to_free));
 }
 
 static void ext4_da_page_release_reservation(struct page *page,
@@ -1139,6 +1142,9 @@ static void ext4_da_page_release_reservation(struct page *page,
 	int to_release = 0;
 	struct buffer_head *head, *bh;
 	unsigned int curr_off = 0;
+	struct inode *inode = page->mapping->host;
+	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+	int num_clusters;
 
 	head = page_buffers(page);
 	bh = head;
@@ -1151,7 +1157,20 @@ static void ext4_da_page_release_reservation(struct page *page,
 		}
 		curr_off = next_off;
 	} while ((bh = bh->b_this_page) != head);
-	ext4_da_release_space(page->mapping->host, to_release);
+
+	/* If we have released all the blocks belonging to a cluster, then we
+	 * need to release the reserved space for that cluster. */
+	num_clusters = EXT4_NUM_B2C(sbi, to_release);
+	while (num_clusters > 0) {
+		ext4_fsblk_t lblk;
+		lblk = (page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits)) +
+			((num_clusters - 1) << sbi->s_cluster_bits);
+		if (sbi->s_cluster_ratio == 1 ||
+		    !ext4_find_delalloc_cluster(inode, lblk, 1))
+			ext4_da_release_space(inode, 1);
+
+		num_clusters--;
+	}
 }
 
 /*
@@ -1352,7 +1371,8 @@ static void ext4_print_free_blocks(struct inode *inode)
 	       (long long) EXT4_C2B(EXT4_SB(inode->i_sb),
 		percpu_counter_sum(&sbi->s_freeclusters_counter)));
 	printk(KERN_CRIT "dirty_blocks=%lld\n",
-	       (long long) percpu_counter_sum(&sbi->s_dirtyclusters_counter));
+	       (long long) EXT4_C2B(EXT4_SB(inode->i_sb),
+		percpu_counter_sum(&sbi->s_dirtyclusters_counter)));
 	printk(KERN_CRIT "Block reservation details\n");
 	printk(KERN_CRIT "i_reserved_data_blocks=%u\n",
 	       EXT4_I(inode)->i_reserved_data_blocks);
@@ -1626,10 +1646,14 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
 		/*
 		 * XXX: __block_write_begin() unmaps passed block, is it OK?
 		 */
-		ret = ext4_da_reserve_space(inode, iblock);
-		if (ret)
-			/* not enough space to reserve */
-			return ret;
+		/* If the block was allocated from previously allocated cluster,
+		 * then we dont need to reserve it again. */
+		if (!(map.m_flags & EXT4_MAP_FROM_CLUSTER)) {
+			ret = ext4_da_reserve_space(inode, iblock);
+			if (ret)
+				/* not enough space to reserve */
+				return ret;
+		}
 
 		map_bh(bh, inode->i_sb, invalid_block);
 		set_buffer_new(bh);
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 63dd56703342..5e1215d38331 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -4718,6 +4718,9 @@ do_more:
 
 	freed += count;
 
+	if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE))
+		dquot_free_block(inode, EXT4_C2B(sbi, count_clusters));
+
 	/* We dirtied the bitmap block */
 	BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
 	err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
@@ -4736,8 +4739,6 @@ do_more:
 	}
 	ext4_mark_super_dirty(sb);
 error_return:
-	if (freed && !(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE))
-		dquot_free_block(inode, freed);
 	brelse(bitmap_bh);
 	ext4_std_error(sb, err);
 	return;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 6810957e0ac7..66b8cfa15636 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -2473,7 +2473,8 @@ static ssize_t delayed_allocation_blocks_show(struct ext4_attr *a,
 					      char *buf)
 {
 	return snprintf(buf, PAGE_SIZE, "%llu\n",
-		(s64) percpu_counter_sum(&sbi->s_dirtyclusters_counter));
+		(s64) EXT4_C2B(sbi,
+			percpu_counter_sum(&sbi->s_dirtyclusters_counter)));
 }
 
 static ssize_t session_write_kbytes_show(struct ext4_attr *a,
-- 
cgit v1.2.3


From d8990240d8c911064447f8aa5a440f9345a6d692 Mon Sep 17 00:00:00 2001
From: Aditya Kali <adityakali@google.com>
Date: Fri, 9 Sep 2011 19:18:51 -0400
Subject: ext4: add some tracepoints in ext4/extents.c

This patch adds some tracepoints in ext4/extents.c and updates a tracepoint in
ext4/inode.c.

Tested: Built and ran the kernel and verified that these tracepoints work.
Also ran xfstests.

Signed-off-by: Aditya Kali <adityakali@google.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/ext4.h              |   2 +
 fs/ext4/extents.c           |  44 ++++-
 fs/ext4/inode.c             |   3 +-
 fs/ext4/migrate.c           |   1 -
 fs/ext4/move_extent.c       |   1 -
 include/trace/events/ext4.h | 398 +++++++++++++++++++++++++++++++++++++++++++-
 6 files changed, 433 insertions(+), 16 deletions(-)

(limited to 'fs/ext4/extents.c')

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 21ea65d8bd46..751277a4890c 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -2339,4 +2339,6 @@ extern void ext4_resize_end(struct super_block *sb);
 
 #endif	/* __KERNEL__ */
 
+#include "ext4_extents.h"
+
 #endif	/* _EXT4_H */
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index c4e005864534..9b119308daea 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -42,7 +42,6 @@
 #include <asm/uaccess.h>
 #include <linux/fiemap.h>
 #include "ext4_jbd2.h"
-#include "ext4_extents.h"
 
 #include <trace/events/ext4.h>
 
@@ -1969,6 +1968,7 @@ ext4_ext_put_in_cache(struct inode *inode, ext4_lblk_t block,
 	struct ext4_ext_cache *cex;
 	BUG_ON(len == 0);
 	spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
+	trace_ext4_ext_put_in_cache(inode, block, len, start);
 	cex = &EXT4_I(inode)->i_cached_extent;
 	cex->ec_block = block;
 	cex->ec_len = len;
@@ -2070,6 +2070,7 @@ errout:
 		sbi->extent_cache_misses++;
 	else
 		sbi->extent_cache_hits++;
+	trace_ext4_ext_in_cache(inode, block, ret);
 	spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
 	return ret;
 }
@@ -2137,6 +2138,8 @@ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
 	if (err)
 		return err;
 	ext_debug("index is empty, remove it, free block %llu\n", leaf);
+	trace_ext4_ext_rm_idx(inode, leaf);
+
 	ext4_free_blocks(handle, inode, NULL, leaf, 1,
 			 EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
 	return err;
@@ -2222,6 +2225,7 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
 	 */
 	flags |= EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER;
 
+	trace_ext4_remove_blocks(inode, ex, from, to, *partial_cluster);
 	/*
 	 * If we have a partial cluster, and it's different from the
 	 * cluster of the last block, we need to explicitly free the
@@ -2336,6 +2340,8 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
 	ex_ee_block = le32_to_cpu(ex->ee_block);
 	ex_ee_len = ext4_ext_get_actual_len(ex);
 
+	trace_ext4_ext_rm_leaf(inode, start, ex, *partial_cluster);
+
 	while (ex >= EXT_FIRST_EXTENT(eh) &&
 			ex_ee_block + ex_ee_len > start) {
 
@@ -2591,6 +2597,8 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
 again:
 	ext4_ext_invalidate_cache(inode);
 
+	trace_ext4_ext_remove_space(inode, start, depth);
+
 	/*
 	 * We start scanning from right side, freeing all the blocks
 	 * after i_size and walking into the tree depth-wise.
@@ -2686,6 +2694,9 @@ again:
 		}
 	}
 
+	trace_ext4_ext_remove_space_done(inode, start, depth, partial_cluster,
+			path->p_hdr->eh_entries);
+
 	/* If we still have something in the partial cluster and we have removed
 	 * even the first extent, then we should free the blocks in the partial
 	 * cluster as well. */
@@ -3300,6 +3311,10 @@ static int ext4_find_delalloc_range(struct inode *inode,
 			 * detect that here.
 			 */
 			page_cache_release(page);
+			trace_ext4_find_delalloc_range(inode,
+					lblk_start, lblk_end,
+					search_hint_reverse,
+					0, i);
 			return 0;
 		}
 
@@ -3327,6 +3342,10 @@ static int ext4_find_delalloc_range(struct inode *inode,
 
 			if (buffer_delay(bh)) {
 				page_cache_release(page);
+				trace_ext4_find_delalloc_range(inode,
+						lblk_start, lblk_end,
+						search_hint_reverse,
+						1, i);
 				return 1;
 			}
 			if (search_hint_reverse)
@@ -3349,6 +3368,8 @@ nextpage:
 		i = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
 	}
 
+	trace_ext4_find_delalloc_range(inode, lblk_start, lblk_end,
+					search_hint_reverse, 0, 0);
 	return 0;
 }
 
@@ -3414,6 +3435,8 @@ get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start,
 	/* max possible clusters for this allocation */
 	allocated_clusters = alloc_cluster_end - alloc_cluster_start + 1;
 
+	trace_ext4_get_reserved_cluster_alloc(inode, lblk_start, num_blks);
+
 	/* Check towards left side */
 	c_offset = lblk_start & (sbi->s_cluster_ratio - 1);
 	if (c_offset) {
@@ -3453,6 +3476,9 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
 		  flags, allocated);
 	ext4_ext_show_leaf(inode, path);
 
+	trace_ext4_ext_handle_uninitialized_extents(inode, map, allocated,
+						    newblock);
+
 	/* get_block() before submit the IO, split the extent */
 	if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
 		ret = ext4_split_unwritten_extents(handle, inode, map,
@@ -3572,7 +3598,7 @@ out2:
  * get_implied_cluster_alloc - check to see if the requested
  * allocation (in the map structure) overlaps with a cluster already
  * allocated in an extent.
- *	@sbi	The ext4-specific superblock structure
+ *	@sb	The filesystem superblock structure
  *	@map	The requested lblk->pblk mapping
  *	@ex	The extent structure which might contain an implied
  *			cluster allocation
@@ -3609,11 +3635,12 @@ out2:
  * ext4_ext_map_blocks() will then allocate one or more new clusters
  * by calling ext4_mb_new_blocks().
  */
-static int get_implied_cluster_alloc(struct ext4_sb_info *sbi,
+static int get_implied_cluster_alloc(struct super_block *sb,
 				     struct ext4_map_blocks *map,
 				     struct ext4_extent *ex,
 				     struct ext4_ext_path *path)
 {
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
 	ext4_lblk_t c_offset = map->m_lblk & (sbi->s_cluster_ratio-1);
 	ext4_lblk_t ex_cluster_start, ex_cluster_end;
 	ext4_lblk_t rr_cluster_start, rr_cluster_end;
@@ -3662,8 +3689,12 @@ static int get_implied_cluster_alloc(struct ext4_sb_info *sbi,
 			ext4_lblk_t next = ext4_ext_next_allocated_block(path);
 			map->m_len = min(map->m_len, next - map->m_lblk);
 		}
+
+		trace_ext4_get_implied_cluster_alloc_exit(sb, map, 1);
 		return 1;
 	}
+
+	trace_ext4_get_implied_cluster_alloc_exit(sb, map, 0);
 	return 0;
 }
 
@@ -3772,6 +3803,9 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 		 * we split out initialized portions during a write.
 		 */
 		ee_len = ext4_ext_get_actual_len(ex);
+
+		trace_ext4_ext_show_extent(inode, ee_block, ee_start, ee_len);
+
 		/* if found extent covers block, simply return it */
 		if (in_range(map->m_lblk, ee_block, ee_len)) {
 			ext4_fsblk_t partial_cluster = 0;
@@ -3912,7 +3946,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 	 * by ext4_ext_find_extent() implies a cluster we can use.
 	 */
 	if (cluster_offset && ex &&
-	    get_implied_cluster_alloc(sbi, map, ex, path)) {
+	    get_implied_cluster_alloc(inode->i_sb, map, ex, path)) {
 		ar.len = allocated = map->m_len;
 		newblock = map->m_pblk;
 		map->m_flags |= EXT4_MAP_FROM_CLUSTER;
@@ -3933,7 +3967,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 	/* Check if the extent after searching to the right implies a
 	 * cluster we can use. */
 	if ((sbi->s_cluster_ratio > 1) && ex2 &&
-	    get_implied_cluster_alloc(sbi, map, ex2, path)) {
+	    get_implied_cluster_alloc(inode->i_sb, map, ex2, path)) {
 		ar.len = allocated = map->m_len;
 		newblock = map->m_pblk;
 		map->m_flags |= EXT4_MAP_FROM_CLUSTER;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 88dc63a01756..2dcd4fed96ec 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -42,7 +42,6 @@
 #include "ext4_jbd2.h"
 #include "xattr.h"
 #include "acl.h"
-#include "ext4_extents.h"
 #include "truncate.h"
 
 #include <trace/events/ext4.h>
@@ -268,7 +267,7 @@ void ext4_da_update_reserve_space(struct inode *inode,
 	struct ext4_inode_info *ei = EXT4_I(inode);
 
 	spin_lock(&ei->i_block_reservation_lock);
-	trace_ext4_da_update_reserve_space(inode, used);
+	trace_ext4_da_update_reserve_space(inode, used, quota_claim);
 	if (unlikely(used > ei->i_reserved_data_blocks)) {
 		ext4_msg(inode->i_sb, KERN_NOTICE, "%s: ino %lu, used %d "
 			 "with only %d reserved data blocks\n",
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index b57b98fb44d1..6f07a06f2437 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -15,7 +15,6 @@
 #include <linux/module.h>
 #include <linux/slab.h>
 #include "ext4_jbd2.h"
-#include "ext4_extents.h"
 
 /*
  * The contiguous blocks details which can be
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index f57455a1b1b2..c5826c623e7a 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -17,7 +17,6 @@
 #include <linux/quotaops.h>
 #include <linux/slab.h>
 #include "ext4_jbd2.h"
-#include "ext4_extents.h"
 #include "ext4.h"
 
 /**
diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h
index b50a54736242..c9a341e385a3 100644
--- a/include/trace/events/ext4.h
+++ b/include/trace/events/ext4.h
@@ -12,6 +12,8 @@ struct ext4_allocation_request;
 struct ext4_prealloc_space;
 struct ext4_inode_info;
 struct mpage_da_data;
+struct ext4_map_blocks;
+struct ext4_extent;
 
 #define EXT4_I(inode) (container_of(inode, struct ext4_inode_info, vfs_inode))
 
@@ -1032,9 +1034,9 @@ TRACE_EVENT(ext4_forget,
 );
 
 TRACE_EVENT(ext4_da_update_reserve_space,
-	TP_PROTO(struct inode *inode, int used_blocks),
+	TP_PROTO(struct inode *inode, int used_blocks, int quota_claim),
 
-	TP_ARGS(inode, used_blocks),
+	TP_ARGS(inode, used_blocks, quota_claim),
 
 	TP_STRUCT__entry(
 		__field(	dev_t,	dev			)
@@ -1045,6 +1047,7 @@ TRACE_EVENT(ext4_da_update_reserve_space,
 		__field(	int,	reserved_data_blocks	)
 		__field(	int,	reserved_meta_blocks	)
 		__field(	int,	allocated_meta_blocks	)
+		__field(	int,	quota_claim		)
 	),
 
 	TP_fast_assign(
@@ -1053,19 +1056,24 @@ TRACE_EVENT(ext4_da_update_reserve_space,
 		__entry->mode	= inode->i_mode;
 		__entry->i_blocks = inode->i_blocks;
 		__entry->used_blocks = used_blocks;
-		__entry->reserved_data_blocks = EXT4_I(inode)->i_reserved_data_blocks;
-		__entry->reserved_meta_blocks = EXT4_I(inode)->i_reserved_meta_blocks;
-		__entry->allocated_meta_blocks = EXT4_I(inode)->i_allocated_meta_blocks;
+		__entry->reserved_data_blocks =
+				EXT4_I(inode)->i_reserved_data_blocks;
+		__entry->reserved_meta_blocks =
+				EXT4_I(inode)->i_reserved_meta_blocks;
+		__entry->allocated_meta_blocks =
+				EXT4_I(inode)->i_allocated_meta_blocks;
+		__entry->quota_claim = quota_claim;
 	),
 
 	TP_printk("dev %d,%d ino %lu mode 0%o i_blocks %llu used_blocks %d "
 		  "reserved_data_blocks %d reserved_meta_blocks %d "
-		  "allocated_meta_blocks %d",
+		  "allocated_meta_blocks %d quota_claim %d",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  (unsigned long) __entry->ino,
 		  __entry->mode, __entry->i_blocks,
 		  __entry->used_blocks, __entry->reserved_data_blocks,
-		  __entry->reserved_meta_blocks, __entry->allocated_meta_blocks)
+		  __entry->reserved_meta_blocks, __entry->allocated_meta_blocks,
+		  __entry->quota_claim)
 );
 
 TRACE_EVENT(ext4_da_reserve_space,
@@ -1589,6 +1597,382 @@ DEFINE_EVENT(ext4__trim, ext4_trim_all_free,
 	TP_ARGS(sb, group, start, len)
 );
 
+TRACE_EVENT(ext4_ext_handle_uninitialized_extents,
+	TP_PROTO(struct inode *inode, struct ext4_map_blocks *map,
+		 unsigned int allocated, ext4_fsblk_t newblock),
+
+	TP_ARGS(inode, map, allocated, newblock),
+
+	TP_STRUCT__entry(
+		__field(	ino_t,		ino		)
+		__field(	dev_t,		dev		)
+		__field(	ext4_lblk_t,	lblk		)
+		__field(	ext4_fsblk_t,	pblk		)
+		__field(	unsigned int,	len		)
+		__field(	int,		flags		)
+		__field(	unsigned int,	allocated	)
+		__field(	ext4_fsblk_t,	newblk		)
+	),
+
+	TP_fast_assign(
+		__entry->ino		= inode->i_ino;
+		__entry->dev		= inode->i_sb->s_dev;
+		__entry->lblk		= map->m_lblk;
+		__entry->pblk		= map->m_pblk;
+		__entry->len		= map->m_len;
+		__entry->flags		= map->m_flags;
+		__entry->allocated	= allocated;
+		__entry->newblk		= newblock;
+	),
+
+	TP_printk("dev %d,%d ino %lu m_lblk %u m_pblk %llu m_len %u flags %d"
+		  "allocated %d newblock %llu",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  (unsigned long) __entry->ino,
+		  (unsigned) __entry->lblk, (unsigned long long) __entry->pblk,
+		  __entry->len, __entry->flags,
+		  (unsigned int) __entry->allocated,
+		  (unsigned long long) __entry->newblk)
+);
+
+TRACE_EVENT(ext4_get_implied_cluster_alloc_exit,
+	TP_PROTO(struct super_block *sb, struct ext4_map_blocks *map, int ret),
+
+	TP_ARGS(sb, map, ret),
+
+	TP_STRUCT__entry(
+		__field(	dev_t,		dev	)
+		__field(	ext4_lblk_t,	lblk	)
+		__field(	ext4_fsblk_t,	pblk	)
+		__field(	unsigned int,	len	)
+		__field(	unsigned int,	flags	)
+		__field(	int,		ret	)
+	),
+
+	TP_fast_assign(
+		__entry->dev	= sb->s_dev;
+		__entry->lblk	= map->m_lblk;
+		__entry->pblk	= map->m_pblk;
+		__entry->len	= map->m_len;
+		__entry->flags	= map->m_flags;
+		__entry->ret	= ret;
+	),
+
+	TP_printk("dev %d,%d m_lblk %u m_pblk %llu m_len %u m_flags %u ret %d",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->lblk, (unsigned long long) __entry->pblk,
+		  __entry->len, __entry->flags, __entry->ret)
+);
+
+TRACE_EVENT(ext4_ext_put_in_cache,
+	TP_PROTO(struct inode *inode, ext4_lblk_t lblk, unsigned int len,
+		 ext4_fsblk_t start),
+
+	TP_ARGS(inode, lblk, len, start),
+
+	TP_STRUCT__entry(
+		__field(	ino_t,		ino	)
+		__field(	dev_t,		dev	)
+		__field(	ext4_lblk_t,	lblk	)
+		__field(	unsigned int,	len	)
+		__field(	ext4_fsblk_t,	start	)
+	),
+
+	TP_fast_assign(
+		__entry->ino	= inode->i_ino;
+		__entry->dev	= inode->i_sb->s_dev;
+		__entry->lblk	= lblk;
+		__entry->len	= len;
+		__entry->start	= start;
+	),
+
+	TP_printk("dev %d,%d ino %lu lblk %u len %u start %llu",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  (unsigned long) __entry->ino,
+		  (unsigned) __entry->lblk,
+		  __entry->len,
+		  (unsigned long long) __entry->start)
+);
+
+TRACE_EVENT(ext4_ext_in_cache,
+	TP_PROTO(struct inode *inode, ext4_lblk_t lblk, int ret),
+
+	TP_ARGS(inode, lblk, ret),
+
+	TP_STRUCT__entry(
+		__field(	ino_t,		ino	)
+		__field(	dev_t,		dev	)
+		__field(	ext4_lblk_t,	lblk	)
+		__field(	int,		ret	)
+	),
+
+	TP_fast_assign(
+		__entry->ino	= inode->i_ino;
+		__entry->dev	= inode->i_sb->s_dev;
+		__entry->lblk	= lblk;
+		__entry->ret	= ret;
+	),
+
+	TP_printk("dev %d,%d ino %lu lblk %u ret %d",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  (unsigned long) __entry->ino,
+		  (unsigned) __entry->lblk,
+		  __entry->ret)
+
+);
+
+TRACE_EVENT(ext4_find_delalloc_range,
+	TP_PROTO(struct inode *inode, ext4_lblk_t from, ext4_lblk_t to,
+		int reverse, int found, ext4_lblk_t found_blk),
+
+	TP_ARGS(inode, from, to, reverse, found, found_blk),
+
+	TP_STRUCT__entry(
+		__field(	ino_t,		ino		)
+		__field(	dev_t,		dev		)
+		__field(	ext4_lblk_t,	from		)
+		__field(	ext4_lblk_t,	to		)
+		__field(	int,		reverse		)
+		__field(	int,		found		)
+		__field(	ext4_lblk_t,	found_blk	)
+	),
+
+	TP_fast_assign(
+		__entry->ino		= inode->i_ino;
+		__entry->dev		= inode->i_sb->s_dev;
+		__entry->from		= from;
+		__entry->to		= to;
+		__entry->reverse	= reverse;
+		__entry->found		= found;
+		__entry->found_blk	= found_blk;
+	),
+
+	TP_printk("dev %d,%d ino %lu from %u to %u reverse %d found %d "
+		  "(blk = %u)",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  (unsigned long) __entry->ino,
+		  (unsigned) __entry->from, (unsigned) __entry->to,
+		  __entry->reverse, __entry->found,
+		  (unsigned) __entry->found_blk)
+);
+
+TRACE_EVENT(ext4_get_reserved_cluster_alloc,
+	TP_PROTO(struct inode *inode, ext4_lblk_t lblk, unsigned int len),
+
+	TP_ARGS(inode, lblk, len),
+
+	TP_STRUCT__entry(
+		__field(	ino_t,		ino	)
+		__field(	dev_t,		dev	)
+		__field(	ext4_lblk_t,	lblk	)
+		__field(	unsigned int,	len	)
+	),
+
+	TP_fast_assign(
+		__entry->ino	= inode->i_ino;
+		__entry->dev	= inode->i_sb->s_dev;
+		__entry->lblk	= lblk;
+		__entry->len	= len;
+	),
+
+	TP_printk("dev %d,%d ino %lu lblk %u len %u",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  (unsigned long) __entry->ino,
+		  (unsigned) __entry->lblk,
+		  __entry->len)
+);
+
+TRACE_EVENT(ext4_ext_show_extent,
+	TP_PROTO(struct inode *inode, ext4_lblk_t lblk, ext4_fsblk_t pblk,
+		 unsigned short len),
+
+	TP_ARGS(inode, lblk, pblk, len),
+
+	TP_STRUCT__entry(
+		__field(	ino_t,		ino	)
+		__field(	dev_t,		dev	)
+		__field(	ext4_lblk_t,	lblk	)
+		__field(	ext4_fsblk_t,	pblk	)
+		__field(	unsigned short,	len	)
+	),
+
+	TP_fast_assign(
+		__entry->ino	= inode->i_ino;
+		__entry->dev	= inode->i_sb->s_dev;
+		__entry->lblk	= lblk;
+		__entry->pblk	= pblk;
+		__entry->len	= len;
+	),
+
+	TP_printk("dev %d,%d ino %lu lblk %u pblk %llu len %u",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  (unsigned long) __entry->ino,
+		  (unsigned) __entry->lblk,
+		  (unsigned long long) __entry->pblk,
+		  (unsigned short) __entry->len)
+);
+
+TRACE_EVENT(ext4_remove_blocks,
+	    TP_PROTO(struct inode *inode, struct ext4_extent *ex,
+		ext4_lblk_t from, ext4_fsblk_t to,
+		ext4_fsblk_t partial_cluster),
+
+	TP_ARGS(inode, ex, from, to, partial_cluster),
+
+	TP_STRUCT__entry(
+		__field(	ino_t,		ino	)
+		__field(	dev_t,		dev	)
+		__field(	ext4_lblk_t,	ee_lblk	)
+		__field(	ext4_fsblk_t,	ee_pblk	)
+		__field(	unsigned short,	ee_len	)
+		__field(	ext4_lblk_t,	from	)
+		__field(	ext4_lblk_t,	to	)
+		__field(	ext4_fsblk_t,	partial	)
+	),
+
+	TP_fast_assign(
+		__entry->ino		= inode->i_ino;
+		__entry->dev		= inode->i_sb->s_dev;
+		__entry->ee_lblk	= cpu_to_le32(ex->ee_block);
+		__entry->ee_pblk	= ext4_ext_pblock(ex);
+		__entry->ee_len		= ext4_ext_get_actual_len(ex);
+		__entry->from		= from;
+		__entry->to		= to;
+		__entry->partial	= partial_cluster;
+	),
+
+	TP_printk("dev %d,%d ino %lu extent [%u(%llu), %u]"
+		  "from %u to %u partial_cluster %u",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  (unsigned long) __entry->ino,
+		  (unsigned) __entry->ee_lblk,
+		  (unsigned long long) __entry->ee_pblk,
+		  (unsigned short) __entry->ee_len,
+		  (unsigned) __entry->from,
+		  (unsigned) __entry->to,
+		  (unsigned) __entry->partial)
+);
+
+TRACE_EVENT(ext4_ext_rm_leaf,
+	TP_PROTO(struct inode *inode, ext4_lblk_t start,
+		 struct ext4_extent *ex, ext4_fsblk_t partial_cluster),
+
+	TP_ARGS(inode, start, ex, partial_cluster),
+
+	TP_STRUCT__entry(
+		__field(	ino_t,		ino	)
+		__field(	dev_t,		dev	)
+		__field(	ext4_lblk_t,	start	)
+		__field(	ext4_lblk_t,	ee_lblk	)
+		__field(	ext4_fsblk_t,	ee_pblk	)
+		__field(	short,		ee_len	)
+		__field(	ext4_fsblk_t,	partial	)
+	),
+
+	TP_fast_assign(
+		__entry->ino		= inode->i_ino;
+		__entry->dev		= inode->i_sb->s_dev;
+		__entry->start		= start;
+		__entry->ee_lblk	= le32_to_cpu(ex->ee_block);
+		__entry->ee_pblk	= ext4_ext_pblock(ex);
+		__entry->ee_len		= ext4_ext_get_actual_len(ex);
+		__entry->partial	= partial_cluster;
+	),
+
+	TP_printk("dev %d,%d ino %lu start_lblk %u last_extent [%u(%llu), %u]"
+		  "partial_cluster %u",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  (unsigned long) __entry->ino,
+		  (unsigned) __entry->start,
+		  (unsigned) __entry->ee_lblk,
+		  (unsigned long long) __entry->ee_pblk,
+		  (unsigned short) __entry->ee_len,
+		  (unsigned) __entry->partial)
+);
+
+TRACE_EVENT(ext4_ext_rm_idx,
+	TP_PROTO(struct inode *inode, ext4_fsblk_t pblk),
+
+	TP_ARGS(inode, pblk),
+
+	TP_STRUCT__entry(
+		__field(	ino_t,		ino	)
+		__field(	dev_t,		dev	)
+		__field(	ext4_fsblk_t,	pblk	)
+	),
+
+	TP_fast_assign(
+		__entry->ino	= inode->i_ino;
+		__entry->dev	= inode->i_sb->s_dev;
+		__entry->pblk	= pblk;
+	),
+
+	TP_printk("dev %d,%d ino %lu index_pblk %llu",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  (unsigned long) __entry->ino,
+		  (unsigned long long) __entry->pblk)
+);
+
+TRACE_EVENT(ext4_ext_remove_space,
+	TP_PROTO(struct inode *inode, ext4_lblk_t start, int depth),
+
+	TP_ARGS(inode, start, depth),
+
+	TP_STRUCT__entry(
+		__field(	ino_t,		ino	)
+		__field(	dev_t,		dev	)
+		__field(	ext4_lblk_t,	start	)
+		__field(	int,		depth	)
+	),
+
+	TP_fast_assign(
+		__entry->ino	= inode->i_ino;
+		__entry->dev	= inode->i_sb->s_dev;
+		__entry->start	= start;
+		__entry->depth	= depth;
+	),
+
+	TP_printk("dev %d,%d ino %lu since %u depth %d",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  (unsigned long) __entry->ino,
+		  (unsigned) __entry->start,
+		  __entry->depth)
+);
+
+TRACE_EVENT(ext4_ext_remove_space_done,
+	TP_PROTO(struct inode *inode, ext4_lblk_t start, int depth,
+		ext4_lblk_t partial, unsigned short eh_entries),
+
+	TP_ARGS(inode, start, depth, partial, eh_entries),
+
+	TP_STRUCT__entry(
+		__field(	ino_t,		ino		)
+		__field(	dev_t,		dev		)
+		__field(	ext4_lblk_t,	start		)
+		__field(	int,		depth		)
+		__field(	ext4_lblk_t,	partial		)
+		__field(	unsigned short,	eh_entries	)
+	),
+
+	TP_fast_assign(
+		__entry->ino		= inode->i_ino;
+		__entry->dev		= inode->i_sb->s_dev;
+		__entry->start		= start;
+		__entry->depth		= depth;
+		__entry->partial	= partial;
+		__entry->eh_entries	= eh_entries;
+	),
+
+	TP_printk("dev %d,%d ino %lu since %u depth %d partial %u "
+		  "remaining_entries %u",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  (unsigned long) __entry->ino,
+		  (unsigned) __entry->start,
+		  __entry->depth,
+		  (unsigned) __entry->partial,
+		  (unsigned short) __entry->eh_entries)
+);
+
 #endif /* _TRACE_EXT4_H */
 
 /* This part must be outside protection */
-- 
cgit v1.2.3


From 5356f2615cd558c57a1f7d7528d1ad4de3640d96 Mon Sep 17 00:00:00 2001
From: Aditya Kali <adityakali@google.com>
Date: Fri, 9 Sep 2011 19:20:51 -0400
Subject: ext4: attempt to fix race in bigalloc code path

Currently, there exists a race between delayed allocated writes and
the writeback when bigalloc feature is in use. The race was because we
wanted to determine what blocks in a cluster are under delayed
allocation and we were using buffer_delayed(bh) check for it. But, the
writeback codepath clears this bit without any synchronization which
resulted in a race and an ext4 warning similar to:

EXT4-fs (ram1): ext4_da_update_reserve_space: ino 13, used 1 with only 0
		reserved data blocks

The race existed in two places.
(1) between ext4_find_delalloc_range() and ext4_map_blocks() when called from
    writeback code path.
(2) between ext4_find_delalloc_range() and ext4_da_get_block_prep() (where
    buffer_delayed(bh) is set.

To fix (1), this patch introduces a new buffer_head state bit -
BH_Da_Mapped.  This bit is set under the protection of
EXT4_I(inode)->i_data_sem when we have actually mapped the delayed
allocated blocks during the writeout time. We can now reliably check
for this bit inside ext4_find_delalloc_range() to determine whether
the reservation for the blocks have already been claimed or not.

To fix (2), it was necessary to set buffer_delay(bh) under the
protection of i_data_sem.  So, I extracted the very beginning of
ext4_map_blocks into a new function - ext4_da_map_blocks() - and
performed the required setting of bh_delay bit and the quota
reservation under the protection of i_data_sem.  These two fixes makes
the checking of buffer_delay(bh) and buffer_da_mapped(bh) consistent,
thus removing the race.

Tested: I was able to reproduce the problem by running 'dd' and
'fsync' in parallel. Also, xfstests sometimes used to reproduce this
race. After the fix both my test and xfstests were successful and no
race (warning message) was observed.

Google-Bug-Id: 4997027

Signed-off-by: Aditya Kali <adityakali@google.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/ext4.h    |   5 +-
 fs/ext4/extents.c |  38 +++++---------
 fs/ext4/inode.c   | 146 +++++++++++++++++++++++++++++++++++++++++++-----------
 3 files changed, 134 insertions(+), 55 deletions(-)

(limited to 'fs/ext4/extents.c')

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 751277a4890c..1bbd2caebe7f 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1893,7 +1893,6 @@ extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
 extern qsize_t *ext4_get_reserved_space(struct inode *inode);
 extern void ext4_da_update_reserve_space(struct inode *inode,
 					int used, int quota_claim);
-extern int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock);
 
 /* indirect.c */
 extern int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
@@ -2300,10 +2299,14 @@ enum ext4_state_bits {
 				 * never, ever appear in a buffer_head's state
 				 * flag. See EXT4_MAP_FROM_CLUSTER to see where
 				 * this is used. */
+	BH_Da_Mapped,	/* Delayed allocated block that now has a mapping. This
+			 * flag is set when ext4_map_blocks is called on a
+			 * delayed allocated block to get its real mapping. */
 };
 
 BUFFER_FNS(Uninit, uninit)
 TAS_BUFFER_FNS(Uninit, uninit)
+BUFFER_FNS(Da_Mapped, da_mapped)
 
 /*
  * Add new method to test wether block and inode bitmaps are properly
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 9b119308daea..ad39627c1fbc 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -3296,28 +3296,9 @@ static int ext4_find_delalloc_range(struct inode *inode,
 
 	while ((i >= lblk_start) && (i <= lblk_end)) {
 		page = find_get_page(mapping, index);
-		if (!page || !PageDirty(page))
+		if (!page)
 			goto nextpage;
 
-		if (PageWriteback(page)) {
-			/*
-			 * This might be a race with allocation and writeout. In
-			 * this case we just assume that the rest of the range
-			 * will eventually be written and there wont be any
-			 * delalloc blocks left.
-			 * TODO: the above assumption is troublesome, but might
-			 * work better in practice. other option could be note
-			 * somewhere that the cluster is getting written out and
-			 * detect that here.
-			 */
-			page_cache_release(page);
-			trace_ext4_find_delalloc_range(inode,
-					lblk_start, lblk_end,
-					search_hint_reverse,
-					0, i);
-			return 0;
-		}
-
 		if (!page_has_buffers(page))
 			goto nextpage;
 
@@ -3340,7 +3321,11 @@ static int ext4_find_delalloc_range(struct inode *inode,
 				continue;
 			}
 
-			if (buffer_delay(bh)) {
+			/* Check if the buffer is delayed allocated and that it
+			 * is not yet mapped. (when da-buffers are mapped during
+			 * their writeout, their da_mapped bit is set.)
+			 */
+			if (buffer_delay(bh) && !buffer_da_mapped(bh)) {
 				page_cache_release(page);
 				trace_ext4_find_delalloc_range(inode,
 						lblk_start, lblk_end,
@@ -4106,6 +4091,7 @@ got_allocated_blocks:
 			ext4_da_update_reserve_space(inode, allocated_clusters,
 							1);
 			if (reserved_clusters < allocated_clusters) {
+				struct ext4_inode_info *ei = EXT4_I(inode);
 				int reservation = allocated_clusters -
 						  reserved_clusters;
 				/*
@@ -4148,11 +4134,11 @@ got_allocated_blocks:
 				 *   remaining blocks finally gets written, we
 				 *   could claim them.
 				 */
-				while (reservation) {
-					ext4_da_reserve_space(inode,
-							      map->m_lblk);
-					reservation--;
-				}
+				dquot_reserve_block(inode,
+						EXT4_C2B(sbi, reservation));
+				spin_lock(&ei->i_block_reservation_lock);
+				ei->i_reserved_data_blocks += reservation;
+				spin_unlock(&ei->i_block_reservation_lock);
 			}
 		}
 	}
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 2dcd4fed96ec..1380cd29c312 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -397,6 +397,49 @@ static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx,
 	return num;
 }
 
+/*
+ * Sets the BH_Da_Mapped bit on the buffer heads corresponding to the given map.
+ */
+static void set_buffers_da_mapped(struct inode *inode,
+				   struct ext4_map_blocks *map)
+{
+	struct address_space *mapping = inode->i_mapping;
+	struct pagevec pvec;
+	int i, nr_pages;
+	pgoff_t index, end;
+
+	index = map->m_lblk >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
+	end = (map->m_lblk + map->m_len - 1) >>
+		(PAGE_CACHE_SHIFT - inode->i_blkbits);
+
+	pagevec_init(&pvec, 0);
+	while (index <= end) {
+		nr_pages = pagevec_lookup(&pvec, mapping, index,
+					  min(end - index + 1,
+					      (pgoff_t)PAGEVEC_SIZE));
+		if (nr_pages == 0)
+			break;
+		for (i = 0; i < nr_pages; i++) {
+			struct page *page = pvec.pages[i];
+			struct buffer_head *bh, *head;
+
+			if (unlikely(page->mapping != mapping) ||
+			    !PageDirty(page))
+				break;
+
+			if (page_has_buffers(page)) {
+				bh = head = page_buffers(page);
+				do {
+					set_buffer_da_mapped(bh);
+					bh = bh->b_this_page;
+				} while (bh != head);
+			}
+			index++;
+		}
+		pagevec_release(&pvec);
+	}
+}
+
 /*
  * The ext4_map_blocks() function tries to look up the requested blocks,
  * and returns if the blocks are already mapped.
@@ -516,9 +559,17 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
 			(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE))
 			ext4_da_update_reserve_space(inode, retval, 1);
 	}
-	if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
+	if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
 		ext4_clear_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED);
 
+		/* If we have successfully mapped the delayed allocated blocks,
+		 * set the BH_Da_Mapped bit on them. Its important to do this
+		 * under the protection of i_data_sem.
+		 */
+		if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED)
+			set_buffers_da_mapped(inode, map);
+	}
+
 	up_write((&EXT4_I(inode)->i_data_sem));
 	if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
 		int ret = check_block_validity(inode, map);
@@ -1038,7 +1089,7 @@ static int ext4_journalled_write_end(struct file *file,
 /*
  * Reserve a single cluster located at lblock
  */
-int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock)
+static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock)
 {
 	int retries = 0;
 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
@@ -1153,6 +1204,7 @@ static void ext4_da_page_release_reservation(struct page *page,
 		if ((offset <= curr_off) && (buffer_delay(bh))) {
 			to_release++;
 			clear_buffer_delay(bh);
+			clear_buffer_da_mapped(bh);
 		}
 		curr_off = next_off;
 	} while ((bh = bh->b_this_page) != head);
@@ -1271,6 +1323,8 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd,
 						clear_buffer_delay(bh);
 						bh->b_blocknr = pblock;
 					}
+					if (buffer_da_mapped(bh))
+						clear_buffer_da_mapped(bh);
 					if (buffer_unwritten(bh) ||
 					    buffer_mapped(bh))
 						BUG_ON(bh->b_blocknr != pblock);
@@ -1603,6 +1657,66 @@ static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh)
 	return (buffer_delay(bh) || buffer_unwritten(bh)) && buffer_dirty(bh);
 }
 
+/*
+ * This function is grabs code from the very beginning of
+ * ext4_map_blocks, but assumes that the caller is from delayed write
+ * time. This function looks up the requested blocks and sets the
+ * buffer delay bit under the protection of i_data_sem.
+ */
+static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
+			      struct ext4_map_blocks *map,
+			      struct buffer_head *bh)
+{
+	int retval;
+	sector_t invalid_block = ~((sector_t) 0xffff);
+
+	if (invalid_block < ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es))
+		invalid_block = ~0;
+
+	map->m_flags = 0;
+	ext_debug("ext4_da_map_blocks(): inode %lu, max_blocks %u,"
+		  "logical block %lu\n", inode->i_ino, map->m_len,
+		  (unsigned long) map->m_lblk);
+	/*
+	 * Try to see if we can get the block without requesting a new
+	 * file system block.
+	 */
+	down_read((&EXT4_I(inode)->i_data_sem));
+	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+		retval = ext4_ext_map_blocks(NULL, inode, map, 0);
+	else
+		retval = ext4_ind_map_blocks(NULL, inode, map, 0);
+
+	if (retval == 0) {
+		/*
+		 * XXX: __block_prepare_write() unmaps passed block,
+		 * is it OK?
+		 */
+		/* If the block was allocated from previously allocated cluster,
+		 * then we dont need to reserve it again. */
+		if (!(map->m_flags & EXT4_MAP_FROM_CLUSTER)) {
+			retval = ext4_da_reserve_space(inode, iblock);
+			if (retval)
+				/* not enough space to reserve */
+				goto out_unlock;
+		}
+
+		/* Clear EXT4_MAP_FROM_CLUSTER flag since its purpose is served
+		 * and it should not appear on the bh->b_state.
+		 */
+		map->m_flags &= ~EXT4_MAP_FROM_CLUSTER;
+
+		map_bh(bh, inode->i_sb, invalid_block);
+		set_buffer_new(bh);
+		set_buffer_delay(bh);
+	}
+
+out_unlock:
+	up_read((&EXT4_I(inode)->i_data_sem));
+
+	return retval;
+}
+
 /*
  * This is a special get_blocks_t callback which is used by
  * ext4_da_write_begin().  It will either return mapped block or
@@ -1620,10 +1734,6 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
 {
 	struct ext4_map_blocks map;
 	int ret = 0;
-	sector_t invalid_block = ~((sector_t) 0xffff);
-
-	if (invalid_block < ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es))
-		invalid_block = ~0;
 
 	BUG_ON(create == 0);
 	BUG_ON(bh->b_size != inode->i_sb->s_blocksize);
@@ -1636,29 +1746,9 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
 	 * preallocated blocks are unmapped but should treated
 	 * the same as allocated blocks.
 	 */
-	ret = ext4_map_blocks(NULL, inode, &map, 0);
-	if (ret < 0)
+	ret = ext4_da_map_blocks(inode, iblock, &map, bh);
+	if (ret <= 0)
 		return ret;
-	if (ret == 0) {
-		if (buffer_delay(bh))
-			return 0; /* Not sure this could or should happen */
-		/*
-		 * XXX: __block_write_begin() unmaps passed block, is it OK?
-		 */
-		/* If the block was allocated from previously allocated cluster,
-		 * then we dont need to reserve it again. */
-		if (!(map.m_flags & EXT4_MAP_FROM_CLUSTER)) {
-			ret = ext4_da_reserve_space(inode, iblock);
-			if (ret)
-				/* not enough space to reserve */
-				return ret;
-		}
-
-		map_bh(bh, inode->i_sb, invalid_block);
-		set_buffer_new(bh);
-		set_buffer_delay(bh);
-		return 0;
-	}
 
 	map_bh(bh, inode->i_sb, map.m_pblk);
 	bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;
-- 
cgit v1.2.3


From df3ab17072c31fbd394614711772682f0a956a2c Mon Sep 17 00:00:00 2001
From: Tao Ma <boyu.mt@taobao.com>
Date: Sat, 8 Oct 2011 15:53:49 -0400
Subject: ext4: fix the comment describing ext4_ext_search_right()

The comment describing what ext4_ext_search_right() does is incorrect.
We return 0 in *phys when *logical is the 'largest' allocated block,
not smallest.

Fix a few other typos while we're at it.

Cc: "Theodore Ts'o" <tytso@mit.edu>
Signed-off-by: Tao Ma <boyu.mt@taobao.com>
---
 fs/ext4/extents.c | 4 ++--
 fs/ext4/inode.c   | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'fs/ext4/extents.c')

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index ad39627c1fbc..142e3443476f 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -1263,7 +1263,7 @@ static int ext4_ext_search_left(struct inode *inode,
 /*
  * search the closest allocated block to the right for *logical
  * and returns it at @logical + it's physical address at @phys
- * if *logical is the smallest allocated block, the function
+ * if *logical is the largest allocated block, the function
  * returns 0 at @phys
  * return value contains 0 (success) or error code
  */
@@ -2168,7 +2168,7 @@ int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks,
 			 *  need to account for leaf block credit
 			 *
 			 *  bitmaps and block group descriptor blocks
-			 *  and other metadat blocks still need to be
+			 *  and other metadata blocks still need to be
 			 *  accounted.
 			 */
 			/* 1 bitmap, 1 block group descriptor */
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 1380cd29c312..4863238c3754 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -458,7 +458,7 @@ static void set_buffers_da_mapped(struct inode *inode,
  * the buffer head is mapped.
  *
  * It returns 0 if plain look up failed (blocks have not been allocated), in
- * that casem, buffer head is unmapped
+ * that case, buffer head is unmapped
  *
  * It returns the error in case of allocation failure.
  */
@@ -497,7 +497,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
 	 * Returns if the blocks have already allocated
 	 *
 	 * Note that if blocks have been preallocated
-	 * ext4_ext_get_block() returns th create = 0
+	 * ext4_ext_get_block() returns the create = 0
 	 * with buffer head unmapped.
 	 */
 	if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED)
-- 
cgit v1.2.3


From 6ee3b2122431fc75015c6114d4749de76422452b Mon Sep 17 00:00:00 2001
From: Tao Ma <boyu.mt@taobao.com>
Date: Sat, 8 Oct 2011 16:08:34 -0400
Subject: ext4: use le32_to_cpu for ext4_extent_idx.ei_block in
 ext4_ext_search_left()

ext4_extent_idx.e_block is __le32, so use le32_to_cpu() in
ext4_ext_search_left().

Signed-off-by: Tao Ma <boyu.mt@taobao.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/extents.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs/ext4/extents.c')

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 142e3443476f..f473ddf0bd94 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -1238,9 +1238,9 @@ static int ext4_ext_search_left(struct inode *inode,
 			if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) {
 				EXT4_ERROR_INODE(inode,
 				  "ix (%d) != EXT_FIRST_INDEX (%d) (depth %d)!",
-				  ix != NULL ? ix->ei_block : 0,
+				  ix != NULL ? le32_to_cpu(ix->ei_block) : 0,
 				  EXT_FIRST_INDEX(path[depth].p_hdr) != NULL ?
-				    EXT_FIRST_INDEX(path[depth].p_hdr)->ei_block : 0,
+		le32_to_cpu(EXT_FIRST_INDEX(path[depth].p_hdr)->ei_block) : 0,
 				  depth);
 				return -EIO;
 			}
-- 
cgit v1.2.3


From f472e02669073e4f1a388142bafa0f806fae841c Mon Sep 17 00:00:00 2001
From: Tao Ma <boyu.mt@taobao.com>
Date: Mon, 17 Oct 2011 10:13:46 -0400
Subject: ext4: avoid stamping on other memories in ext4_ext_insert_index()

Add a sanity check to make sure ix hasn't gone beyond the valid bounds
of the extent block.

Signed-off-by: Tao Ma <boyu.mt@taobao.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/extents.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'fs/ext4/extents.c')

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index f473ddf0bd94..5c4861210d4c 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -779,6 +779,11 @@ static int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
 		ix = curp->p_idx;
 	}
 
+	if (unlikely(ix > EXT_MAX_INDEX(curp->p_hdr))) {
+		EXT4_ERROR_INODE(inode, "ix > EXT_MAX_INDEX!");
+		return -EIO;
+	}
+
 	ix->ei_block = cpu_to_le32(logical);
 	ext4_idx_store_pblock(ix, ptr);
 	le16_add_cpu(&curp->p_hdr->eh_entries, 1);
-- 
cgit v1.2.3


From ee90d57e20bd4749dda3b14397392b18b89dc3ef Mon Sep 17 00:00:00 2001
From: H Hartley Sweeten <hartleys@visionengravers.com>
Date: Tue, 18 Oct 2011 11:01:51 -0400
Subject: ext4: quiet sparse noise about plain integer as NULL pointer

The third parameter to ext4_free_blocks is a struct buffer_head *.  This
parameter should be NULL not 0.

This quiets the sparse noise:

warning: Using plain integer as NULL pointer

Signed-off-by: H Hartley Sweeten <hsweeten@visionengravers.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/extents.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs/ext4/extents.c')

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 5c4861210d4c..2fc6cc0a51ca 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -2291,7 +2291,7 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
 		start = ext4_ext_pblock(ex);
 
 		ext_debug("free first %u blocks starting %llu\n", num, start);
-		ext4_free_blocks(handle, inode, 0, start, num, flags);
+		ext4_free_blocks(handle, inode, NULL, start, num, flags);
 
 	} else {
 		printk(KERN_INFO "strange request: removal(2) "
-- 
cgit v1.2.3


From 1939dd84b3f52e9c8d1b46dffae2058f16a3ff6a Mon Sep 17 00:00:00 2001
From: Dmitry Monakhov <dmonakhov@openvz.org>
Date: Sat, 22 Oct 2011 01:26:05 -0400
Subject: ext4: cleanup ext4_ext_grow_indepth code

Currently code make an impression what grow procedure is very complicated
and some mythical paths, blocks are involved. But in fact grow in depth
it relatively simple procedure:
 1) Just create new meta block and copy root data to that block.
 2) Convert root from extent to index if old depth == 0
 3) Update root block pointer

This patch does:
 - Reorganize code to make it more self explanatory
 - Do not pass path parameter to new_meta_block() in order to
   provoke allocation from inode's group because top-level block
   should site closer to it's inode, but not to leaf data block.

   [ This happens anyway, due to logic in mballoc; we should drop
     the path parameter from new_meta_block() entirely.  -- tytso ]

Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/extents.c | 41 +++++++++++++++--------------------------
 1 file changed, 15 insertions(+), 26 deletions(-)

(limited to 'fs/ext4/extents.c')

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 2fc6cc0a51ca..385ebb435332 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -1050,16 +1050,14 @@ cleanup:
  */
 static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
 				 unsigned int flags,
-				 struct ext4_ext_path *path,
 				 struct ext4_extent *newext)
 {
-	struct ext4_ext_path *curp = path;
 	struct ext4_extent_header *neh;
 	struct buffer_head *bh;
 	ext4_fsblk_t newblock;
 	int err = 0;
 
-	newblock = ext4_ext_new_meta_block(handle, inode, path,
+	newblock = ext4_ext_new_meta_block(handle, inode, NULL,
 		newext, &err, flags);
 	if (newblock == 0)
 		return err;
@@ -1079,7 +1077,8 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
 	}
 
 	/* move top-level index/leaf into new block */
-	memmove(bh->b_data, curp->p_hdr, sizeof(EXT4_I(inode)->i_data));
+	memmove(bh->b_data, EXT4_I(inode)->i_data,
+		sizeof(EXT4_I(inode)->i_data));
 
 	/* set size of new block */
 	neh = ext_block_hdr(bh);
@@ -1097,32 +1096,23 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
 	if (err)
 		goto out;
 
-	/* create index in new top-level index: num,max,pointer */
-	err = ext4_ext_get_access(handle, inode, curp);
-	if (err)
-		goto out;
-
-	curp->p_hdr->eh_magic = EXT4_EXT_MAGIC;
-	curp->p_hdr->eh_max = cpu_to_le16(ext4_ext_space_root_idx(inode, 0));
-	curp->p_hdr->eh_entries = cpu_to_le16(1);
-	curp->p_idx = EXT_FIRST_INDEX(curp->p_hdr);
-
-	if (path[0].p_hdr->eh_depth)
-		curp->p_idx->ei_block =
-			EXT_FIRST_INDEX(path[0].p_hdr)->ei_block;
-	else
-		curp->p_idx->ei_block =
-			EXT_FIRST_EXTENT(path[0].p_hdr)->ee_block;
-	ext4_idx_store_pblock(curp->p_idx, newblock);
-
+	/* Update top-level index: num,max,pointer */
 	neh = ext_inode_hdr(inode);
+	neh->eh_entries = cpu_to_le16(1);
+	ext4_idx_store_pblock(EXT_FIRST_INDEX(neh), newblock);
+	if (neh->eh_depth == 0) {
+		/* Root extent block becomes index block */
+		neh->eh_max = cpu_to_le16(ext4_ext_space_root_idx(inode, 0));
+		EXT_FIRST_INDEX(neh)->ei_block =
+			EXT_FIRST_EXTENT(neh)->ee_block;
+	}
 	ext_debug("new root: num %d(%d), lblock %d, ptr %llu\n",
 		  le16_to_cpu(neh->eh_entries), le16_to_cpu(neh->eh_max),
 		  le32_to_cpu(EXT_FIRST_INDEX(neh)->ei_block),
 		  ext4_idx_pblock(EXT_FIRST_INDEX(neh)));
 
-	neh->eh_depth = cpu_to_le16(path->p_depth + 1);
-	err = ext4_ext_dirty(handle, inode, curp);
+	neh->eh_depth = cpu_to_le16(neh->eh_depth + 1);
+	ext4_mark_inode_dirty(handle, inode);
 out:
 	brelse(bh);
 
@@ -1170,8 +1160,7 @@ repeat:
 			err = PTR_ERR(path);
 	} else {
 		/* tree is full, time to grow in depth */
-		err = ext4_ext_grow_indepth(handle, inode, flags,
-					    path, newext);
+		err = ext4_ext_grow_indepth(handle, inode, flags, newext);
 		if (err)
 			goto out;
 
-- 
cgit v1.2.3


From 750c9c47a5f9daa88333ac435a1afe4d4b428230 Mon Sep 17 00:00:00 2001
From: Dmitry Monakhov <dmonakhov@openvz.org>
Date: Tue, 25 Oct 2011 05:35:05 -0400
Subject: ext4: remove messy logic from ext4_ext_rm_leaf

- Both callers(truncate and punch_hole) already aligned left end point
  so we no longer need split logic here.
- Remove dead duplicated code.
- Call ext4_ext_dirty only after we have updated eh_entries, otherwise
  we'll loose entries update. Regression caused by d583fb87a3ff0
  266'th testcase in xfstests (http://patchwork.ozlabs.org/patch/120872)

Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/extents.c | 103 +++++++-----------------------------------------------
 1 file changed, 12 insertions(+), 91 deletions(-)

(limited to 'fs/ext4/extents.c')

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 385ebb435332..d0e3f333d3f0 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -2311,13 +2311,12 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
 	int err = 0, correct_index = 0;
 	int depth = ext_depth(inode), credits;
 	struct ext4_extent_header *eh;
-	ext4_lblk_t a, b, block;
+	ext4_lblk_t a, b;
 	unsigned num;
 	ext4_lblk_t ex_ee_block;
 	unsigned short ex_ee_len;
 	unsigned uninitialized = 0;
 	struct ext4_extent *ex;
-	struct ext4_map_blocks map;
 
 	/* the header must be checked already in ext4_ext_remove_space() */
 	ext_debug("truncate since %u in leaf\n", start);
@@ -2360,86 +2359,18 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
 			ex_ee_block = le32_to_cpu(ex->ee_block);
 			ex_ee_len = ext4_ext_get_actual_len(ex);
 			continue;
-		} else if (a != ex_ee_block &&
-			b != ex_ee_block + ex_ee_len - 1) {
-			/*
-			 * If this is a truncate, then this condition should
-			 * never happen because at least one of the end points
-			 * needs to be on the edge of the extent.
-			 */
-			if (end == EXT_MAX_BLOCKS - 1) {
-				ext_debug("  bad truncate %u:%u\n",
-						start, end);
-				block = 0;
-				num = 0;
-				err = -EIO;
-				goto out;
-			}
-			/*
-			 * else this is a hole punch, so the extent needs to
-			 * be split since neither edge of the hole is on the
-			 * extent edge
-			 */
-			else{
-				map.m_pblk = ext4_ext_pblock(ex);
-				map.m_lblk = ex_ee_block;
-				map.m_len = b - ex_ee_block;
-
-				err = ext4_split_extent(handle,
-					inode, path, &map, 0,
-					EXT4_GET_BLOCKS_PUNCH_OUT_EXT |
-					EXT4_GET_BLOCKS_PRE_IO);
-
-				if (err < 0)
-					goto out;
-
-				ex_ee_len = ext4_ext_get_actual_len(ex);
-
-				b = ex_ee_block+ex_ee_len - 1 < end ?
-					ex_ee_block+ex_ee_len - 1 : end;
-
-				/* Then remove tail of this extent */
-				block = ex_ee_block;
-				num = a - block;
-			}
+		} else if (b != ex_ee_block + ex_ee_len - 1) {
+			EXT4_ERROR_INODE(inode,"  bad truncate %u:%u\n",
+					 start, end);
+			err = -EIO;
+			goto out;
 		} else if (a != ex_ee_block) {
 			/* remove tail of the extent */
-			block = ex_ee_block;
-			num = a - block;
-		} else if (b != ex_ee_block + ex_ee_len - 1) {
-			/* remove head of the extent */
-			block = b;
-			num =  ex_ee_block + ex_ee_len - b;
-
-			/*
-			 * If this is a truncate, this condition
-			 * should never happen
-			 */
-			if (end == EXT_MAX_BLOCKS - 1) {
-				ext_debug("  bad truncate %u:%u\n",
-					start, end);
-				err = -EIO;
-				goto out;
-			}
+			num = a - ex_ee_block;
 		} else {
 			/* remove whole extent: excellent! */
-			block = ex_ee_block;
 			num = 0;
-			if (a != ex_ee_block) {
-				ext_debug("  bad truncate %u:%u\n",
-					start, end);
-				err = -EIO;
-				goto out;
-			}
-
-			if (b != ex_ee_block + ex_ee_len - 1) {
-				ext_debug("  bad truncate %u:%u\n",
-					start, end);
-				err = -EIO;
-				goto out;
-			}
 		}
-
 		/*
 		 * 3 for leaf, sb, and inode plus 2 (bmap and group
 		 * descriptor) for each block group; assume two block
@@ -2466,19 +2397,10 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
 		if (err)
 			goto out;
 
-		if (num == 0) {
+		if (num == 0)
 			/* this extent is removed; mark slot entirely unused */
 			ext4_ext_store_pblock(ex, 0);
-		} else if (block != ex_ee_block) {
-			/*
-			 * If this was a head removal, then we need to update
-			 * the physical block since it is now at a different
-			 * location
-			 */
-			ext4_ext_store_pblock(ex, ext4_ext_pblock(ex) + (b-a));
-		}
 
-		ex->ee_block = cpu_to_le32(block);
 		ex->ee_len = cpu_to_le16(num);
 		/*
 		 * Do not mark uninitialized if all the blocks in the
@@ -2486,11 +2408,6 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
 		 */
 		if (uninitialized && num)
 			ext4_ext_mark_uninitialized(ex);
-
-		err = ext4_ext_dirty(handle, inode, path + depth);
-		if (err)
-			goto out;
-
 		/*
 		 * If the extent was completely released,
 		 * we need to remove it from the leaf
@@ -2513,6 +2430,10 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
 		} else
 			*partial_cluster = 0;
 
+		err = ext4_ext_dirty(handle, inode, path + depth);
+		if (err)
+			goto out;
+
 		ext_debug("new extent: %u:%u:%llu\n", block, num,
 				ext4_ext_pblock(ex));
 		ex--;
-- 
cgit v1.2.3


From a4e5d88b1b24827f4f6a3cba43228936a67d81ba Mon Sep 17 00:00:00 2001
From: Dmitry Monakhov <dmonakhov@openvz.org>
Date: Tue, 25 Oct 2011 08:15:12 -0400
Subject: ext4: update EOFBLOCKS flag on fallocate properly

EOFBLOCK_FL should be updated if called w/o FALLOCATE_FL_KEEP_SIZE
Currently it happens only if new extent was allocated.

TESTCASE:
fallocate test_file -n -l4096
fallocate test_file -l4096
Last fallocate cmd has updated size, but keept EOFBLOCK_FL set. And
fsck will complain about that.

Also remove ping pong in ext4_fallocate() in case of new extents,
where ext4_ext_map_blocks() clear EOFBLOCKS bit, and later
ext4_falloc_update_inode() restore it again.

Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/ext4.h    |  2 ++
 fs/ext4/extents.c | 28 +++++++++++++++++-----------
 fs/ext4/inode.c   |  6 ++++--
 3 files changed, 23 insertions(+), 13 deletions(-)

(limited to 'fs/ext4/extents.c')

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 4546da4f26c4..3647ae0b21ab 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -539,6 +539,8 @@ struct ext4_new_group_data {
 #define EXT4_GET_BLOCKS_PUNCH_OUT_EXT		0x0020
 	/* Don't normalize allocation size (used for fallocate) */
 #define EXT4_GET_BLOCKS_NO_NORMALIZE		0x0040
+	/* Request will not result in inode size update (user for fallocate) */
+#define EXT4_GET_BLOCKS_KEEP_SIZE		0x0080
 
 /*
  * Flags used by ext4_free_blocks
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index d0e3f333d3f0..8686eb756b38 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -3432,14 +3432,8 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
 
 	/* buffered write, writepage time, convert*/
 	ret = ext4_ext_convert_to_initialized(handle, inode, map, path);
-	if (ret >= 0) {
+	if (ret >= 0)
 		ext4_update_inode_fsync_trans(handle, inode, 1);
-		err = check_eofblocks_fl(handle, inode, map->m_lblk, path,
-					 map->m_len);
-		if (err < 0)
-			goto out2;
-	}
-
 out:
 	if (ret <= 0) {
 		err = ret;
@@ -3480,6 +3474,12 @@ out:
 
 map_out:
 	map->m_flags |= EXT4_MAP_MAPPED;
+	if ((flags & EXT4_GET_BLOCKS_KEEP_SIZE) == 0) {
+		err = check_eofblocks_fl(handle, inode, map->m_lblk, path,
+					 map->m_len);
+		if (err < 0)
+			goto out2;
+	}
 out1:
 	if (allocated > map->m_len)
 		allocated = map->m_len;
@@ -3955,7 +3955,10 @@ got_allocated_blocks:
 			map->m_flags |= EXT4_MAP_UNINIT;
 	}
 
-	err = check_eofblocks_fl(handle, inode, map->m_lblk, path, ar.len);
+	err = 0;
+	if ((flags & EXT4_GET_BLOCKS_KEEP_SIZE) == 0)
+		err = check_eofblocks_fl(handle, inode, map->m_lblk,
+					 path, ar.len);
 	if (!err)
 		err = ext4_ext_insert_extent(handle, inode, path,
 					     &newex, flags);
@@ -4214,6 +4217,7 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
 	int ret = 0;
 	int ret2 = 0;
 	int retries = 0;
+	int flags;
 	struct ext4_map_blocks map;
 	unsigned int credits, blkbits = inode->i_blkbits;
 
@@ -4250,6 +4254,10 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
 		trace_ext4_fallocate_exit(inode, offset, max_blocks, ret);
 		return ret;
 	}
+	flags = EXT4_GET_BLOCKS_CREATE_UNINIT_EXT |
+		EXT4_GET_BLOCKS_NO_NORMALIZE;
+	if (mode & FALLOC_FL_KEEP_SIZE)
+		flags |= EXT4_GET_BLOCKS_KEEP_SIZE;
 retry:
 	while (ret >= 0 && ret < max_blocks) {
 		map.m_lblk = map.m_lblk + ret;
@@ -4259,9 +4267,7 @@ retry:
 			ret = PTR_ERR(handle);
 			break;
 		}
-		ret = ext4_map_blocks(handle, inode, &map,
-				      EXT4_GET_BLOCKS_CREATE_UNINIT_EXT |
-				      EXT4_GET_BLOCKS_NO_NORMALIZE);
+		ret = ext4_map_blocks(handle, inode, &map, flags);
 		if (ret <= 0) {
 #ifdef EXT4FS_DEBUG
 			WARN_ON(ret <= 0);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index ff6aace0bb3c..87ec615f0fd6 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -477,9 +477,11 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
 	 */
 	down_read((&EXT4_I(inode)->i_data_sem));
 	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
-		retval = ext4_ext_map_blocks(handle, inode, map, 0);
+		retval = ext4_ext_map_blocks(handle, inode, map, flags &
+					     EXT4_GET_BLOCKS_KEEP_SIZE);
 	} else {
-		retval = ext4_ind_map_blocks(handle, inode, map, 0);
+		retval = ext4_ind_map_blocks(handle, inode, map, flags &
+					     EXT4_GET_BLOCKS_KEEP_SIZE);
 	}
 	up_read((&EXT4_I(inode)->i_data_sem));
 
-- 
cgit v1.2.3


From f85b287a01237857a50c93868231f7e831581a27 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@oracle.com>
Date: Wed, 26 Oct 2011 03:42:36 -0400
Subject: ext4: error handling fix in ext4_ext_convert_to_initialized()

When allocated is unsigned it breaks the error handling at the end
of the function when we call:
	allocated = ext4_split_extent(...);
	if (allocated < 0)
		err = allocated;

I've made it a signed int instead of unsigned.

Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/extents.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs/ext4/extents.c')

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 8686eb756b38..02a4d80573a1 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -2928,7 +2928,8 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 	struct ext4_extent zero_ex;
 	struct ext4_extent *ex;
 	ext4_lblk_t ee_block, eof_block;
-	unsigned int allocated, ee_len, depth;
+	unsigned int ee_len, depth;
+	int allocated;
 	int err = 0;
 	int split_flag = 0;
 
-- 
cgit v1.2.3


From 6f8ff537266ee5396c920fb0c842a21df3055ff3 Mon Sep 17 00:00:00 2001
From: Curt Wohlgemuth <curtw@google.com>
Date: Wed, 26 Oct 2011 04:38:59 -0400
Subject: ext4: handle NULL p_ext in ext4_ext_next_allocated_block()

In ext4_ext_next_allocated_block(), the path[depth] might
have a p_ext that is NULL -- see ext4_ext_binsearch().  In
such a case, dereferencing it will crash the machine.

This patch checks for p_ext == NULL in
ext4_ext_next_allocated_block() before dereferencinging it.

Tested using a hand-crafted an inode with eh_entries == 0 in
an extent block, verified that running FIEMAP on it crashes
without this patch, works fine with it.

Signed-off-by: Curt Wohlgemuth <curtw@google.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/extents.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs/ext4/extents.c')

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 02a4d80573a1..797b63b59740 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -1392,7 +1392,8 @@ ext4_ext_next_allocated_block(struct ext4_ext_path *path)
 	while (depth >= 0) {
 		if (depth == path->p_depth) {
 			/* leaf */
-			if (path[depth].p_ext !=
+			if (path[depth].p_ext &&
+				path[depth].p_ext !=
 					EXT_LAST_EXTENT(path[depth].p_hdr))
 			  return le32_to_cpu(path[depth].p_ext[1].ee_block);
 		} else {
-- 
cgit v1.2.3


From b3ff05690845911cc40387176f0bc5a7af9ef3ff Mon Sep 17 00:00:00 2001
From: Tao Ma <boyu.mt@taobao.com>
Date: Wed, 26 Oct 2011 11:08:39 -0400
Subject: ext4: don't check io->flag when setting EXT4_STATE_DIO_UNWRITTEN
 inode state

When we want to convert the unitialized extent in direct write, we can
either do it in ext4_end_io_nolock(AIO case) or in
ext4_ext_direct_IO(non AIO case) and EXT4_I(inode)->cur_aio_dio is a
guard for ext4_ext_map_blocks to find the right case.  In e9e3bcecf,
we mistakenly change it by:

-			if (io)
+			if (io && !(io->flag & EXT4_IO_END_UNWRITTEN)) {
 				io->flag = EXT4_IO_END_UNWRITTEN;
-			else
+				atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten);
+			} else
 				ext4_set_inode_state(inode,
 						     EXT4_STATE_DIO_UNWRITTEN);

So now if we map 2 blocks, and the first one set the
EXT_IO_END_UNWRITTEN, the 2nd mapping will set inode state because of
the check for the flag. This is wrong.

Cc: Eric Sandeen <sandeen@redhat.com>
Signed-off-by: Tao Ma <boyu.mt@taobao.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/extents.c | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

(limited to 'fs/ext4/extents.c')

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 797b63b59740..c2ac06cb2d46 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -3390,9 +3390,11 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
 		 * that this IO needs to conversion to written when IO is
 		 * completed
 		 */
-		if (io && !(io->flag & EXT4_IO_END_UNWRITTEN)) {
-			io->flag = EXT4_IO_END_UNWRITTEN;
-			atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten);
+		if (io) {
+			if (!(io->flag & EXT4_IO_END_UNWRITTEN)) {
+				io->flag = EXT4_IO_END_UNWRITTEN;
+				atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten);
+			}
 		} else
 			ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
 		if (ext4_should_dioread_nolock(inode))
@@ -3946,9 +3948,11 @@ got_allocated_blocks:
 		 * that we need to perform conversion when IO is done.
 		 */
 		if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
-			if (io && !(io->flag & EXT4_IO_END_UNWRITTEN)) {
-				io->flag = EXT4_IO_END_UNWRITTEN;
-				atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten);
+			if (io) {
+				if (!(io->flag & EXT4_IO_END_UNWRITTEN)) {
+					io->flag = EXT4_IO_END_UNWRITTEN;
+					atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten);
+				}
 			} else
 				ext4_set_inode_state(inode,
 						     EXT4_STATE_DIO_UNWRITTEN);
-- 
cgit v1.2.3


From 6f91bc5fda82d2c49b4f7fb29424cf6a3c7574bc Mon Sep 17 00:00:00 2001
From: Eric Gouriou <egouriou@google.com>
Date: Thu, 27 Oct 2011 11:43:23 -0400
Subject: ext4: optimize ext4_ext_convert_to_initialized()

This patch introduces a fast path in ext4_ext_convert_to_initialized()
for the case when the conversion can be performed by transferring
the newly initialized blocks from the uninitialized extent into
an adjacent initialized extent. Doing so removes the expensive
invocations of memmove() which occur during extent insertion and
the subsequent merge.

In practice this should be the common case for clients performing
append writes into files pre-allocated via
fallocate(FALLOC_FL_KEEP_SIZE). In such a workload performed via
direct IO and when using a suboptimal implementation of memmove()
(x86_64 prior to the 2.6.39 rewrite), this patch reduces kernel CPU
consumption by 32%.

Two new trace points are added to ext4_ext_convert_to_initialized()
to offer visibility into its operations. No exit trace point has
been added due to the multiplicity of return points. This can be
revisited once the upstream cleanup is backported.

Signed-off-by: Eric Gouriou <egouriou@google.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/extents.c           | 93 +++++++++++++++++++++++++++++++++++++++++++++
 fs/ext4/super.c             |  1 +
 include/trace/events/ext4.h | 82 +++++++++++++++++++++++++++++++++++++++
 3 files changed, 176 insertions(+)

(limited to 'fs/ext4/extents.c')

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index c2ac06cb2d46..8b6a17b60970 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -2919,12 +2919,23 @@ out:
  *   a> There is no split required: Entire extent should be initialized
  *   b> Splits in two extents: Write is happening at either end of the extent
  *   c> Splits in three extents: Somone is writing in middle of the extent
+ *
+ * Pre-conditions:
+ *  - The extent pointed to by 'path' is uninitialized.
+ *  - The extent pointed to by 'path' contains a superset
+ *    of the logical span [map->m_lblk, map->m_lblk + map->m_len).
+ *
+ * Post-conditions on success:
+ *  - the returned value is the number of blocks beyond map->l_lblk
+ *    that are allocated and initialized.
+ *    It is guaranteed to be >= map->m_len.
  */
 static int ext4_ext_convert_to_initialized(handle_t *handle,
 					   struct inode *inode,
 					   struct ext4_map_blocks *map,
 					   struct ext4_ext_path *path)
 {
+	struct ext4_extent_header *eh;
 	struct ext4_map_blocks split_map;
 	struct ext4_extent zero_ex;
 	struct ext4_extent *ex;
@@ -2944,11 +2955,93 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 		eof_block = map->m_lblk + map->m_len;
 
 	depth = ext_depth(inode);
+	eh = path[depth].p_hdr;
 	ex = path[depth].p_ext;
 	ee_block = le32_to_cpu(ex->ee_block);
 	ee_len = ext4_ext_get_actual_len(ex);
 	allocated = ee_len - (map->m_lblk - ee_block);
 
+	trace_ext4_ext_convert_to_initialized_enter(inode, map, ex);
+
+	/* Pre-conditions */
+	BUG_ON(!ext4_ext_is_uninitialized(ex));
+	BUG_ON(!in_range(map->m_lblk, ee_block, ee_len));
+	BUG_ON(map->m_lblk + map->m_len > ee_block + ee_len);
+
+	/*
+	 * Attempt to transfer newly initialized blocks from the currently
+	 * uninitialized extent to its left neighbor. This is much cheaper
+	 * than an insertion followed by a merge as those involve costly
+	 * memmove() calls. This is the common case in steady state for
+	 * workloads doing fallocate(FALLOC_FL_KEEP_SIZE) followed by append
+	 * writes.
+	 *
+	 * Limitations of the current logic:
+	 *  - L1: we only deal with writes at the start of the extent.
+	 *    The approach could be extended to writes at the end
+	 *    of the extent but this scenario was deemed less common.
+	 *  - L2: we do not deal with writes covering the whole extent.
+	 *    This would require removing the extent if the transfer
+	 *    is possible.
+	 *  - L3: we only attempt to merge with an extent stored in the
+	 *    same extent tree node.
+	 */
+	if ((map->m_lblk == ee_block) &&	/*L1*/
+		(map->m_len < ee_len) &&	/*L2*/
+		(ex > EXT_FIRST_EXTENT(eh))) {	/*L3*/
+		struct ext4_extent *prev_ex;
+		ext4_lblk_t prev_lblk;
+		ext4_fsblk_t prev_pblk, ee_pblk;
+		unsigned int prev_len, write_len;
+
+		prev_ex = ex - 1;
+		prev_lblk = le32_to_cpu(prev_ex->ee_block);
+		prev_len = ext4_ext_get_actual_len(prev_ex);
+		prev_pblk = ext4_ext_pblock(prev_ex);
+		ee_pblk = ext4_ext_pblock(ex);
+		write_len = map->m_len;
+
+		/*
+		 * A transfer of blocks from 'ex' to 'prev_ex' is allowed
+		 * upon those conditions:
+		 * - C1: prev_ex is initialized,
+		 * - C2: prev_ex is logically abutting ex,
+		 * - C3: prev_ex is physically abutting ex,
+		 * - C4: prev_ex can receive the additional blocks without
+		 *   overflowing the (initialized) length limit.
+		 */
+		if ((!ext4_ext_is_uninitialized(prev_ex)) &&		/*C1*/
+			((prev_lblk + prev_len) == ee_block) &&		/*C2*/
+			((prev_pblk + prev_len) == ee_pblk) &&		/*C3*/
+			(prev_len < (EXT_INIT_MAX_LEN - write_len))) {	/*C4*/
+			err = ext4_ext_get_access(handle, inode, path + depth);
+			if (err)
+				goto out;
+
+			trace_ext4_ext_convert_to_initialized_fastpath(inode,
+				map, ex, prev_ex);
+
+			/* Shift the start of ex by 'write_len' blocks */
+			ex->ee_block = cpu_to_le32(ee_block + write_len);
+			ext4_ext_store_pblock(ex, ee_pblk + write_len);
+			ex->ee_len = cpu_to_le16(ee_len - write_len);
+			ext4_ext_mark_uninitialized(ex); /* Restore the flag */
+
+			/* Extend prev_ex by 'write_len' blocks */
+			prev_ex->ee_len = cpu_to_le16(prev_len + write_len);
+
+			/* Mark the block containing both extents as dirty */
+			ext4_ext_dirty(handle, inode, path + depth);
+
+			/* Update path to point to the right extent */
+			path[depth].p_ext = prev_ex;
+
+			/* Result: number of initialized blocks past m_lblk */
+			allocated = write_len;
+			goto out;
+		}
+	}
+
 	WARN_ON(map->m_lblk < ee_block);
 	/*
 	 * It is safe to convert extent to initialized via explicit
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index dcc460537bc7..9953d80145ad 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -45,6 +45,7 @@
 #include <linux/freezer.h>
 
 #include "ext4.h"
+#include "ext4_extents.h"
 #include "ext4_jbd2.h"
 #include "xattr.h"
 #include "acl.h"
diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h
index c9a341e385a3..748ff7cbe555 100644
--- a/include/trace/events/ext4.h
+++ b/include/trace/events/ext4.h
@@ -9,6 +9,7 @@
 
 struct ext4_allocation_context;
 struct ext4_allocation_request;
+struct ext4_extent;
 struct ext4_prealloc_space;
 struct ext4_inode_info;
 struct mpage_da_data;
@@ -1394,6 +1395,87 @@ DEFINE_EVENT(ext4__truncate, ext4_truncate_exit,
 	TP_ARGS(inode)
 );
 
+/* 'ux' is the uninitialized extent. */
+TRACE_EVENT(ext4_ext_convert_to_initialized_enter,
+	TP_PROTO(struct inode *inode, struct ext4_map_blocks *map,
+		 struct ext4_extent *ux),
+
+	TP_ARGS(inode, map, ux),
+
+	TP_STRUCT__entry(
+		__field(	ino_t,		ino	)
+		__field(	dev_t,		dev	)
+		__field(	ext4_lblk_t,	m_lblk	)
+		__field(	unsigned,	m_len	)
+		__field(	ext4_lblk_t,	u_lblk	)
+		__field(	unsigned,	u_len	)
+		__field(	ext4_fsblk_t,	u_pblk	)
+	),
+
+	TP_fast_assign(
+		__entry->ino		= inode->i_ino;
+		__entry->dev		= inode->i_sb->s_dev;
+		__entry->m_lblk		= map->m_lblk;
+		__entry->m_len		= map->m_len;
+		__entry->u_lblk		= le32_to_cpu(ux->ee_block);
+		__entry->u_len		= ext4_ext_get_actual_len(ux);
+		__entry->u_pblk		= ext4_ext_pblock(ux);
+	),
+
+	TP_printk("dev %d,%d ino %lu m_lblk %u m_len %u u_lblk %u u_len %u "
+		  "u_pblk %llu",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  (unsigned long) __entry->ino,
+		  __entry->m_lblk, __entry->m_len,
+		  __entry->u_lblk, __entry->u_len, __entry->u_pblk)
+);
+
+/*
+ * 'ux' is the uninitialized extent.
+ * 'ix' is the initialized extent to which blocks are transferred.
+ */
+TRACE_EVENT(ext4_ext_convert_to_initialized_fastpath,
+	TP_PROTO(struct inode *inode, struct ext4_map_blocks *map,
+		 struct ext4_extent *ux, struct ext4_extent *ix),
+
+	TP_ARGS(inode, map, ux, ix),
+
+	TP_STRUCT__entry(
+		__field(	ino_t,		ino	)
+		__field(	dev_t,		dev	)
+		__field(	ext4_lblk_t,	m_lblk	)
+		__field(	unsigned,	m_len	)
+		__field(	ext4_lblk_t,	u_lblk	)
+		__field(	unsigned,	u_len	)
+		__field(	ext4_fsblk_t,	u_pblk	)
+		__field(	ext4_lblk_t,	i_lblk	)
+		__field(	unsigned,	i_len	)
+		__field(	ext4_fsblk_t,	i_pblk	)
+	),
+
+	TP_fast_assign(
+		__entry->ino		= inode->i_ino;
+		__entry->dev		= inode->i_sb->s_dev;
+		__entry->m_lblk		= map->m_lblk;
+		__entry->m_len		= map->m_len;
+		__entry->u_lblk		= le32_to_cpu(ux->ee_block);
+		__entry->u_len		= ext4_ext_get_actual_len(ux);
+		__entry->u_pblk		= ext4_ext_pblock(ux);
+		__entry->i_lblk		= le32_to_cpu(ix->ee_block);
+		__entry->i_len		= ext4_ext_get_actual_len(ix);
+		__entry->i_pblk		= ext4_ext_pblock(ix);
+	),
+
+	TP_printk("dev %d,%d ino %lu m_lblk %u m_len %u "
+		  "u_lblk %u u_len %u u_pblk %llu "
+		  "i_lblk %u i_len %u i_pblk %llu ",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  (unsigned long) __entry->ino,
+		  __entry->m_lblk, __entry->m_len,
+		  __entry->u_lblk, __entry->u_len, __entry->u_pblk,
+		  __entry->i_lblk, __entry->i_len, __entry->i_pblk)
+);
+
 DECLARE_EVENT_CLASS(ext4__map_blocks_enter,
 	TP_PROTO(struct inode *inode, ext4_lblk_t lblk,
 		 unsigned int len, unsigned int flags),
-- 
cgit v1.2.3


From 80e675f906db54eb1ce3a9555cee5f45b5b72ab2 Mon Sep 17 00:00:00 2001
From: Eric Gouriou <egouriou@google.com>
Date: Thu, 27 Oct 2011 11:52:18 -0400
Subject: ext4: optimize memmmove lengths in extent/index insertions

ext4_ext_insert_extent() (respectively ext4_ext_insert_index())
was using EXT_MAX_EXTENT() (resp. EXT_MAX_INDEX()) to determine
how many entries needed to be moved beyond the insertion point.
In practice this means that (320 - I) * 24 bytes were memmove()'d
when I is the insertion point, rather than (#entries - I) * 24 bytes.

This patch uses EXT_LAST_EXTENT() (resp. EXT_LAST_INDEX()) instead
to only move existing entries. The code flow is also simplified
slightly to highlight similarities and reduce code duplication in
the insertion logic.

This patch reduces system CPU consumption by over 25% on a 4kB
synchronous append DIO write workload when used with the
pre-2.6.39 x86_64 memmove() implementation. With the much faster
2.6.39 memmove() implementation we still see a decrease in
system CPU usage between 2% and 7%.

Note that the ext_debug() output changes with this patch, splitting
some log information between entries. Users of the ext_debug() output
should note that the "move %d" units changed from reporting the number
of bytes moved to reporting the number of entries moved.

Signed-off-by: Eric Gouriou <egouriou@google.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/extents.c | 85 +++++++++++++++++++++++++++----------------------------
 1 file changed, 42 insertions(+), 43 deletions(-)

(limited to 'fs/ext4/extents.c')

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 8b6a17b60970..29969622af8b 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -754,31 +754,25 @@ static int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
 		return -EIO;
 	}
 
-	len = EXT_MAX_INDEX(curp->p_hdr) - curp->p_idx;
 	if (logical > le32_to_cpu(curp->p_idx->ei_block)) {
 		/* insert after */
-		if (curp->p_idx != EXT_LAST_INDEX(curp->p_hdr)) {
-			len = (len - 1) * sizeof(struct ext4_extent_idx);
-			len = len < 0 ? 0 : len;
-			ext_debug("insert new index %d after: %llu. "
-					"move %d from 0x%p to 0x%p\n",
-					logical, ptr, len,
-					(curp->p_idx + 1), (curp->p_idx + 2));
-			memmove(curp->p_idx + 2, curp->p_idx + 1, len);
-		}
+		ext_debug("insert new index %d after: %llu\n", logical, ptr);
 		ix = curp->p_idx + 1;
 	} else {
 		/* insert before */
-		len = len * sizeof(struct ext4_extent_idx);
-		len = len < 0 ? 0 : len;
-		ext_debug("insert new index %d before: %llu. "
-				"move %d from 0x%p to 0x%p\n",
-				logical, ptr, len,
-				curp->p_idx, (curp->p_idx + 1));
-		memmove(curp->p_idx + 1, curp->p_idx, len);
+		ext_debug("insert new index %d before: %llu\n", logical, ptr);
 		ix = curp->p_idx;
 	}
 
+	len = EXT_LAST_INDEX(curp->p_hdr) - ix + 1;
+	BUG_ON(len < 0);
+	if (len > 0) {
+		ext_debug("insert new index %d: "
+				"move %d indices from 0x%p to 0x%p\n",
+				logical, len, ix, ix + 1);
+		memmove(ix + 1, ix, len * sizeof(struct ext4_extent_idx));
+	}
+
 	if (unlikely(ix > EXT_MAX_INDEX(curp->p_hdr))) {
 		EXT4_ERROR_INODE(inode, "ix > EXT_MAX_INDEX!");
 		return -EIO;
@@ -1779,41 +1773,46 @@ has_space:
 				ext4_ext_pblock(newext),
 				ext4_ext_is_uninitialized(newext),
 				ext4_ext_get_actual_len(newext));
-		path[depth].p_ext = EXT_FIRST_EXTENT(eh);
-	} else if (le32_to_cpu(newext->ee_block)
+		nearex = EXT_FIRST_EXTENT(eh);
+	} else {
+		if (le32_to_cpu(newext->ee_block)
 			   > le32_to_cpu(nearex->ee_block)) {
-/*		BUG_ON(newext->ee_block == nearex->ee_block); */
-		if (nearex != EXT_LAST_EXTENT(eh)) {
-			len = EXT_MAX_EXTENT(eh) - nearex;
-			len = (len - 1) * sizeof(struct ext4_extent);
-			len = len < 0 ? 0 : len;
-			ext_debug("insert %d:%llu:[%d]%d after: nearest 0x%p, "
-					"move %d from 0x%p to 0x%p\n",
+			/* Insert after */
+			ext_debug("insert %d:%llu:[%d]%d %s before: "
+					"nearest 0x%p\n"
+					le32_to_cpu(newext->ee_block),
+					ext4_ext_pblock(newext),
+					ext4_ext_is_uninitialized(newext),
+					ext4_ext_get_actual_len(newext),
+					nearex);
+			nearex++;
+		} else {
+			/* Insert before */
+			BUG_ON(newext->ee_block == nearex->ee_block);
+			ext_debug("insert %d:%llu:[%d]%d %s after: "
+					"nearest 0x%p\n"
 					le32_to_cpu(newext->ee_block),
 					ext4_ext_pblock(newext),
 					ext4_ext_is_uninitialized(newext),
 					ext4_ext_get_actual_len(newext),
-					nearex, len, nearex + 1, nearex + 2);
-			memmove(nearex + 2, nearex + 1, len);
+					nearex);
+		}
+		len = EXT_LAST_EXTENT(eh) - nearex + 1;
+		if (len > 0) {
+			ext_debug("insert %d:%llu:[%d]%d: "
+					"move %d extents from 0x%p to 0x%p\n",
+					le32_to_cpu(newext->ee_block),
+					ext4_ext_pblock(newext),
+					ext4_ext_is_uninitialized(newext),
+					ext4_ext_get_actual_len(newext),
+					len, nearex, nearex + 1);
+			memmove(nearex + 1, nearex,
+				len * sizeof(struct ext4_extent));
 		}
-		path[depth].p_ext = nearex + 1;
-	} else {
-		BUG_ON(newext->ee_block == nearex->ee_block);
-		len = (EXT_MAX_EXTENT(eh) - nearex) * sizeof(struct ext4_extent);
-		len = len < 0 ? 0 : len;
-		ext_debug("insert %d:%llu:[%d]%d before: nearest 0x%p, "
-				"move %d from 0x%p to 0x%p\n",
-				le32_to_cpu(newext->ee_block),
-				ext4_ext_pblock(newext),
-				ext4_ext_is_uninitialized(newext),
-				ext4_ext_get_actual_len(newext),
-				nearex, len, nearex, nearex + 1);
-		memmove(nearex + 1, nearex, len);
-		path[depth].p_ext = nearex;
 	}
 
 	le16_add_cpu(&eh->eh_entries, 1);
-	nearex = path[depth].p_ext;
+	path[depth].p_ext = nearex;
 	nearex->ee_block = newext->ee_block;
 	ext4_ext_store_pblock(nearex, ext4_ext_pblock(newext));
 	nearex->ee_len = newext->ee_len;
-- 
cgit v1.2.3


From 81fdbb4a8d34242f0ed048395c4ddc910f1dffbe Mon Sep 17 00:00:00 2001
From: Yongqiang Yang <xiaoqiangnk@gmail.com>
Date: Sat, 29 Oct 2011 09:23:38 -0400
Subject: ext4: move variables to their scope

Signed-off-by: Yongqiang Yang <xiaoqiangnk@gmail.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/extents.c | 21 ++++++++++-----------
 1 file changed, 10 insertions(+), 11 deletions(-)

(limited to 'fs/ext4/extents.c')

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 29969622af8b..05693804813d 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -117,11 +117,9 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
 			      struct ext4_ext_path *path,
 			      ext4_lblk_t block)
 {
-	int depth;
-
 	if (path) {
+		int depth = path->p_depth;
 		struct ext4_extent *ex;
-		depth = path->p_depth;
 
 		/*
 		 * Try to predict block placement assuming that we are
@@ -247,7 +245,7 @@ static inline int ext4_ext_space_root_idx(struct inode *inode, int check)
 int ext4_ext_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock)
 {
 	struct ext4_inode_info *ei = EXT4_I(inode);
-	int idxs, num = 0;
+	int idxs;
 
 	idxs = ((inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
 		/ sizeof(struct ext4_extent_idx));
@@ -262,6 +260,8 @@ int ext4_ext_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock)
 	 */
 	if (ei->i_da_metadata_calc_len &&
 	    ei->i_da_metadata_calc_last_lblock+1 == lblock) {
+		int num = 0;
+
 		if ((ei->i_da_metadata_calc_len % idxs) == 0)
 			num++;
 		if ((ei->i_da_metadata_calc_len % (idxs*idxs)) == 0)
@@ -324,8 +324,6 @@ static int ext4_valid_extent_entries(struct inode *inode,
 				struct ext4_extent_header *eh,
 				int depth)
 {
-	struct ext4_extent *ext;
-	struct ext4_extent_idx *ext_idx;
 	unsigned short entries;
 	if (eh->eh_entries == 0)
 		return 1;
@@ -334,7 +332,7 @@ static int ext4_valid_extent_entries(struct inode *inode,
 
 	if (depth == 0) {
 		/* leaf entries */
-		ext = EXT_FIRST_EXTENT(eh);
+		struct ext4_extent *ext = EXT_FIRST_EXTENT(eh);
 		while (entries) {
 			if (!ext4_valid_extent(inode, ext))
 				return 0;
@@ -342,7 +340,7 @@ static int ext4_valid_extent_entries(struct inode *inode,
 			entries--;
 		}
 	} else {
-		ext_idx = EXT_FIRST_INDEX(eh);
+		struct ext4_extent_idx *ext_idx = EXT_FIRST_INDEX(eh);
 		while (entries) {
 			if (!ext4_valid_extent_idx(inode, ext_idx))
 				return 0;
@@ -3722,13 +3720,12 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 	ext4_fsblk_t newblock = 0;
 	int free_on_err = 0, err = 0, depth, ret;
 	unsigned int allocated = 0, offset = 0;
-	unsigned int allocated_clusters = 0, reserved_clusters = 0;
+	unsigned int allocated_clusters = 0;
 	unsigned int punched_out = 0;
 	unsigned int result = 0;
 	struct ext4_allocation_request ar;
 	ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
 	ext4_lblk_t cluster_offset;
-	struct ext4_map_blocks punch_map;
 
 	ext_debug("blocks %u/%u requested for inode %lu\n",
 		  map->m_lblk, map->m_len, inode->i_ino);
@@ -3804,6 +3801,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 
 		/* if found extent covers block, simply return it */
 		if (in_range(map->m_lblk, ee_block, ee_len)) {
+			struct ext4_map_blocks punch_map;
 			ext4_fsblk_t partial_cluster = 0;
 
 			newblock = map->m_lblk - ee_block + ee_start;
@@ -4084,8 +4082,9 @@ got_allocated_blocks:
 	 * block allocation which had been deferred till now.
 	 */
 	if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
+		unsigned int reserved_clusters;
 		/*
-		 * Check how many clusters we had reserved this allocted range.
+		 * Check how many clusters we had reserved this allocated range
 		 */
 		reserved_clusters = get_reserved_cluster_alloc(inode,
 						map->m_lblk, allocated);
-- 
cgit v1.2.3


From 02dc62fba89eaee0157752c5f1ba811ef3156e00 Mon Sep 17 00:00:00 2001
From: Yongqiang Yang <xiaoqiangnk@gmail.com>
Date: Sat, 29 Oct 2011 09:29:11 -0400
Subject: ext4: clean up AGGRESSIVE_TEST code

Signed-off-by: Yongqiang Yang <xiaoqiangnk@gmail.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/extents.c | 24 ++++++++----------------
 1 file changed, 8 insertions(+), 16 deletions(-)

(limited to 'fs/ext4/extents.c')

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 05693804813d..652b4dc5dfcb 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -181,12 +181,10 @@ static inline int ext4_ext_space_block(struct inode *inode, int check)
 
 	size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
 			/ sizeof(struct ext4_extent);
-	if (!check) {
 #ifdef AGGRESSIVE_TEST
-		if (size > 6)
-			size = 6;
+	if (!check && size > 6)
+		size = 6;
 #endif
-	}
 	return size;
 }
 
@@ -196,12 +194,10 @@ static inline int ext4_ext_space_block_idx(struct inode *inode, int check)
 
 	size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
 			/ sizeof(struct ext4_extent_idx);
-	if (!check) {
 #ifdef AGGRESSIVE_TEST
-		if (size > 5)
-			size = 5;
+	if (!check && size > 5)
+		size = 5;
 #endif
-	}
 	return size;
 }
 
@@ -212,12 +208,10 @@ static inline int ext4_ext_space_root(struct inode *inode, int check)
 	size = sizeof(EXT4_I(inode)->i_data);
 	size -= sizeof(struct ext4_extent_header);
 	size /= sizeof(struct ext4_extent);
-	if (!check) {
 #ifdef AGGRESSIVE_TEST
-		if (size > 3)
-			size = 3;
+	if (!check && size > 3)
+		size = 3;
 #endif
-	}
 	return size;
 }
 
@@ -228,12 +222,10 @@ static inline int ext4_ext_space_root_idx(struct inode *inode, int check)
 	size = sizeof(EXT4_I(inode)->i_data);
 	size -= sizeof(struct ext4_extent_header);
 	size /= sizeof(struct ext4_extent_idx);
-	if (!check) {
 #ifdef AGGRESSIVE_TEST
-		if (size > 4)
-			size = 4;
+	if (!check && size > 4)
+		size = 4;
 #endif
-	}
 	return size;
 }
 
-- 
cgit v1.2.3


From e7b319e39776bd0e9c0c7855b023dafed2c93d27 Mon Sep 17 00:00:00 2001
From: Yongqiang Yang <xiaoqiangnk@gmail.com>
Date: Sat, 29 Oct 2011 09:39:51 -0400
Subject: ext4: trace punch_hole correctly in ext4_ext_map_blocks

When ext4_ext_map_blocks() is called by punch_hole, trace should
trace blocks punched out.

Signed-off-by: Yongqiang Yang <xiaoqiangnk@gmail.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/extents.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs/ext4/extents.c')

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 652b4dc5dfcb..36a0f177ecad 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -4171,12 +4171,12 @@ out2:
 		ext4_ext_drop_refs(path);
 		kfree(path);
 	}
-	trace_ext4_ext_map_blocks_exit(inode, map->m_lblk,
-		newblock, map->m_len, err ? err : allocated);
-
 	result = (flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) ?
 			punched_out : allocated;
 
+	trace_ext4_ext_map_blocks_exit(inode, map->m_lblk,
+		newblock, map->m_len, err ? err : result);
+
 	return err ? err : result;
 }
 
-- 
cgit v1.2.3


From 0edeb71dc9133bfb505d3bf59642e07cd936613e Mon Sep 17 00:00:00 2001
From: Tao Ma <boyu.mt@taobao.com>
Date: Mon, 31 Oct 2011 17:30:44 -0400
Subject: ext4: Create helper function for EXT4_IO_END_UNWRITTEN and
 i_aiodio_unwritten

EXT4_IO_END_UNWRITTEN flag set and the increase of i_aiodio_unwritten
should be done simultaneously since ext4_end_io_nolock always clear
the flag and decrease the counter in the same time.

We have found some bugs that the flag is set while leaving
i_aiodio_unwritten unchanged(commit 32c80b32c053d). So this patch just tries
to create a helper function to wrap them to avoid any future bug.
The idea is inspired by Eric.

Cc: Eric Sandeen <sandeen@redhat.com>
Signed-off-by: Tao Ma <boyu.mt@taobao.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/ext4.h    |  9 +++++++++
 fs/ext4/extents.c | 18 ++++++------------
 fs/ext4/inode.c   |  5 +----
 fs/ext4/page-io.c |  6 ++----
 4 files changed, 18 insertions(+), 20 deletions(-)

(limited to 'fs/ext4/extents.c')

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 657e82649fa5..604c200216c1 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1275,6 +1275,15 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
 		 ino <= le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count));
 }
 
+static inline void ext4_set_io_unwritten_flag(struct inode *inode,
+					      struct ext4_io_end *io_end)
+{
+	if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
+		io_end->flag |= EXT4_IO_END_UNWRITTEN;
+		atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten);
+	}
+}
+
 /*
  * Inode dynamic state flags
  */
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 36a0f177ecad..a5c8caaaa099 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -3472,12 +3472,9 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
 		 * that this IO needs to conversion to written when IO is
 		 * completed
 		 */
-		if (io) {
-			if (!(io->flag & EXT4_IO_END_UNWRITTEN)) {
-				io->flag = EXT4_IO_END_UNWRITTEN;
-				atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten);
-			}
-		} else
+		if (io)
+			ext4_set_io_unwritten_flag(inode, io);
+		else
 			ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
 		if (ext4_should_dioread_nolock(inode))
 			map->m_flags |= EXT4_MAP_UNINIT;
@@ -4030,12 +4027,9 @@ got_allocated_blocks:
 		 * that we need to perform conversion when IO is done.
 		 */
 		if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
-			if (io) {
-				if (!(io->flag & EXT4_IO_END_UNWRITTEN)) {
-					io->flag = EXT4_IO_END_UNWRITTEN;
-					atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten);
-				}
-			} else
+			if (io)
+				ext4_set_io_unwritten_flag(inode, io);
+			else
 				ext4_set_inode_state(inode,
 						     EXT4_STATE_DIO_UNWRITTEN);
 		}
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index e4b26faac5ff..60af5126eb05 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2831,10 +2831,7 @@ static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate)
 	 * but being more careful is always safe for the future change.
 	 */
 	inode = io_end->inode;
-	if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
-		io_end->flag |= EXT4_IO_END_UNWRITTEN;
-		atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten);
-	}
+	ext4_set_io_unwritten_flag(inode, io_end);
 
 	/* Add the io_end to per-inode completed io list*/
 	spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 9eebf44646f6..7ce1d0b19c94 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -336,10 +336,8 @@ submit_and_retry:
 	if ((io_end->num_io_pages >= MAX_IO_PAGES) &&
 	    (io_end->pages[io_end->num_io_pages-1] != io_page))
 		goto submit_and_retry;
-	if (buffer_uninit(bh) && !(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
-		io_end->flag |= EXT4_IO_END_UNWRITTEN;
-		atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten);
-	}
+	if (buffer_uninit(bh))
+		ext4_set_io_unwritten_flag(inode, io_end);
 	io->io_end->size += bh->b_size;
 	io->io_next_block++;
 	ret = bio_add_page(io->io_bio, bh->b_page, bh->b_size, bh_offset(bh));
-- 
cgit v1.2.3


From 3c6fe77017bc6ce489f231c35fed3220b6691836 Mon Sep 17 00:00:00 2001
From: Greg Harm <gharm@google.com>
Date: Mon, 31 Oct 2011 18:41:47 -0400
Subject: ext4: Don't normalize an falloc request if it can fit in 1 extent.

If an fallocate request fits in EXT_UNINIT_MAX_LEN, then set the
EXT4_GET_BLOCKS_NO_NORMALIZE flag. For larger fallocate requests,
let mballoc.c normalize the request.

This fixes a problem where large requests were being split into
non-contiguous extents due to commit 556b27abf73: ext4: do not
normalize block requests from fallocate.

Testing:
*) Checked that 8.x MB falloc'ed files are still laid down next to
each other (contiguously).
*) Checked that the maximum size extent (127.9MB) is allocated as 1
extent.
*) Checked that a 1GB file is somewhat contiguous (often 5-6
non-contiguous extents now).
*) Checked that a 120MB file can still be falloc'ed even if there are
no single extents large enough to hold it.

Signed-off-by: Greg Harm <gharm@google.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/extents.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

(limited to 'fs/ext4/extents.c')

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index a5c8caaaa099..2798945a1920 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -4337,10 +4337,16 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
 		trace_ext4_fallocate_exit(inode, offset, max_blocks, ret);
 		return ret;
 	}
-	flags = EXT4_GET_BLOCKS_CREATE_UNINIT_EXT |
-		EXT4_GET_BLOCKS_NO_NORMALIZE;
+	flags = EXT4_GET_BLOCKS_CREATE_UNINIT_EXT;
 	if (mode & FALLOC_FL_KEEP_SIZE)
 		flags |= EXT4_GET_BLOCKS_KEEP_SIZE;
+	/*
+	 * Don't normalize the request if it can fit in one extent so
+	 * that it doesn't get unnecessarily split into multiple
+	 * extents.
+	 */
+	if (len <= EXT_UNINIT_MAX_LEN << blkbits)
+		flags |= EXT4_GET_BLOCKS_NO_NORMALIZE;
 retry:
 	while (ret >= 0 && ret < max_blocks) {
 		map.m_lblk = map.m_lblk + ret;
-- 
cgit v1.2.3


From 32de67569059d22b02dd9323a40220d953642b7e Mon Sep 17 00:00:00 2001
From: Yongqiang Yang <xiaoqiangnk@gmail.com>
Date: Tue, 1 Nov 2011 18:56:41 -0400
Subject: ext4: fix a syntax error in ext4_ext_insert_extent when debugging
 enabled

This patch fixes a syntax error which omits a comma. Besides this,
logical block number is unsigend 32 bits, so printk should use %u
instead %d.

Signed-off-by: Yongqiang Yang <xiaoqiangnk@gmail.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/extents.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

(limited to 'fs/ext4/extents.c')

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 2798945a1920..8cacaf645e36 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -1682,7 +1682,7 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
 	/* try to insert block into found extent and return */
 	if (ex && !(flag & EXT4_GET_BLOCKS_PRE_IO)
 		&& ext4_can_extents_be_merged(inode, ex, newext)) {
-		ext_debug("append [%d]%d block to %d:[%d]%d (from %llu)\n",
+		ext_debug("append [%d]%d block to %u:[%d]%d (from %llu)\n",
 			  ext4_ext_is_uninitialized(newext),
 			  ext4_ext_get_actual_len(newext),
 			  le32_to_cpu(ex->ee_block),
@@ -1720,7 +1720,7 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
 	if (le32_to_cpu(newext->ee_block) > le32_to_cpu(fex->ee_block))
 		next = ext4_ext_next_leaf_block(path);
 	if (next != EXT_MAX_BLOCKS) {
-		ext_debug("next leaf block - %d\n", next);
+		ext_debug("next leaf block - %u\n", next);
 		BUG_ON(npath != NULL);
 		npath = ext4_ext_find_extent(inode, next, NULL);
 		if (IS_ERR(npath))
@@ -1758,7 +1758,7 @@ has_space:
 
 	if (!nearex) {
 		/* there is no extent in this leaf, create first one */
-		ext_debug("first extent in the leaf: %d:%llu:[%d]%d\n",
+		ext_debug("first extent in the leaf: %u:%llu:[%d]%d\n",
 				le32_to_cpu(newext->ee_block),
 				ext4_ext_pblock(newext),
 				ext4_ext_is_uninitialized(newext),
@@ -1768,8 +1768,8 @@ has_space:
 		if (le32_to_cpu(newext->ee_block)
 			   > le32_to_cpu(nearex->ee_block)) {
 			/* Insert after */
-			ext_debug("insert %d:%llu:[%d]%d %s before: "
-					"nearest 0x%p\n"
+			ext_debug("insert %u:%llu:[%d]%d before: "
+					"nearest %p\n",
 					le32_to_cpu(newext->ee_block),
 					ext4_ext_pblock(newext),
 					ext4_ext_is_uninitialized(newext),
@@ -1779,8 +1779,8 @@ has_space:
 		} else {
 			/* Insert before */
 			BUG_ON(newext->ee_block == nearex->ee_block);
-			ext_debug("insert %d:%llu:[%d]%d %s after: "
-					"nearest 0x%p\n"
+			ext_debug("insert %u:%llu:[%d]%d after: "
+					"nearest %p\n",
 					le32_to_cpu(newext->ee_block),
 					ext4_ext_pblock(newext),
 					ext4_ext_is_uninitialized(newext),
@@ -1789,7 +1789,7 @@ has_space:
 		}
 		len = EXT_LAST_EXTENT(eh) - nearex + 1;
 		if (len > 0) {
-			ext_debug("insert %d:%llu:[%d]%d: "
+			ext_debug("insert %u:%llu:[%d]%d: "
 					"move %d extents from 0x%p to 0x%p\n",
 					le32_to_cpu(newext->ee_block),
 					ext4_ext_pblock(newext),
-- 
cgit v1.2.3


From bf52c6f7af55c48ab0fd5f990460b884b428d906 Mon Sep 17 00:00:00 2001
From: Yongqiang Yang <xiaoqiangnk@gmail.com>
Date: Tue, 1 Nov 2011 18:59:26 -0400
Subject: ext4: let ext4_ext_rm_leaf work with EXT_DEBUG defined

The variable 'block' is removed by commit 750c9c47, so use the
replacement ex_ee_block instead.

Signed-off-by: Yongqiang Yang <xiaoqiangnk@gmail.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/extents.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs/ext4/extents.c')

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 8cacaf645e36..61fa9e1614af 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -2424,7 +2424,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
 		if (err)
 			goto out;
 
-		ext_debug("new extent: %u:%u:%llu\n", block, num,
+		ext_debug("new extent: %u:%u:%llu\n", ex_ee_block, num,
 				ext4_ext_pblock(ex));
 		ex--;
 		ex_ee_block = le32_to_cpu(ex->ee_block);
-- 
cgit v1.2.3