From 92859a5efdfa71f712ec1d213f43061965d3e9b4 Mon Sep 17 00:00:00 2001
From: Markus Elfring <elfring@users.sourceforge.net>
Date: Fri, 26 Jun 2015 17:28:55 +0200
Subject: f2fs crypto: delete an unnecessary check before the function call
 "key_put"

The key_put() function tests whether its argument is NULL and then
returns immediately. Thus the test around the call is not needed.

This issue was detected by using the Coccinelle software.

Signed-off-by: Markus Elfring <elfring@users.sourceforge.net>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/crypto_key.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'fs/f2fs')

diff --git a/fs/f2fs/crypto_key.c b/fs/f2fs/crypto_key.c
index 95b8f936f00b..9f77de2ef317 100644
--- a/fs/f2fs/crypto_key.c
+++ b/fs/f2fs/crypto_key.c
@@ -92,8 +92,7 @@ static void f2fs_free_crypt_info(struct f2fs_crypt_info *ci)
 	if (!ci)
 		return;
 
-	if (ci->ci_keyring_key)
-		key_put(ci->ci_keyring_key);
+	key_put(ci->ci_keyring_key);
 	crypto_free_ablkcipher(ci->ci_ctfm);
 	kmem_cache_free(f2fs_crypt_info_cachep, ci);
 }
-- 
cgit v1.2.3


From 5ac9f36fcacd532b218db1e0fd0f9e8a18321f22 Mon Sep 17 00:00:00 2001
From: Chao Yu <chao2.yu@samsung.com>
Date: Mon, 29 Jun 2015 18:14:10 +0800
Subject: f2fs: fix to record dirty page count for symlink

Dirty pages can exist in the mapping of a newly created symlink, but previously
we did not maintain the dirty page count for symlinks as we did for regular
files and directories, so the count we looked up would be wrong.

This patch adds missed dirty page counting for symlink to fix this issue.

Signed-off-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/checkpoint.c | 3 ++-
 fs/f2fs/f2fs.h       | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

(limited to 'fs/f2fs')

diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index b70bbe1a6a8c..de7a0d6a371a 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -704,7 +704,8 @@ void update_dirty_page(struct inode *inode, struct page *page)
 	struct inode_entry *new;
 	int ret = 0;
 
-	if (!S_ISDIR(inode->i_mode) && !S_ISREG(inode->i_mode))
+	if (!S_ISDIR(inode->i_mode) && !S_ISREG(inode->i_mode) &&
+			!S_ISLNK(inode->i_mode))
 		return;
 
 	if (!S_ISDIR(inode->i_mode)) {
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index a8327ed73898..516220454a4e 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -1039,7 +1039,8 @@ static inline void dec_page_count(struct f2fs_sb_info *sbi, int count_type)
 
 static inline void inode_dec_dirty_pages(struct inode *inode)
 {
-	if (!S_ISDIR(inode->i_mode) && !S_ISREG(inode->i_mode))
+	if (!S_ISDIR(inode->i_mode) && !S_ISREG(inode->i_mode) &&
+			!S_ISLNK(inode->i_mode))
 		return;
 
 	atomic_dec(&F2FS_I(inode)->dirty_pages);
-- 
cgit v1.2.3


From eca616f8c1d6c581f3785f0ee3e2a3887e084273 Mon Sep 17 00:00:00 2001
From: Jaegeuk Kim <jaegeuk@kernel.org>
Date: Mon, 15 Jun 2015 14:52:29 -0700
Subject: f2fs: avoid freed stat information

The write_checkpoint can update stat information, so we should destroy the stat
structure after it.

Reviewed-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/super.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs/f2fs')

diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index a06b0b46fe69..da277100dc90 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -498,7 +498,6 @@ static void f2fs_put_super(struct super_block *sb)
 	}
 	kobject_del(&sbi->s_kobj);
 
-	f2fs_destroy_stats(sbi);
 	stop_gc_thread(sbi);
 
 	/*
@@ -514,6 +513,9 @@ static void f2fs_put_super(struct super_block *sb)
 		write_checkpoint(sbi, &cpc);
 	}
 
+	/* write_checkpoint can update stat informaion */
+	f2fs_destroy_stats(sbi);
+
 	/*
 	 * normally superblock is clean, so we need to release this.
 	 * In addition, EIO will skip do checkpoint, we need this as well.
-- 
cgit v1.2.3


From c9b63bd01dd8da096d079c490771ad8a049fd480 Mon Sep 17 00:00:00 2001
From: Jaegeuk Kim <jaegeuk@kernel.org>
Date: Tue, 23 Jun 2015 10:36:08 -0700
Subject: f2fs: avoid to use failed inode immediately

Before iput is called, the inode number used by a bad inode can be reassigned
to another new inode, resulting in abnormal behavior on the new inode.
This should not happen for the new inode.

Reviewed-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/f2fs.h  |  1 +
 fs/f2fs/inode.c | 19 ++++++++++++-------
 fs/f2fs/namei.c |  4 ++--
 3 files changed, 15 insertions(+), 9 deletions(-)

(limited to 'fs/f2fs')

diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 516220454a4e..3aaa4b99050a 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -1343,6 +1343,7 @@ enum {
 	FI_INC_LINK,		/* need to increment i_nlink */
 	FI_ACL_MODE,		/* indicate acl mode */
 	FI_NO_ALLOC,		/* should not allocate any blocks */
+	FI_FREE_NID,		/* free allocated nide */
 	FI_UPDATE_DIR,		/* should update inode block for consistency */
 	FI_DELAY_IPUT,		/* used for the recovery */
 	FI_NO_EXTENT,		/* not to use the extent cache */
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index 2550868dc651..757fed253697 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -314,7 +314,8 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc)
 void f2fs_evict_inode(struct inode *inode)
 {
 	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
-	nid_t xnid = F2FS_I(inode)->i_xattr_nid;
+	struct f2fs_inode_info *fi = F2FS_I(inode);
+	nid_t xnid = fi->i_xattr_nid;
 
 	/* some remained atomic pages should discarded */
 	if (f2fs_is_atomic_file(inode))
@@ -334,7 +335,7 @@ void f2fs_evict_inode(struct inode *inode)
 		goto no_delete;
 
 	sb_start_intwrite(inode->i_sb);
-	set_inode_flag(F2FS_I(inode), FI_NO_ALLOC);
+	set_inode_flag(fi, FI_NO_ALLOC);
 	i_size_write(inode, 0);
 
 	if (F2FS_HAS_BLOCKS(inode))
@@ -357,14 +358,18 @@ no_delete:
 	invalidate_mapping_pages(NODE_MAPPING(sbi), inode->i_ino, inode->i_ino);
 	if (xnid)
 		invalidate_mapping_pages(NODE_MAPPING(sbi), xnid, xnid);
-	if (is_inode_flag_set(F2FS_I(inode), FI_APPEND_WRITE))
+	if (is_inode_flag_set(fi, FI_APPEND_WRITE))
 		add_dirty_inode(sbi, inode->i_ino, APPEND_INO);
-	if (is_inode_flag_set(F2FS_I(inode), FI_UPDATE_WRITE))
+	if (is_inode_flag_set(fi, FI_UPDATE_WRITE))
 		add_dirty_inode(sbi, inode->i_ino, UPDATE_INO);
+	if (is_inode_flag_set(fi, FI_FREE_NID)) {
+		alloc_nid_failed(sbi, inode->i_ino);
+		clear_inode_flag(fi, FI_FREE_NID);
+	}
 out_clear:
 #ifdef CONFIG_F2FS_FS_ENCRYPTION
-	if (F2FS_I(inode)->i_crypt_info)
-		f2fs_free_encryption_info(inode, F2FS_I(inode)->i_crypt_info);
+	if (fi->i_crypt_info)
+		f2fs_free_encryption_info(inode, fi->i_crypt_info);
 #endif
 	clear_inode(inode);
 }
@@ -384,9 +389,9 @@ void handle_failed_inode(struct inode *inode)
 
 	remove_inode_page(inode);
 
+	set_inode_flag(F2FS_I(inode), FI_FREE_NID);
 	clear_inode_flag(F2FS_I(inode), FI_INLINE_DATA);
 	clear_inode_flag(F2FS_I(inode), FI_INLINE_DENTRY);
-	alloc_nid_failed(sbi, inode->i_ino);
 	f2fs_unlock_op(sbi);
 
 	/* iput will drop the inode object */
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index fdbae21ee8fb..08656fca8f83 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -78,9 +78,9 @@ out:
 fail:
 	trace_f2fs_new_inode(inode, err);
 	make_bad_inode(inode);
-	iput(inode);
 	if (nid_free)
-		alloc_nid_failed(sbi, ino);
+		set_inode_flag(F2FS_I(inode), FI_FREE_NID);
+	iput(inode);
 	return ERR_PTR(err);
 }
 
-- 
cgit v1.2.3


From 97a7b2c274d5dbe51170e099c16d49cfd1b467af Mon Sep 17 00:00:00 2001
From: Jaegeuk Kim <jaegeuk@kernel.org>
Date: Wed, 17 Jun 2015 13:59:05 -0700
Subject: f2fs: convert inline_data for various fallocate

For newly added fallocate types, it should convert inline_data before handling
block swapping.

Reviewed-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/file.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

(limited to 'fs/f2fs')

diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index b0f38c3b37f4..fe8398f1d627 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -885,6 +885,14 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len)
 	if (offset & (F2FS_BLKSIZE - 1) || len & (F2FS_BLKSIZE - 1))
 		return -EINVAL;
 
+	f2fs_balance_fs(F2FS_I_SB(inode));
+
+	if (f2fs_has_inline_data(inode)) {
+		ret = f2fs_convert_inline_inode(inode);
+		if (ret)
+			return ret;
+	}
+
 	pg_start = offset >> PAGE_CACHE_SHIFT;
 	pg_end = (offset + len) >> PAGE_CACHE_SHIFT;
 
@@ -1033,6 +1041,12 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len)
 
 	f2fs_balance_fs(sbi);
 
+	if (f2fs_has_inline_data(inode)) {
+		ret = f2fs_convert_inline_inode(inode);
+		if (ret)
+			return ret;
+	}
+
 	ret = truncate_blocks(inode, i_size_read(inode), true);
 	if (ret)
 		return ret;
-- 
cgit v1.2.3


From 7a2cb67867b9a7f28a7c4d0fadd2f337a6d46ff7 Mon Sep 17 00:00:00 2001
From: Jaegeuk Kim <jaegeuk@kernel.org>
Date: Thu, 18 Jun 2015 14:17:04 -0700
Subject: f2fs: fix wrong block address calculation for a split extent

This patch fixes wrong calculation on block address field when an extent is
split.

Reviewed-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/data.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs/f2fs')

diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index f71e19a9dd3c..d1d86d53d1dc 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -674,7 +674,7 @@ static void f2fs_update_extent_tree(struct inode *inode, pgoff_t fofs,
 		endofs = dei.fofs + dei.len - 1;
 		if (endofs - fofs >= F2FS_MIN_EXTENT_LEN) {
 			set_extent_info(&ei, fofs + 1,
-				fofs - dei.fofs + dei.blk, endofs - fofs);
+				fofs - dei.fofs + dei.blk + 1, endofs - fofs);
 			en2 = __insert_extent_tree(sbi, et, &ei, NULL);
 		}
 	}
-- 
cgit v1.2.3


From cbe91923a97c96d6a931f4b5b7e32083218a0251 Mon Sep 17 00:00:00 2001
From: Jaegeuk Kim <jaegeuk@kernel.org>
Date: Tue, 16 Jun 2015 15:17:01 -0700
Subject: f2fs: update on-disk extents even under extent_cache

Previously, f2fs_update_extent_cache() always updated only the in-memory
extent_cache, and its up-to-date extent was finally preserved into the on-disk
one during f2fs_evict_inode.

But, in the following scenario:

1. mount
2. open & write an extent X
3. f2fs_evict_inode; on-disk extent is X
4. open & update the extent X with Y
5. sync; trigger checkpoint
6. power-cut

after power-on, f2fs should serve extent Y, but we have an on-disk extent X.

This causes a failure on xfstests/311.

Reviewed-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/data.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs/f2fs')

diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index d1d86d53d1dc..176e4ad4e5ed 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -899,9 +899,9 @@ void f2fs_update_extent_cache(struct dnode_of_data *dn)
 	fofs = start_bidx_of_node(ofs_of_node(dn->node_page), fi) +
 							dn->ofs_in_node;
 
+	/* we should call update_extent_info() to update on-disk extent */
 	if (test_opt(F2FS_I_SB(dn->inode), EXTENT_CACHE))
-		return f2fs_update_extent_tree(dn->inode, fofs,
-							dn->data_blkaddr);
+		f2fs_update_extent_tree(dn->inode, fofs, dn->data_blkaddr);
 
 	if (update_extent_info(dn->inode, fofs, dn->data_blkaddr))
 		sync_inode_page(dn);
-- 
cgit v1.2.3


From 244f4fc1c530c4e486f0e4f0909c0514e4539ba2 Mon Sep 17 00:00:00 2001
From: Jaegeuk Kim <jaegeuk@kernel.org>
Date: Mon, 22 Jun 2015 18:22:38 -0700
Subject: f2fs: set cached_en after checking finally

This patch moves the assignment of cached_en so that it is covered by the
spin_lock and is set only once, after the lookup has completed.

Reviewed-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/data.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

(limited to 'fs/f2fs')

diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 176e4ad4e5ed..982a1a58efd7 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -453,14 +453,12 @@ static struct extent_node *__lookup_extent_tree(struct extent_tree *et,
 	while (node) {
 		en = rb_entry(node, struct extent_node, rb_node);
 
-		if (fofs < en->ei.fofs) {
+		if (fofs < en->ei.fofs)
 			node = node->rb_left;
-		} else if (fofs >= en->ei.fofs + en->ei.len) {
+		else if (fofs >= en->ei.fofs + en->ei.len)
 			node = node->rb_right;
-		} else {
-			et->cached_en = en;
+		else
 			return en;
-		}
 	}
 	return NULL;
 }
@@ -625,6 +623,7 @@ static bool f2fs_lookup_extent_tree(struct inode *inode, pgoff_t pgofs,
 		spin_lock(&sbi->extent_lock);
 		if (!list_empty(&en->list))
 			list_move_tail(&en->list, &sbi->extent_list);
+		et->cached_en = en;
 		spin_unlock(&sbi->extent_lock);
 		stat_inc_read_hit(sbi->sb);
 	}
-- 
cgit v1.2.3


From 2658e50de61429f57d9496bfe371f232e2d039a1 Mon Sep 17 00:00:00 2001
From: Jaegeuk Kim <jaegeuk@kernel.org>
Date: Fri, 19 Jun 2015 12:01:21 -0700
Subject: f2fs: introduce a shrinker for mounted fs

This patch introduces a shrinker targeting to reduce memory footprint consumed
by a number of in-memory f2fs data structures.

In addition, it newly adds:
 - sbi->umount_mutex to avoid data races on shrinker and put_super
 - sbi->shrinker_run_no to not revisit objects

Note that the basic implementation was copied from fs/ubifs/shrinker.c

Reviewed-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/Makefile   |   1 +
 fs/f2fs/f2fs.h     |  13 +++++++
 fs/f2fs/shrinker.c | 104 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/f2fs/super.c    |  31 +++++++++++++++-
 4 files changed, 148 insertions(+), 1 deletion(-)
 create mode 100644 fs/f2fs/shrinker.c

(limited to 'fs/f2fs')

diff --git a/fs/f2fs/Makefile b/fs/f2fs/Makefile
index 396be1a39e55..005251b8d459 100644
--- a/fs/f2fs/Makefile
+++ b/fs/f2fs/Makefile
@@ -2,6 +2,7 @@ obj-$(CONFIG_F2FS_FS) += f2fs.o
 
 f2fs-y		:= dir.o file.o inode.o namei.o hash.o super.o inline.o
 f2fs-y		+= checkpoint.o gc.o data.o node.o segment.o recovery.o
+f2fs-y		+= shrinker.o
 f2fs-$(CONFIG_F2FS_STAT_FS) += debug.o
 f2fs-$(CONFIG_F2FS_FS_XATTR) += xattr.o
 f2fs-$(CONFIG_F2FS_FS_POSIX_ACL) += acl.o
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 3aaa4b99050a..e82af8c7ee8c 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -791,6 +791,11 @@ struct f2fs_sb_info {
 	/* For sysfs suppport */
 	struct kobject s_kobj;
 	struct completion s_kobj_unregister;
+
+	/* For shrinker support */
+	struct list_head s_list;
+	struct mutex umount_mutex;
+	unsigned int shrinker_run_no;
 };
 
 /*
@@ -1951,6 +1956,14 @@ bool f2fs_empty_inline_dir(struct inode *);
 int f2fs_read_inline_dir(struct file *, struct dir_context *,
 						struct f2fs_str *);
 
+/*
+ * shrinker.c
+ */
+unsigned long f2fs_shrink_count(struct shrinker *, struct shrink_control *);
+unsigned long f2fs_shrink_scan(struct shrinker *, struct shrink_control *);
+void f2fs_join_shrinker(struct f2fs_sb_info *);
+void f2fs_leave_shrinker(struct f2fs_sb_info *);
+
 /*
  * crypto support
  */
diff --git a/fs/f2fs/shrinker.c b/fs/f2fs/shrinker.c
new file mode 100644
index 000000000000..16e9b43635c2
--- /dev/null
+++ b/fs/f2fs/shrinker.c
@@ -0,0 +1,104 @@
+/*
+ * f2fs shrinker support
+ *   the basic infra was copied from fs/ubifs/shrinker.c
+ *
+ * Copyright (c) 2015 Motorola Mobility
+ * Copyright (c) 2015 Jaegeuk Kim <jaegeuk@kernel.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/fs.h>
+#include <linux/f2fs_fs.h>
+
+#include "f2fs.h"
+
+static LIST_HEAD(f2fs_list);
+static DEFINE_SPINLOCK(f2fs_list_lock);
+static unsigned int shrinker_run_no;
+
+unsigned long f2fs_shrink_count(struct shrinker *shrink,
+				struct shrink_control *sc)
+{
+	struct f2fs_sb_info *sbi;
+	struct list_head *p;
+	unsigned long count = 0;
+
+	spin_lock(&f2fs_list_lock);
+	p = f2fs_list.next;
+	while (p != &f2fs_list) {
+		sbi = list_entry(p, struct f2fs_sb_info, s_list);
+
+		/* stop f2fs_put_super */
+		if (!mutex_trylock(&sbi->umount_mutex)) {
+			p = p->next;
+			continue;
+		}
+		spin_unlock(&f2fs_list_lock);
+
+		/* TODO: count # of objects */
+
+		spin_lock(&f2fs_list_lock);
+		p = p->next;
+		mutex_unlock(&sbi->umount_mutex);
+	}
+	spin_unlock(&f2fs_list_lock);
+	return count;
+}
+
+unsigned long f2fs_shrink_scan(struct shrinker *shrink,
+				struct shrink_control *sc)
+{
+	unsigned long nr = sc->nr_to_scan;
+	struct f2fs_sb_info *sbi;
+	struct list_head *p;
+	unsigned int run_no;
+	unsigned long freed = 0;
+
+	spin_lock(&f2fs_list_lock);
+	do {
+		run_no = ++shrinker_run_no;
+	} while (run_no == 0);
+	p = f2fs_list.next;
+	while (p != &f2fs_list) {
+		sbi = list_entry(p, struct f2fs_sb_info, s_list);
+
+		if (sbi->shrinker_run_no == run_no)
+			break;
+
+		/* stop f2fs_put_super */
+		if (!mutex_trylock(&sbi->umount_mutex)) {
+			p = p->next;
+			continue;
+		}
+		spin_unlock(&f2fs_list_lock);
+
+		sbi->shrinker_run_no = run_no;
+
+		/* TODO: shrink caches */
+
+		spin_lock(&f2fs_list_lock);
+		p = p->next;
+		list_move_tail(&sbi->s_list, &f2fs_list);
+		mutex_unlock(&sbi->umount_mutex);
+		if (freed >= nr)
+			break;
+	}
+	spin_unlock(&f2fs_list_lock);
+	return freed;
+}
+
+void f2fs_join_shrinker(struct f2fs_sb_info *sbi)
+{
+	spin_lock(&f2fs_list_lock);
+	list_add_tail(&sbi->s_list, &f2fs_list);
+	spin_unlock(&f2fs_list_lock);
+}
+
+void f2fs_leave_shrinker(struct f2fs_sb_info *sbi)
+{
+	spin_lock(&f2fs_list_lock);
+	list_del(&sbi->s_list);
+	spin_unlock(&f2fs_list_lock);
+}
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index da277100dc90..bc7684b6d57a 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -39,6 +39,13 @@ static struct proc_dir_entry *f2fs_proc_root;
 static struct kmem_cache *f2fs_inode_cachep;
 static struct kset *f2fs_kset;
 
+/* f2fs-wide shrinker description */
+static struct shrinker f2fs_shrinker_info = {
+	.scan_objects = f2fs_shrink_scan,
+	.count_objects = f2fs_shrink_count,
+	.seeks = DEFAULT_SEEKS,
+};
+
 enum {
 	Opt_gc_background,
 	Opt_disable_roll_forward,
@@ -500,6 +507,9 @@ static void f2fs_put_super(struct super_block *sb)
 
 	stop_gc_thread(sbi);
 
+	/* prevent remaining shrinker jobs */
+	mutex_lock(&sbi->umount_mutex);
+
 	/*
 	 * We don't need to do checkpoint when superblock is clean.
 	 * But, the previous checkpoint was not done by umount, it needs to do
@@ -523,6 +533,9 @@ static void f2fs_put_super(struct super_block *sb)
 	release_dirty_inode(sbi);
 	release_discard_addrs(sbi);
 
+	f2fs_leave_shrinker(sbi);
+	mutex_unlock(&sbi->umount_mutex);
+
 	iput(sbi->node_inode);
 	iput(sbi->meta_inode);
 
@@ -972,6 +985,9 @@ static void init_sb_info(struct f2fs_sb_info *sbi)
 
 	sbi->dir_level = DEF_DIR_LEVEL;
 	clear_sbi_flag(sbi, SBI_NEED_FSCK);
+
+	INIT_LIST_HEAD(&sbi->s_list);
+	mutex_init(&sbi->umount_mutex);
 }
 
 /*
@@ -1214,6 +1230,8 @@ try_onemore:
 		goto free_nm;
 	}
 
+	f2fs_join_shrinker(sbi);
+
 	/* if there are nt orphan nodes free them */
 	recover_orphan_inodes(sbi);
 
@@ -1310,7 +1328,10 @@ free_root_inode:
 	dput(sb->s_root);
 	sb->s_root = NULL;
 free_node_inode:
+	mutex_lock(&sbi->umount_mutex);
+	f2fs_leave_shrinker(sbi);
 	iput(sbi->node_inode);
+	mutex_unlock(&sbi->umount_mutex);
 free_nm:
 	destroy_node_manager(sbi);
 free_sm:
@@ -1406,13 +1427,20 @@ static int __init init_f2fs_fs(void)
 	err = f2fs_init_crypto();
 	if (err)
 		goto free_kset;
-	err = register_filesystem(&f2fs_fs_type);
+
+	err = register_shrinker(&f2fs_shrinker_info);
 	if (err)
 		goto free_crypto;
+
+	err = register_filesystem(&f2fs_fs_type);
+	if (err)
+		goto free_shrinker;
 	f2fs_create_root_stats();
 	f2fs_proc_root = proc_mkdir("fs/f2fs", NULL);
 	return 0;
 
+free_shrinker:
+	unregister_shrinker(&f2fs_shrinker_info);
 free_crypto:
 	f2fs_exit_crypto();
 free_kset:
@@ -1435,6 +1463,7 @@ static void __exit exit_f2fs_fs(void)
 {
 	remove_proc_entry("fs/f2fs", NULL);
 	f2fs_destroy_root_stats();
+	unregister_shrinker(&f2fs_shrinker_info);
 	unregister_filesystem(&f2fs_fs_type);
 	f2fs_exit_crypto();
 	destroy_extent_cache();
-- 
cgit v1.2.3


From 1b38dc8e74a366b92986755c304591e330f3c3e0 Mon Sep 17 00:00:00 2001
From: Jaegeuk Kim <jaegeuk@kernel.org>
Date: Fri, 19 Jun 2015 15:36:07 -0700
Subject: f2fs: shrink nat_cache entries

This patch registers shrinking nat_cache entries.

Reviewed-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/node.c     |  6 +++---
 fs/f2fs/segment.c  |  8 ++++++--
 fs/f2fs/shrinker.c | 11 +++++++++--
 3 files changed, 18 insertions(+), 7 deletions(-)

(limited to 'fs/f2fs')

diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index 7dd63b794bfb..a05eb35a372c 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -328,11 +328,11 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni,
 int try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink)
 {
 	struct f2fs_nm_info *nm_i = NM_I(sbi);
+	int nr = nr_shrink;
 
-	if (available_free_memory(sbi, NAT_ENTRIES))
+	if (!down_write_trylock(&nm_i->nat_tree_lock))
 		return 0;
 
-	down_write(&nm_i->nat_tree_lock);
 	while (nr_shrink && !list_empty(&nm_i->nat_entries)) {
 		struct nat_entry *ne;
 		ne = list_first_entry(&nm_i->nat_entries,
@@ -341,7 +341,7 @@ int try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink)
 		nr_shrink--;
 	}
 	up_write(&nm_i->nat_tree_lock);
-	return nr_shrink;
+	return nr - nr_shrink;
 }
 
 /*
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 61b97f9cb9f6..d5ee99258cbc 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -306,8 +306,12 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi)
 	/* try to shrink extent cache when there is no enough memory */
 	f2fs_shrink_extent_tree(sbi, EXTENT_CACHE_SHRINK_NUMBER);
 
-	/* check the # of cached NAT entries and prefree segments */
-	if (try_to_free_nats(sbi, NAT_ENTRY_PER_BLOCK) ||
+	/* check the # of cached NAT entries */
+	if (!available_free_memory(sbi, NAT_ENTRIES))
+		try_to_free_nats(sbi, NAT_ENTRY_PER_BLOCK);
+
+	/* checkpoint is the only way to shrink partial cached entries */
+	if (!available_free_memory(sbi, NAT_ENTRIES) ||
 			excess_prefree_segs(sbi) ||
 			!available_free_memory(sbi, INO_ENTRIES))
 		f2fs_sync_fs(sbi->sb, true);
diff --git a/fs/f2fs/shrinker.c b/fs/f2fs/shrinker.c
index 16e9b43635c2..c4bd6ee5936c 100644
--- a/fs/f2fs/shrinker.c
+++ b/fs/f2fs/shrinker.c
@@ -18,6 +18,11 @@ static LIST_HEAD(f2fs_list);
 static DEFINE_SPINLOCK(f2fs_list_lock);
 static unsigned int shrinker_run_no;
 
+static unsigned long __count_nat_entries(struct f2fs_sb_info *sbi)
+{
+	return NM_I(sbi)->nat_cnt - NM_I(sbi)->dirty_nat_cnt;
+}
+
 unsigned long f2fs_shrink_count(struct shrinker *shrink,
 				struct shrink_control *sc)
 {
@@ -37,7 +42,8 @@ unsigned long f2fs_shrink_count(struct shrinker *shrink,
 		}
 		spin_unlock(&f2fs_list_lock);
 
-		/* TODO: count # of objects */
+		/* shrink clean nat cache entries */
+		count += __count_nat_entries(sbi);
 
 		spin_lock(&f2fs_list_lock);
 		p = p->next;
@@ -76,7 +82,8 @@ unsigned long f2fs_shrink_scan(struct shrinker *shrink,
 
 		sbi->shrinker_run_no = run_no;
 
-		/* TODO: shrink caches */
+		/* shrink clean nat cache entries */
+		freed += try_to_free_nats(sbi, nr);
 
 		spin_lock(&f2fs_list_lock);
 		p = p->next;
-- 
cgit v1.2.3


From 554df79e523d14dab475eb6650cb96617256ceea Mon Sep 17 00:00:00 2001
From: Jaegeuk Kim <jaegeuk@kernel.org>
Date: Fri, 19 Jun 2015 13:41:23 -0700
Subject: f2fs: shrink extent_cache entries

This patch registers shrinking extent_caches.

Reviewed-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/data.c     | 19 +++++++++++--------
 fs/f2fs/f2fs.h     |  2 +-
 fs/f2fs/segment.c  |  3 ++-
 fs/f2fs/shrinker.c | 14 +++++++++++++-
 4 files changed, 27 insertions(+), 11 deletions(-)

(limited to 'fs/f2fs')

diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 982a1a58efd7..55b2a79b3526 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -767,7 +767,7 @@ out:
 		update_inode_page(inode);
 }
 
-void f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink)
+unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink)
 {
 	struct extent_tree *treevec[EXT_TREE_VEC_SIZE];
 	struct extent_node *en, *tmp;
@@ -778,10 +778,7 @@ void f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink)
 	unsigned int node_cnt = 0, tree_cnt = 0;
 
 	if (!test_opt(sbi, EXTENT_CACHE))
-		return;
-
-	if (available_free_memory(sbi, EXTENT_CACHE))
-		return;
+		return 0;
 
 	spin_lock(&sbi->extent_lock);
 	list_for_each_entry_safe(en, tmp, &sbi->extent_list, list) {
@@ -791,7 +788,9 @@ void f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink)
 	}
 	spin_unlock(&sbi->extent_lock);
 
-	down_read(&sbi->extent_tree_lock);
+	if (!down_read_trylock(&sbi->extent_tree_lock))
+		goto out;
+
 	while ((found = radix_tree_gang_lookup(&sbi->extent_tree_root,
 				(void **)treevec, ino, EXT_TREE_VEC_SIZE))) {
 		unsigned i;
@@ -809,7 +808,9 @@ void f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink)
 	}
 	up_read(&sbi->extent_tree_lock);
 
-	down_write(&sbi->extent_tree_lock);
+	if (!down_write_trylock(&sbi->extent_tree_lock))
+		goto out;
+
 	radix_tree_for_each_slot(slot, &sbi->extent_tree_root, &iter,
 							F2FS_ROOT_INO(sbi)) {
 		struct extent_tree *et = (struct extent_tree *)*slot;
@@ -822,8 +823,10 @@ void f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink)
 		}
 	}
 	up_write(&sbi->extent_tree_lock);
-
+out:
 	trace_f2fs_shrink_extent_tree(sbi, node_cnt, tree_cnt);
+
+	return node_cnt + tree_cnt;
 }
 
 void f2fs_destroy_extent_tree(struct inode *inode)
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index e82af8c7ee8c..eeef3eb45f8e 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -1754,7 +1754,7 @@ void f2fs_submit_page_mbio(struct f2fs_io_info *);
 void set_data_blkaddr(struct dnode_of_data *);
 int reserve_new_block(struct dnode_of_data *);
 int f2fs_reserve_block(struct dnode_of_data *, pgoff_t);
-void f2fs_shrink_extent_tree(struct f2fs_sb_info *, int);
+unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *, int);
 void f2fs_destroy_extent_tree(struct inode *);
 void f2fs_init_extent_cache(struct inode *, struct f2fs_extent *);
 void f2fs_update_extent_cache(struct dnode_of_data *);
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index d5ee99258cbc..f7bfc3b7d934 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -304,7 +304,8 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi)
 void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi)
 {
 	/* try to shrink extent cache when there is no enough memory */
-	f2fs_shrink_extent_tree(sbi, EXTENT_CACHE_SHRINK_NUMBER);
+	if (!available_free_memory(sbi, EXTENT_CACHE))
+		f2fs_shrink_extent_tree(sbi, EXTENT_CACHE_SHRINK_NUMBER);
 
 	/* check the # of cached NAT entries */
 	if (!available_free_memory(sbi, NAT_ENTRIES))
diff --git a/fs/f2fs/shrinker.c b/fs/f2fs/shrinker.c
index c4bd6ee5936c..1f0a131be3d2 100644
--- a/fs/f2fs/shrinker.c
+++ b/fs/f2fs/shrinker.c
@@ -23,6 +23,11 @@ static unsigned long __count_nat_entries(struct f2fs_sb_info *sbi)
 	return NM_I(sbi)->nat_cnt - NM_I(sbi)->dirty_nat_cnt;
 }
 
+static unsigned long __count_extent_cache(struct f2fs_sb_info *sbi)
+{
+	return sbi->total_ext_tree + atomic_read(&sbi->total_ext_node);
+}
+
 unsigned long f2fs_shrink_count(struct shrinker *shrink,
 				struct shrink_control *sc)
 {
@@ -42,6 +47,9 @@ unsigned long f2fs_shrink_count(struct shrinker *shrink,
 		}
 		spin_unlock(&f2fs_list_lock);
 
+		/* count extent cache entries */
+		count += __count_extent_cache(sbi);
+
 		/* shrink clean nat cache entries */
 		count += __count_nat_entries(sbi);
 
@@ -82,8 +90,12 @@ unsigned long f2fs_shrink_scan(struct shrinker *shrink,
 
 		sbi->shrinker_run_no = run_no;
 
+		/* shrink extent cache entries */
+		freed += f2fs_shrink_extent_tree(sbi, nr >> 1);
+
 		/* shrink clean nat cache entries */
-		freed += try_to_free_nats(sbi, nr);
+		if (freed < nr)
+			freed += try_to_free_nats(sbi, nr - freed);
 
 		spin_lock(&f2fs_list_lock);
 		p = p->next;
-- 
cgit v1.2.3


From 7daaea256de42da112805703e3c77f08973156b3 Mon Sep 17 00:00:00 2001
From: Jaegeuk Kim <jaegeuk@kernel.org>
Date: Thu, 25 Jun 2015 17:43:04 -0700
Subject: f2fs: add noextent_cache mount option

This patch adds noextent_cache mount option.

Reviewed-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 Documentation/filesystems/f2fs.txt | 4 +++-
 fs/f2fs/super.c                    | 7 +++++++
 2 files changed, 10 insertions(+), 1 deletion(-)

(limited to 'fs/f2fs')

diff --git a/Documentation/filesystems/f2fs.txt b/Documentation/filesystems/f2fs.txt
index e9e750e59efc..e2d5105b7214 100644
--- a/Documentation/filesystems/f2fs.txt
+++ b/Documentation/filesystems/f2fs.txt
@@ -143,7 +143,9 @@ fastboot               This option is used when a system wants to reduce mount
 extent_cache           Enable an extent cache based on rb-tree, it can cache
                        as many as extent which map between contiguous logical
                        address and physical address per inode, resulting in
-                       increasing the cache hit ratio.
+                       increasing the cache hit ratio. Set by default.
+noextent_cache         Diable an extent cache based on rb-tree explicitly, see
+                       the above extent_cache mount option.
 noinline_data          Disable the inline data feature, inline data feature is
                        enabled by default.
 
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index bc7684b6d57a..92520228ce71 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -65,6 +65,7 @@ enum {
 	Opt_nobarrier,
 	Opt_fastboot,
 	Opt_extent_cache,
+	Opt_noextent_cache,
 	Opt_noinline_data,
 	Opt_err,
 };
@@ -88,6 +89,7 @@ static match_table_t f2fs_tokens = {
 	{Opt_nobarrier, "nobarrier"},
 	{Opt_fastboot, "fastboot"},
 	{Opt_extent_cache, "extent_cache"},
+	{Opt_noextent_cache, "noextent_cache"},
 	{Opt_noinline_data, "noinline_data"},
 	{Opt_err, NULL},
 };
@@ -389,6 +391,9 @@ static int parse_options(struct super_block *sb, char *options)
 		case Opt_extent_cache:
 			set_opt(sbi, EXTENT_CACHE);
 			break;
+		case Opt_noextent_cache:
+			clear_opt(sbi, EXTENT_CACHE);
+			break;
 		case Opt_noinline_data:
 			clear_opt(sbi, INLINE_DATA);
 			break;
@@ -662,6 +667,8 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root)
 		seq_puts(seq, ",fastboot");
 	if (test_opt(sbi, EXTENT_CACHE))
 		seq_puts(seq, ",extent_cache");
+	else
+		seq_puts(seq, ",noextent_cache");
 	seq_printf(seq, ",active_logs=%u", sbi->active_logs);
 
 	return 0;
-- 
cgit v1.2.3


From 3e72f721390dc14e7b33fda812843c0725810106 Mon Sep 17 00:00:00 2001
From: Jaegeuk Kim <jaegeuk@kernel.org>
Date: Fri, 19 Jun 2015 17:53:26 -0700
Subject: f2fs: use extent_cache by default

We don't need to handle the duplicate extent information.

The integrated rule is:
 - update on-disk extent with largest one tracked by in-memory extent_cache
 - destroy extent_tree for the truncation case
 - drop per-inode extent_cache by shrinker

Reviewed-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/data.c     | 357 ++++++++++++++++-------------------------------------
 fs/f2fs/f2fs.h     |  20 ++-
 fs/f2fs/inode.c    |  18 ++-
 fs/f2fs/namei.c    |   2 +
 fs/f2fs/shrinker.c |   2 +
 fs/f2fs/super.c    |   8 +-
 6 files changed, 142 insertions(+), 265 deletions(-)

(limited to 'fs/f2fs')

diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 55b2a79b3526..be0945cd9808 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -266,103 +266,6 @@ int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index)
 	return err;
 }
 
-static bool lookup_extent_info(struct inode *inode, pgoff_t pgofs,
-							struct extent_info *ei)
-{
-	struct f2fs_inode_info *fi = F2FS_I(inode);
-	pgoff_t start_fofs, end_fofs;
-	block_t start_blkaddr;
-
-	read_lock(&fi->ext_lock);
-	if (fi->ext.len == 0) {
-		read_unlock(&fi->ext_lock);
-		return false;
-	}
-
-	stat_inc_total_hit(inode->i_sb);
-
-	start_fofs = fi->ext.fofs;
-	end_fofs = fi->ext.fofs + fi->ext.len - 1;
-	start_blkaddr = fi->ext.blk;
-
-	if (pgofs >= start_fofs && pgofs <= end_fofs) {
-		*ei = fi->ext;
-		stat_inc_read_hit(inode->i_sb);
-		read_unlock(&fi->ext_lock);
-		return true;
-	}
-	read_unlock(&fi->ext_lock);
-	return false;
-}
-
-static bool update_extent_info(struct inode *inode, pgoff_t fofs,
-								block_t blkaddr)
-{
-	struct f2fs_inode_info *fi = F2FS_I(inode);
-	pgoff_t start_fofs, end_fofs;
-	block_t start_blkaddr, end_blkaddr;
-	int need_update = true;
-
-	write_lock(&fi->ext_lock);
-
-	start_fofs = fi->ext.fofs;
-	end_fofs = fi->ext.fofs + fi->ext.len - 1;
-	start_blkaddr = fi->ext.blk;
-	end_blkaddr = fi->ext.blk + fi->ext.len - 1;
-
-	/* Drop and initialize the matched extent */
-	if (fi->ext.len == 1 && fofs == start_fofs)
-		fi->ext.len = 0;
-
-	/* Initial extent */
-	if (fi->ext.len == 0) {
-		if (blkaddr != NULL_ADDR) {
-			fi->ext.fofs = fofs;
-			fi->ext.blk = blkaddr;
-			fi->ext.len = 1;
-		}
-		goto end_update;
-	}
-
-	/* Front merge */
-	if (fofs == start_fofs - 1 && blkaddr == start_blkaddr - 1) {
-		fi->ext.fofs--;
-		fi->ext.blk--;
-		fi->ext.len++;
-		goto end_update;
-	}
-
-	/* Back merge */
-	if (fofs == end_fofs + 1 && blkaddr == end_blkaddr + 1) {
-		fi->ext.len++;
-		goto end_update;
-	}
-
-	/* Split the existing extent */
-	if (fi->ext.len > 1 &&
-		fofs >= start_fofs && fofs <= end_fofs) {
-		if ((end_fofs - fofs) < (fi->ext.len >> 1)) {
-			fi->ext.len = fofs - start_fofs;
-		} else {
-			fi->ext.fofs = fofs + 1;
-			fi->ext.blk = start_blkaddr + fofs - start_fofs + 1;
-			fi->ext.len -= fofs - start_fofs + 1;
-		}
-	} else {
-		need_update = false;
-	}
-
-	/* Finally, if the extent is very fragmented, let's drop the cache. */
-	if (fi->ext.len < F2FS_MIN_EXTENT_LEN) {
-		fi->ext.len = 0;
-		set_inode_flag(fi, FI_NO_EXTENT);
-		need_update = true;
-	}
-end_update:
-	write_unlock(&fi->ext_lock);
-	return need_update;
-}
-
 static struct extent_node *__attach_extent_node(struct f2fs_sb_info *sbi,
 				struct extent_tree *et, struct extent_info *ei,
 				struct rb_node *parent, struct rb_node **p)
@@ -394,23 +297,6 @@ static void __detach_extent_node(struct f2fs_sb_info *sbi,
 		et->cached_en = NULL;
 }
 
-static struct extent_tree *__find_extent_tree(struct f2fs_sb_info *sbi,
-							nid_t ino)
-{
-	struct extent_tree *et;
-
-	down_read(&sbi->extent_tree_lock);
-	et = radix_tree_lookup(&sbi->extent_tree_root, ino);
-	if (!et) {
-		up_read(&sbi->extent_tree_lock);
-		return NULL;
-	}
-	atomic_inc(&et->refcount);
-	up_read(&sbi->extent_tree_lock);
-
-	return et;
-}
-
 static struct extent_tree *__grab_extent_tree(struct inode *inode)
 {
 	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
@@ -434,6 +320,9 @@ static struct extent_tree *__grab_extent_tree(struct inode *inode)
 	atomic_inc(&et->refcount);
 	up_write(&sbi->extent_tree_lock);
 
+	/* never died untill evict_inode */
+	F2FS_I(inode)->extent_tree = et;
+
 	return et;
 }
 
@@ -522,7 +411,7 @@ static struct extent_node *__insert_extent_tree(struct f2fs_sb_info *sbi,
 				en->ei.blk = ei->blk;
 				en->ei.len += ei->len;
 				*den = __try_back_merge(sbi, et, en);
-				return en;
+				goto update_out;
 			}
 			p = &(*p)->rb_left;
 		} else if (ei->fofs >= en->ei.fofs + en->ei.len) {
@@ -530,7 +419,7 @@ static struct extent_node *__insert_extent_tree(struct f2fs_sb_info *sbi,
 				f2fs_bug_on(sbi, !den);
 				en->ei.len += ei->len;
 				*den = __try_front_merge(sbi, et, en);
-				return en;
+				goto update_out;
 			}
 			p = &(*p)->rb_right;
 		} else {
@@ -538,7 +427,14 @@ static struct extent_node *__insert_extent_tree(struct f2fs_sb_info *sbi,
 		}
 	}
 
-	return __attach_extent_node(sbi, et, ei, parent, p);
+	en = __attach_extent_node(sbi, et, ei, parent, p);
+	if (!en)
+		return NULL;
+update_out:
+	if (en->ei.len > et->largest.len)
+		et->largest = en->ei;
+	et->cached_en = en;
+	return en;
 }
 
 static unsigned int __free_extent_tree(struct f2fs_sb_info *sbi,
@@ -570,51 +466,56 @@ static unsigned int __free_extent_tree(struct f2fs_sb_info *sbi,
 	return count - et->count;
 }
 
-static void f2fs_init_extent_tree(struct inode *inode,
-						struct f2fs_extent *i_ext)
+static void __drop_largest_extent(struct inode *inode, pgoff_t fofs)
+{
+	struct extent_info *largest = &F2FS_I(inode)->extent_tree->largest;
+
+	if (largest->fofs <= fofs && largest->fofs + largest->len > fofs)
+		largest->len = 0;
+}
+
+void f2fs_init_extent_tree(struct inode *inode, struct f2fs_extent *i_ext)
 {
 	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
 	struct extent_tree *et;
 	struct extent_node *en;
 	struct extent_info ei;
 
-	if (le32_to_cpu(i_ext->len) < F2FS_MIN_EXTENT_LEN)
+	if (!f2fs_may_extent_tree(inode))
 		return;
 
 	et = __grab_extent_tree(inode);
 
-	write_lock(&et->lock);
-	if (et->count)
-		goto out;
+	if (!i_ext || le32_to_cpu(i_ext->len) < F2FS_MIN_EXTENT_LEN)
+		return;
 
 	set_extent_info(&ei, le32_to_cpu(i_ext->fofs),
 		le32_to_cpu(i_ext->blk), le32_to_cpu(i_ext->len));
 
+	write_lock(&et->lock);
+	if (et->count)
+		goto out;
+
 	en = __insert_extent_tree(sbi, et, &ei, NULL);
 	if (en) {
-		et->cached_en = en;
-
 		spin_lock(&sbi->extent_lock);
 		list_add_tail(&en->list, &sbi->extent_list);
 		spin_unlock(&sbi->extent_lock);
 	}
 out:
 	write_unlock(&et->lock);
-	atomic_dec(&et->refcount);
 }
 
 static bool f2fs_lookup_extent_tree(struct inode *inode, pgoff_t pgofs,
 							struct extent_info *ei)
 {
 	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
-	struct extent_tree *et;
+	struct extent_tree *et = F2FS_I(inode)->extent_tree;
 	struct extent_node *en;
 
-	trace_f2fs_lookup_extent_tree_start(inode, pgofs);
+	f2fs_bug_on(sbi, !et);
 
-	et = __find_extent_tree(sbi, inode->i_ino);
-	if (!et)
-		return false;
+	trace_f2fs_lookup_extent_tree_start(inode, pgofs);
 
 	read_lock(&et->lock);
 	en = __lookup_extent_tree(et, pgofs);
@@ -631,27 +532,38 @@ static bool f2fs_lookup_extent_tree(struct inode *inode, pgoff_t pgofs,
 	read_unlock(&et->lock);
 
 	trace_f2fs_lookup_extent_tree_end(inode, pgofs, en);
-
-	atomic_dec(&et->refcount);
 	return en ? true : false;
 }
 
-static void f2fs_update_extent_tree(struct inode *inode, pgoff_t fofs,
+/* return true, if on-disk extent should be updated */
+static bool f2fs_update_extent_tree(struct inode *inode, pgoff_t fofs,
 							block_t blkaddr)
 {
 	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
-	struct extent_tree *et;
+	struct extent_tree *et = F2FS_I(inode)->extent_tree;
 	struct extent_node *en = NULL, *en1 = NULL, *en2 = NULL, *en3 = NULL;
 	struct extent_node *den = NULL;
-	struct extent_info ei, dei;
+	struct extent_info ei, dei, prev;
 	unsigned int endofs;
 
-	trace_f2fs_update_extent_tree(inode, fofs, blkaddr);
+	if (!et)
+		return false;
 
-	et = __grab_extent_tree(inode);
+	trace_f2fs_update_extent_tree(inode, fofs, blkaddr);
 
 	write_lock(&et->lock);
 
+	if (is_inode_flag_set(F2FS_I(inode), FI_NO_EXTENT)) {
+		write_unlock(&et->lock);
+		return false;
+	}
+
+	prev = et->largest;
+	dei.len = 0;
+
+	/* we do not guarantee that the largest extent is cached all the time */
+	__drop_largest_extent(inode, fofs);
+
 	/* 1. lookup and remove existing extent info in cache */
 	en = __lookup_extent_tree(et, fofs);
 	if (!en)
@@ -683,6 +595,14 @@ update_extent:
 	if (blkaddr) {
 		set_extent_info(&ei, fofs, blkaddr, 1);
 		en3 = __insert_extent_tree(sbi, et, &ei, &den);
+
+		/* give up extent_cache, if split and small updates happen */
+		if (dei.len >= 1 &&
+				prev.len < F2FS_MIN_EXTENT_LEN &&
+				et->largest.len < F2FS_MIN_EXTENT_LEN) {
+			et->largest.len = 0;
+			set_inode_flag(F2FS_I(inode), FI_NO_EXTENT);
+		}
 	}
 
 	/* 4. update in global extent list */
@@ -714,57 +634,12 @@ update_extent:
 	if (den)
 		kmem_cache_free(extent_node_slab, den);
 
-	write_unlock(&et->lock);
-	atomic_dec(&et->refcount);
-}
-
-void f2fs_preserve_extent_tree(struct inode *inode)
-{
-	struct extent_tree *et;
-	struct extent_info *ext = &F2FS_I(inode)->ext;
-	bool sync = false;
-
-	if (!test_opt(F2FS_I_SB(inode), EXTENT_CACHE))
-		return;
-
-	et = __find_extent_tree(F2FS_I_SB(inode), inode->i_ino);
-	if (!et) {
-		if (ext->len) {
-			ext->len = 0;
-			update_inode_page(inode);
-		}
-		return;
-	}
-
-	read_lock(&et->lock);
-	if (et->count) {
-		struct extent_node *en;
-
-		if (et->cached_en) {
-			en = et->cached_en;
-		} else {
-			struct rb_node *node = rb_first(&et->root);
-
-			if (!node)
-				node = rb_last(&et->root);
-			en = rb_entry(node, struct extent_node, rb_node);
-		}
-
-		if (__is_extent_same(ext, &en->ei))
-			goto out;
+	if (is_inode_flag_set(F2FS_I(inode), FI_NO_EXTENT))
+		__free_extent_tree(sbi, et, true);
 
-		*ext = en->ei;
-		sync = true;
-	} else if (ext->len) {
-		ext->len = 0;
-		sync = true;
-	}
-out:
-	read_unlock(&et->lock);
-	atomic_dec(&et->refcount);
+	write_unlock(&et->lock);
 
-	if (sync)
-		update_inode_page(inode);
+	return !__is_extent_same(&prev, &et->largest);
 }
 
 unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink)
@@ -772,8 +647,7 @@ unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink)
 	struct extent_tree *treevec[EXT_TREE_VEC_SIZE];
 	struct extent_node *en, *tmp;
 	unsigned long ino = F2FS_ROOT_INO(sbi);
-	struct radix_tree_iter iter;
-	void **slot;
+	struct radix_tree_root *root = &sbi->extent_tree_root;
 	unsigned int found;
 	unsigned int node_cnt = 0, tree_cnt = 0;
 
@@ -788,10 +662,10 @@ unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink)
 	}
 	spin_unlock(&sbi->extent_lock);
 
-	if (!down_read_trylock(&sbi->extent_tree_lock))
+	if (!down_write_trylock(&sbi->extent_tree_lock))
 		goto out;
 
-	while ((found = radix_tree_gang_lookup(&sbi->extent_tree_root,
+	while ((found = radix_tree_gang_lookup(root,
 				(void **)treevec, ino, EXT_TREE_VEC_SIZE))) {
 		unsigned i;
 
@@ -799,27 +673,15 @@ unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink)
 		for (i = 0; i < found; i++) {
 			struct extent_tree *et = treevec[i];
 
-			atomic_inc(&et->refcount);
 			write_lock(&et->lock);
 			node_cnt += __free_extent_tree(sbi, et, false);
 			write_unlock(&et->lock);
-			atomic_dec(&et->refcount);
-		}
-	}
-	up_read(&sbi->extent_tree_lock);
-
-	if (!down_write_trylock(&sbi->extent_tree_lock))
-		goto out;
-
-	radix_tree_for_each_slot(slot, &sbi->extent_tree_root, &iter,
-							F2FS_ROOT_INO(sbi)) {
-		struct extent_tree *et = (struct extent_tree *)*slot;
-
-		if (!atomic_read(&et->refcount) && !et->count) {
-			radix_tree_delete(&sbi->extent_tree_root, et->ino);
-			kmem_cache_free(extent_tree_slab, et);
-			sbi->total_ext_tree--;
-			tree_cnt++;
+			if (!atomic_read(&et->refcount) && !et->count) {
+				radix_tree_delete(root, et->ino);
+				kmem_cache_free(extent_tree_slab, et);
+				sbi->total_ext_tree--;
+				tree_cnt++;
+			}
 		}
 	}
 	up_write(&sbi->extent_tree_lock);
@@ -829,63 +691,61 @@ out:
 	return node_cnt + tree_cnt;
 }
 
-void f2fs_destroy_extent_tree(struct inode *inode)
+unsigned int f2fs_destroy_extent_node(struct inode *inode)
 {
 	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
-	struct extent_tree *et;
+	struct extent_tree *et = F2FS_I(inode)->extent_tree;
 	unsigned int node_cnt = 0;
 
-	if (!test_opt(sbi, EXTENT_CACHE))
-		return;
-
-	et = __find_extent_tree(sbi, inode->i_ino);
 	if (!et)
-		goto out;
+		return 0;
 
-	/* free all extent info belong to this extent tree */
 	write_lock(&et->lock);
 	node_cnt = __free_extent_tree(sbi, et, true);
 	write_unlock(&et->lock);
 
-	atomic_dec(&et->refcount);
+	return node_cnt;
+}
 
-	/* try to find and delete extent tree entry in radix tree */
-	down_write(&sbi->extent_tree_lock);
-	et = radix_tree_lookup(&sbi->extent_tree_root, inode->i_ino);
-	if (!et) {
-		up_write(&sbi->extent_tree_lock);
-		goto out;
+void f2fs_destroy_extent_tree(struct inode *inode)
+{
+	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+	struct extent_tree *et = F2FS_I(inode)->extent_tree;
+	unsigned int node_cnt = 0;
+
+	if (!et)
+		return;
+
+	if (inode->i_nlink && !is_bad_inode(inode) && et->count) {
+		atomic_dec(&et->refcount);
+		return;
 	}
+
+	/* free all extent info belong to this extent tree */
+	node_cnt = f2fs_destroy_extent_node(inode);
+
+	/* delete extent tree entry in radix tree */
+	down_write(&sbi->extent_tree_lock);
+	atomic_dec(&et->refcount);
 	f2fs_bug_on(sbi, atomic_read(&et->refcount) || et->count);
 	radix_tree_delete(&sbi->extent_tree_root, inode->i_ino);
 	kmem_cache_free(extent_tree_slab, et);
 	sbi->total_ext_tree--;
 	up_write(&sbi->extent_tree_lock);
-out:
-	trace_f2fs_destroy_extent_tree(inode, node_cnt);
-	return;
-}
 
-void f2fs_init_extent_cache(struct inode *inode, struct f2fs_extent *i_ext)
-{
-	if (test_opt(F2FS_I_SB(inode), EXTENT_CACHE))
-		f2fs_init_extent_tree(inode, i_ext);
+	F2FS_I(inode)->extent_tree = NULL;
 
-	write_lock(&F2FS_I(inode)->ext_lock);
-	get_extent_info(&F2FS_I(inode)->ext, *i_ext);
-	write_unlock(&F2FS_I(inode)->ext_lock);
+	trace_f2fs_destroy_extent_tree(inode, node_cnt);
+	return;
 }
 
 static bool f2fs_lookup_extent_cache(struct inode *inode, pgoff_t pgofs,
 							struct extent_info *ei)
 {
-	if (is_inode_flag_set(F2FS_I(inode), FI_NO_EXTENT))
+	if (!f2fs_may_extent_tree(inode))
 		return false;
 
-	if (test_opt(F2FS_I_SB(inode), EXTENT_CACHE))
-		return f2fs_lookup_extent_tree(inode, pgofs, ei);
-
-	return lookup_extent_info(inode, pgofs, ei);
+	return f2fs_lookup_extent_tree(inode, pgofs, ei);
 }
 
 void f2fs_update_extent_cache(struct dnode_of_data *dn)
@@ -893,19 +753,15 @@ void f2fs_update_extent_cache(struct dnode_of_data *dn)
 	struct f2fs_inode_info *fi = F2FS_I(dn->inode);
 	pgoff_t fofs;
 
-	f2fs_bug_on(F2FS_I_SB(dn->inode), dn->data_blkaddr == NEW_ADDR);
-
-	if (is_inode_flag_set(fi, FI_NO_EXTENT))
+	if (!f2fs_may_extent_tree(dn->inode))
 		return;
 
+	f2fs_bug_on(F2FS_I_SB(dn->inode), dn->data_blkaddr == NEW_ADDR);
+
 	fofs = start_bidx_of_node(ofs_of_node(dn->node_page), fi) +
 							dn->ofs_in_node;
 
-	/* we should call update_extent_info() to update on-disk extent */
-	if (test_opt(F2FS_I_SB(dn->inode), EXTENT_CACHE))
-		f2fs_update_extent_tree(dn->inode, fofs, dn->data_blkaddr);
-
-	if (update_extent_info(dn->inode, fofs, dn->data_blkaddr))
+	if (f2fs_update_extent_tree(dn->inode, fofs, dn->data_blkaddr))
 		sync_inode_page(dn);
 }
 
@@ -1109,8 +965,6 @@ alloc:
 
 	allocate_data_block(sbi, NULL, dn->data_blkaddr, &dn->data_blkaddr,
 								&sum, seg);
-
-	/* direct IO doesn't use extent cache to maximize the performance */
 	set_data_blkaddr(dn);
 
 	/* update i_size */
@@ -1119,6 +973,9 @@ alloc:
 	if (i_size_read(dn->inode) < ((fofs + 1) << PAGE_CACHE_SHIFT))
 		i_size_write(dn->inode, ((fofs + 1) << PAGE_CACHE_SHIFT));
 
+	/* direct IO doesn't use extent cache to maximize the performance */
+	__drop_largest_extent(dn->inode, fofs);
+
 	return 0;
 }
 
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index eeef3eb45f8e..1e6f54d8b464 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -349,6 +349,7 @@ struct extent_tree {
 	nid_t ino;			/* inode number */
 	struct rb_root root;		/* root of extent info rb-tree */
 	struct extent_node *cached_en;	/* recently accessed extent node */
+	struct extent_info largest;	/* largested extent info */
 	rwlock_t lock;			/* protect extent info rb-tree */
 	atomic_t refcount;		/* reference count of rb-tree */
 	unsigned int count;		/* # of extent node in rb-tree*/
@@ -420,14 +421,14 @@ struct f2fs_inode_info {
 	unsigned int clevel;		/* maximum level of given file name */
 	nid_t i_xattr_nid;		/* node id that contains xattrs */
 	unsigned long long xattr_ver;	/* cp version of xattr modification */
-	struct extent_info ext;		/* in-memory extent cache entry */
-	rwlock_t ext_lock;		/* rwlock for single extent cache */
 	struct inode_entry *dirty_dir;	/* the pointer of dirty dir */
 
 	struct radix_tree_root inmem_root;	/* radix tree for inmem pages */
 	struct list_head inmem_pages;	/* inmemory pages managed by f2fs */
 	struct mutex inmem_lock;	/* lock for inmemory pages */
 
+	struct extent_tree *extent_tree;	/* cached extent_tree entry */
+
 #ifdef CONFIG_F2FS_FS_ENCRYPTION
 	/* Encryption params */
 	struct f2fs_crypt_info *i_crypt_info;
@@ -1548,6 +1549,17 @@ static inline bool is_dot_dotdot(const struct qstr *str)
 	return false;
 }
 
+static inline bool f2fs_may_extent_tree(struct inode *inode)
+{
+	mode_t mode = inode->i_mode;
+
+	if (!test_opt(F2FS_I_SB(inode), EXTENT_CACHE) ||
+			is_inode_flag_set(F2FS_I(inode), FI_NO_EXTENT))
+		return false;
+
+	return S_ISREG(mode);
+}
+
 #define get_inode_mode(i) \
 	((is_inode_flag_set(F2FS_I(i), FI_ACL_MODE)) ? \
 	 (F2FS_I(i)->i_acl_mode) : ((i)->i_mode))
@@ -1755,10 +1767,10 @@ void set_data_blkaddr(struct dnode_of_data *);
 int reserve_new_block(struct dnode_of_data *);
 int f2fs_reserve_block(struct dnode_of_data *, pgoff_t);
 unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *, int);
+void f2fs_init_extent_tree(struct inode *, struct f2fs_extent *);
+unsigned int f2fs_destroy_extent_node(struct inode *);
 void f2fs_destroy_extent_tree(struct inode *);
-void f2fs_init_extent_cache(struct inode *, struct f2fs_extent *);
 void f2fs_update_extent_cache(struct dnode_of_data *);
-void f2fs_preserve_extent_tree(struct inode *);
 struct page *get_read_data_page(struct inode *, pgoff_t, int);
 struct page *find_data_page(struct inode *, pgoff_t);
 struct page *get_lock_data_page(struct inode *, pgoff_t);
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index 757fed253697..978a7261a791 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -139,7 +139,7 @@ static int do_read_inode(struct inode *inode)
 	fi->i_pino = le32_to_cpu(ri->i_pino);
 	fi->i_dir_level = ri->i_dir_level;
 
-	f2fs_init_extent_cache(inode, &ri->i_ext);
+	f2fs_init_extent_tree(inode, &ri->i_ext);
 
 	get_inline_info(fi, ri);
 
@@ -237,10 +237,11 @@ void update_inode(struct inode *inode, struct page *node_page)
 	ri->i_size = cpu_to_le64(i_size_read(inode));
 	ri->i_blocks = cpu_to_le64(inode->i_blocks);
 
-	read_lock(&F2FS_I(inode)->ext_lock);
-	set_raw_extent(&F2FS_I(inode)->ext, &ri->i_ext);
-	read_unlock(&F2FS_I(inode)->ext_lock);
-
+	if (F2FS_I(inode)->extent_tree)
+		set_raw_extent(&F2FS_I(inode)->extent_tree->largest,
+							&ri->i_ext);
+	else
+		memset(&ri->i_ext, 0, sizeof(ri->i_ext));
 	set_raw_inline(F2FS_I(inode), ri);
 
 	ri->i_atime = cpu_to_le64(inode->i_atime.tv_sec);
@@ -331,6 +332,8 @@ void f2fs_evict_inode(struct inode *inode)
 	f2fs_bug_on(sbi, get_dirty_pages(inode));
 	remove_dirty_dir_inode(inode);
 
+	f2fs_destroy_extent_tree(inode);
+
 	if (inode->i_nlink || is_bad_inode(inode))
 		goto no_delete;
 
@@ -350,11 +353,6 @@ no_delete:
 	stat_dec_inline_dir(inode);
 	stat_dec_inline_inode(inode);
 
-	/* update extent info in inode */
-	if (inode->i_nlink)
-		f2fs_preserve_extent_tree(inode);
-	f2fs_destroy_extent_tree(inode);
-
 	invalidate_mapping_pages(NODE_MAPPING(sbi), inode->i_ino, inode->i_ino);
 	if (xnid)
 		invalidate_mapping_pages(NODE_MAPPING(sbi), xnid, xnid);
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index 08656fca8f83..df315dcdd35d 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -65,6 +65,8 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode)
 	if (f2fs_may_inline_dentry(inode))
 		set_inode_flag(F2FS_I(inode), FI_INLINE_DENTRY);
 
+	f2fs_init_extent_tree(inode, NULL);
+
 	stat_inc_inline_inode(inode);
 	stat_inc_inline_dir(inode);
 
diff --git a/fs/f2fs/shrinker.c b/fs/f2fs/shrinker.c
index 1f0a131be3d2..9aa4235cd304 100644
--- a/fs/f2fs/shrinker.c
+++ b/fs/f2fs/shrinker.c
@@ -117,6 +117,8 @@ void f2fs_join_shrinker(struct f2fs_sb_info *sbi)
 
 void f2fs_leave_shrinker(struct f2fs_sb_info *sbi)
 {
+	f2fs_shrink_extent_tree(sbi, __count_extent_cache(sbi));
+
 	spin_lock(&f2fs_list_lock);
 	list_del(&sbi->s_list);
 	spin_unlock(&f2fs_list_lock);
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 92520228ce71..0083b8559c9b 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -422,7 +422,6 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb)
 	atomic_set(&fi->dirty_pages, 0);
 	fi->i_current_depth = 1;
 	fi->i_advise = 0;
-	rwlock_init(&fi->ext_lock);
 	init_rwsem(&fi->i_sem);
 	INIT_RADIX_TREE(&fi->inmem_root, GFP_NOFS);
 	INIT_LIST_HEAD(&fi->inmem_pages);
@@ -453,12 +452,17 @@ static int f2fs_drop_inode(struct inode *inode)
 	 */
 	if (!inode_unhashed(inode) && inode->i_state & I_SYNC) {
 		if (!inode->i_nlink && !is_bad_inode(inode)) {
+			/* to avoid evict_inode call simultaneously */
+			atomic_inc(&inode->i_count);
 			spin_unlock(&inode->i_lock);
 
 			/* some remained atomic pages should discarded */
 			if (f2fs_is_atomic_file(inode))
 				commit_inmem_pages(inode, true);
 
+			/* should remain fi->extent_tree for writepage */
+			f2fs_destroy_extent_node(inode);
+
 			sb_start_intwrite(inode->i_sb);
 			i_size_write(inode, 0);
 
@@ -473,6 +477,7 @@ static int f2fs_drop_inode(struct inode *inode)
 					F2FS_I(inode)->i_crypt_info);
 #endif
 			spin_lock(&inode->i_lock);
+			atomic_dec(&inode->i_count);
 		}
 		return 0;
 	}
@@ -721,6 +726,7 @@ static void default_options(struct f2fs_sb_info *sbi)
 
 	set_opt(sbi, BG_GC);
 	set_opt(sbi, INLINE_DATA);
+	set_opt(sbi, EXTENT_CACHE);
 
 #ifdef CONFIG_F2FS_FS_XATTR
 	set_opt(sbi, XATTR_USER);
-- 
cgit v1.2.3


From 84bc926c076963d5b992640f5c8d242754801fd6 Mon Sep 17 00:00:00 2001
From: Jaegeuk Kim <jaegeuk@kernel.org>
Date: Mon, 29 Jun 2015 16:01:14 -0700
Subject: f2fs: check the largest extent at look-up time

Because of the extent shrinker or other -ENOMEM scenarios, we cannot guarantee
that the largest extent is cached in the tree all the time.

Instead of relying only on the nodes in the extent tree, we can simply check
the largest extent cached in the extent_tree first at look-up time.

Reviewed-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/data.c              | 16 ++++++++++++++--
 include/trace/events/f2fs.h | 12 ++++++------
 2 files changed, 20 insertions(+), 8 deletions(-)

(limited to 'fs/f2fs')

diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index be0945cd9808..cdc1c2b781b8 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -512,12 +512,22 @@ static bool f2fs_lookup_extent_tree(struct inode *inode, pgoff_t pgofs,
 	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
 	struct extent_tree *et = F2FS_I(inode)->extent_tree;
 	struct extent_node *en;
+	bool ret = false;
 
 	f2fs_bug_on(sbi, !et);
 
 	trace_f2fs_lookup_extent_tree_start(inode, pgofs);
 
 	read_lock(&et->lock);
+
+	if (et->largest.fofs <= pgofs &&
+			et->largest.fofs + et->largest.len > pgofs) {
+		*ei = et->largest;
+		ret = true;
+		stat_inc_read_hit(sbi->sb);
+		goto out;
+	}
+
 	en = __lookup_extent_tree(et, pgofs);
 	if (en) {
 		*ei = en->ei;
@@ -526,13 +536,15 @@ static bool f2fs_lookup_extent_tree(struct inode *inode, pgoff_t pgofs,
 			list_move_tail(&en->list, &sbi->extent_list);
 		et->cached_en = en;
 		spin_unlock(&sbi->extent_lock);
+		ret = true;
 		stat_inc_read_hit(sbi->sb);
 	}
+out:
 	stat_inc_total_hit(sbi->sb);
 	read_unlock(&et->lock);
 
-	trace_f2fs_lookup_extent_tree_end(inode, pgofs, en);
-	return en ? true : false;
+	trace_f2fs_lookup_extent_tree_end(inode, pgofs, ei);
+	return ret;
 }
 
 /* return true, if on-disk extent should be updated */
diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h
index 04856a2d8c82..a01946514b5a 100644
--- a/include/trace/events/f2fs.h
+++ b/include/trace/events/f2fs.h
@@ -1099,11 +1099,11 @@ TRACE_EVENT(f2fs_lookup_extent_tree_start,
 TRACE_EVENT_CONDITION(f2fs_lookup_extent_tree_end,
 
 	TP_PROTO(struct inode *inode, unsigned int pgofs,
-						struct extent_node *en),
+						struct extent_info *ei),
 
-	TP_ARGS(inode, pgofs, en),
+	TP_ARGS(inode, pgofs, ei),
 
-	TP_CONDITION(en),
+	TP_CONDITION(ei),
 
 	TP_STRUCT__entry(
 		__field(dev_t,	dev)
@@ -1118,9 +1118,9 @@ TRACE_EVENT_CONDITION(f2fs_lookup_extent_tree_end,
 		__entry->dev = inode->i_sb->s_dev;
 		__entry->ino = inode->i_ino;
 		__entry->pgofs = pgofs;
-		__entry->fofs = en->ei.fofs;
-		__entry->blk = en->ei.blk;
-		__entry->len = en->ei.len;
+		__entry->fofs = ei->fofs;
+		__entry->blk = ei->blk;
+		__entry->len = ei->len;
 	),
 
 	TP_printk("dev = (%d,%d), ino = %lu, pgofs = %u, "
-- 
cgit v1.2.3


From c1079892f4e8ecfd1bbc525cbfc1bd46b470888e Mon Sep 17 00:00:00 2001
From: Nicholas Krause <xerofoify@gmail.com>
Date: Tue, 30 Jun 2015 21:37:21 -0400
Subject: f2fs: make the function check_dnode have a return type of bool and
 change its name to is_alive

This makes the function check_dnode have a return type of bool,
because this function only ever returns either one or zero as its
return value. It also changes the name of the function to is_alive
in order to better explain the function's intended work of checking
whether a dnode is still in use by the filesystem.

Signed-off-by: Nicholas Krause <xerofoify@gmail.com>
[Jaegeuk Kim: change the return value check for the renamed function]
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/gc.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'fs/f2fs')

diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index 22fb5ef37966..2701e05af991 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -487,7 +487,7 @@ block_t start_bidx_of_node(unsigned int node_ofs, struct f2fs_inode_info *fi)
 	return bidx * ADDRS_PER_BLOCK + ADDRS_PER_INODE(fi);
 }
 
-static int check_dnode(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
+static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
 		struct node_info *dni, block_t blkaddr, unsigned int *nofs)
 {
 	struct page *node_page;
@@ -500,13 +500,13 @@ static int check_dnode(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
 
 	node_page = get_node_page(sbi, nid);
 	if (IS_ERR(node_page))
-		return 0;
+		return false;
 
 	get_node_info(sbi, nid, dni);
 
 	if (sum->version != dni->version) {
 		f2fs_put_page(node_page, 1);
-		return 0;
+		return false;
 	}
 
 	*nofs = ofs_of_node(node_page);
@@ -514,8 +514,8 @@ static int check_dnode(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
 	f2fs_put_page(node_page, 1);
 
 	if (source_blkaddr != blkaddr)
-		return 0;
-	return 1;
+		return false;
+	return true;
 }
 
 static void move_encrypted_block(struct inode *inode, block_t bidx)
@@ -670,7 +670,7 @@ next_step:
 		}
 
 		/* Get an inode by ino with checking validity */
-		if (check_dnode(sbi, entry, &dni, start_addr + off, &nofs) == 0)
+		if (!is_alive(sbi, entry, &dni, start_addr + off, &nofs))
 			continue;
 
 		if (phase == 1) {
-- 
cgit v1.2.3


From 741a7bea79eae6361c8d7499f1f6a900b65c120e Mon Sep 17 00:00:00 2001
From: Chao Yu <chao2.yu@samsung.com>
Date: Mon, 6 Jul 2015 20:30:40 +0800
Subject: f2fs: restrict multimedia filename

When testing with fs_mark, some blocks were written out as cold
data which were mixed with warm data, resulting in splitting more
bios.

This is because fs_mark creates files with random filenames such as
the ones below:

559551ee~~~~~~~~15Z29OCC05JCKQP60JQ42MKV
559551ee~~~~~~~~NZAZ6X8OA8LHIIP6XD0L58RM
559551ef~~~~~~~~B15YDSWAK789HPSDZKYTW6WM
559551f1~~~~~~~~2DAE5DPS79785BUNTFWBEMP3
559551f1~~~~~~~~1MYDY0BKSQCJPI32Q8C514RM
559551f1~~~~~~~~YQOTMAOMN5CVRFOUNI026MP4
559551f3~~~~~~~~1WF42LPRTQJNPPGR3EINKMPE
559551f3~~~~~~~~8Y2NRK7CEPPAA02LY936PJPG

They are regarded as cold files since their filenames end with a
multimedia file extension, but this is wrong because we only match
the trailing characters of the filename, not a real '.' extension.

In this patch, we restrict the format of a multimedia filename to:
"filename + '.' + extension", and we mark a file as cold only if its
filename matches that format.

After this change, files are less likely to be wrongly marked as
cold, which also helps fs_mark's performance on f2fs a little.

Signed-off-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/namei.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'fs/f2fs')

diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index df315dcdd35d..1856d5ecd809 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -91,7 +91,14 @@ static int is_multimedia_file(const unsigned char *s, const char *sub)
 	size_t slen = strlen(s);
 	size_t sublen = strlen(sub);
 
-	if (sublen > slen)
+	/*
+	 * filename format of multimedia file should be defined as:
+	 * "filename + '.' + extension".
+	 */
+	if (slen < sublen + 2)
+		return 0;
+
+	if (s[slen - sublen - 1] != '.')
 		return 0;
 
 	return !strncasecmp(s + slen - sublen, sub, sublen);
-- 
cgit v1.2.3


From bb96a8d51e523c162b436c4545eb1a4e9f9f530e Mon Sep 17 00:00:00 2001
From: Chao Yu <chao2.yu@samsung.com>
Date: Mon, 6 Jul 2015 20:31:49 +0800
Subject: f2fs: enhance multithread performance

In ->writepages, we use the writepages mutex lock to serialize all block
address allocation and page submission pairs from different inodes.
This method lets the delayed dirty pages of one inode be written out
continuously as much as possible.

But there is one problem: we did not submit the current cached bio inside
the region protected by the writepages mutex lock, so there is a small
chance that we submit another thread's bio as shown below, resulting in
more bio splits.

thread 1			thread 2
->writepages
  lock(writepages)
  ->write_cache_pages
  unlock(writepages)
				  lock(writepages)
				  ->write_cache_pages
  ->f2fs_submit_merged_bio
				    ->writepage
				  unlock(writepages)

fs_mark-6535  [002] ....  2242.270230: f2fs_submit_write_bio: dev = (1,0), WRITE_SYNC, DATA, sector = 5766152, size = 524288
fs_mark-6536  [000] ....  2242.270361: f2fs_submit_write_bio: dev = (1,0), WRITE_SYNC, DATA, sector = 5767176, size = 4096
fs_mark-6536  [000] ....  2242.270370: f2fs_submit_write_bio: dev = (1,0), WRITE_SYNC, NODE, sector = 8138112, size = 4096
fs_mark-6535  [002] ....  2242.270776: f2fs_submit_write_bio: dev = (1,0), WRITE_SYNC, DATA, sector = 5767184, size = 516096

This may noticeably increase the time spent in the block layer, and may
cause larger IO latency.

This patch moves the submitting operation into the region protected by the
writepages mutex lock to avoid bio splits when concurrent writeback is
intensive.

my test environment: virtual machine,
intel cpu i5 2500, 8GB size memory, 4GB size ramdisk

time fs_mark  -t  16  -L  1  -s  524288  -S  1  -d  /mnt/f2fs/

before:
real	0m4.244s
user	0m0.088s
sys	0m12.336s

after:
real	0m3.822s
user	0m0.072s
sys	0m10.760s

Signed-off-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/data.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'fs/f2fs')

diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index cdc1c2b781b8..3e4402f661d7 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -1672,11 +1672,10 @@ static int f2fs_write_data_pages(struct address_space *mapping,
 		locked = true;
 	}
 	ret = write_cache_pages(mapping, wbc, __f2fs_writepage, mapping);
+	f2fs_submit_merged_bio(sbi, DATA, WRITE);
 	if (locked)
 		mutex_unlock(&sbi->writepages);
 
-	f2fs_submit_merged_bio(sbi, DATA, WRITE);
-
 	remove_dirty_dir_inode(inode);
 
 	wbc->nr_to_write = max((long)0, wbc->nr_to_write - diff);
-- 
cgit v1.2.3


From 7023a1ad17f4e21acb74167ab647cd123d9eb801 Mon Sep 17 00:00:00 2001
From: Jaegeuk Kim <jaegeuk@kernel.org>
Date: Mon, 29 Jun 2015 16:34:39 -0700
Subject: f2fs: shrink unreferenced extent_caches first

If an extent_tree entry has a zero reference count, we can drop it from the
cache in higher priority rather than currently referencing entries.

Reviewed-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/data.c | 51 +++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 41 insertions(+), 10 deletions(-)

(limited to 'fs/f2fs')

diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 3e4402f661d7..c9d0f8b06d15 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -662,21 +662,54 @@ unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink)
 	struct radix_tree_root *root = &sbi->extent_tree_root;
 	unsigned int found;
 	unsigned int node_cnt = 0, tree_cnt = 0;
+	int remained;
 
 	if (!test_opt(sbi, EXTENT_CACHE))
 		return 0;
 
+	if (!down_write_trylock(&sbi->extent_tree_lock))
+		goto out;
+
+	/* 1. remove unreferenced extent tree */
+	while ((found = radix_tree_gang_lookup(root,
+				(void **)treevec, ino, EXT_TREE_VEC_SIZE))) {
+		unsigned i;
+
+		ino = treevec[found - 1]->ino + 1;
+		for (i = 0; i < found; i++) {
+			struct extent_tree *et = treevec[i];
+
+			if (!atomic_read(&et->refcount)) {
+				write_lock(&et->lock);
+				node_cnt += __free_extent_tree(sbi, et, true);
+				write_unlock(&et->lock);
+
+				radix_tree_delete(root, et->ino);
+				kmem_cache_free(extent_tree_slab, et);
+				sbi->total_ext_tree--;
+				tree_cnt++;
+
+				if (node_cnt + tree_cnt >= nr_shrink)
+					goto unlock_out;
+			}
+		}
+	}
+	up_write(&sbi->extent_tree_lock);
+
+	/* 2. remove LRU extent entries */
+	if (!down_write_trylock(&sbi->extent_tree_lock))
+		goto out;
+
+	remained = nr_shrink - (node_cnt + tree_cnt);
+
 	spin_lock(&sbi->extent_lock);
 	list_for_each_entry_safe(en, tmp, &sbi->extent_list, list) {
-		if (!nr_shrink--)
+		if (!remained--)
 			break;
 		list_del_init(&en->list);
 	}
 	spin_unlock(&sbi->extent_lock);
 
-	if (!down_write_trylock(&sbi->extent_tree_lock))
-		goto out;
-
 	while ((found = radix_tree_gang_lookup(root,
 				(void **)treevec, ino, EXT_TREE_VEC_SIZE))) {
 		unsigned i;
@@ -688,14 +721,12 @@ unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink)
 			write_lock(&et->lock);
 			node_cnt += __free_extent_tree(sbi, et, false);
 			write_unlock(&et->lock);
-			if (!atomic_read(&et->refcount) && !et->count) {
-				radix_tree_delete(root, et->ino);
-				kmem_cache_free(extent_tree_slab, et);
-				sbi->total_ext_tree--;
-				tree_cnt++;
-			}
+
+			if (node_cnt + tree_cnt >= nr_shrink)
+				break;
 		}
 	}
+unlock_out:
 	up_write(&sbi->extent_tree_lock);
 out:
 	trace_f2fs_shrink_extent_tree(sbi, node_cnt, tree_cnt);
-- 
cgit v1.2.3


From 90d4388ac2cec0c83cad7315d3cd0065553430e1 Mon Sep 17 00:00:00 2001
From: Chao Yu <chao2.yu@samsung.com>
Date: Wed, 8 Jul 2015 18:24:38 +0800
Subject: f2fs: fix to update page flag

This patch fixes to update page flag (e.g. Uptodate/cold flag) in
->write_begin.

Otherwise, the page will be non-uptodate when we try to write an entire
page, and the cold data flag in the page will not be cleared when a gced
page is being rewritten.

Signed-off-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/data.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

(limited to 'fs/f2fs')

diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index c9d0f8b06d15..de55c088948f 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -1790,8 +1790,10 @@ put_next:
 	f2fs_put_dnode(&dn);
 	f2fs_unlock_op(sbi);
 
-	if ((len == PAGE_CACHE_SIZE) || PageUptodate(page))
-		return 0;
+	if (len == PAGE_CACHE_SIZE)
+		goto out_update;
+	if (PageUptodate(page))
+		goto out_clear;
 
 	f2fs_wait_on_page_writeback(page, DATA);
 
@@ -1801,7 +1803,7 @@ put_next:
 
 		/* Reading beyond i_size is simple: memset to zero */
 		zero_user_segments(page, 0, start, end, PAGE_CACHE_SIZE);
-		goto out;
+		goto out_update;
 	}
 
 	if (dn.data_blkaddr == NEW_ADDR) {
@@ -1839,8 +1841,9 @@ put_next:
 			}
 		}
 	}
-out:
+out_update:
 	SetPageUptodate(page);
+out_clear:
 	clear_cold_data(page);
 	return 0;
 
-- 
cgit v1.2.3


From 3c7df87dad065a4656b13115593c59c8a324a108 Mon Sep 17 00:00:00 2001
From: Fan Li <fanofcode.li@samsung.com>
Date: Wed, 8 Jul 2015 16:02:54 +0800
Subject: f2fs: don't try to split extents shorter than F2FS_MIN_EXTENT_LEN

Since only parts of extents longer than F2FS_MIN_EXTENT_LEN will
be kept in extent cache after split, extents already shorter than
F2FS_MIN_EXTENT_LEN don't need to try split at all.

Signed-off-by: Fan Li <fanofcode.li@samsung.com>
Reviewed-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/data.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs/f2fs')

diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index de55c088948f..ce0d5ec8e770 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -585,7 +585,7 @@ static bool f2fs_update_extent_tree(struct inode *inode, pgoff_t fofs,
 	__detach_extent_node(sbi, et, en);
 
 	/* 2. if extent can be split more, split and insert the left part */
-	if (dei.len > 1) {
+	if (dei.len > F2FS_MIN_EXTENT_LEN) {
 		/*  insert left part of split extent into cache */
 		if (fofs - dei.fofs >= F2FS_MIN_EXTENT_LEN) {
 			set_extent_info(&ei, dei.fofs, dei.blk,
-- 
cgit v1.2.3


From a28ef1f5aebe1068fc5fd65c4699c1c3b1e9094b Mon Sep 17 00:00:00 2001
From: Chao Yu <chao2.yu@samsung.com>
Date: Wed, 8 Jul 2015 17:59:36 +0800
Subject: f2fs: maintain extent cache in separated file

This patch moves extent cache related code from data.c into extent_cache.c.
Since the extent cache is an independent feature and its code is not related
to the rest of data.c, it is better for us to maintain it in a separate place.

There is no functionality change, but several small coding style fixes
including:
* rename __drop_largest_extent to f2fs_drop_largest_extent for exporting;
* rename misspelled word 'untill' to 'until';
* remove unneeded 'return' in the end of f2fs_destroy_extent_tree().

Signed-off-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/Makefile       |   2 +-
 fs/f2fs/data.c         | 578 +----------------------------------------------
 fs/f2fs/extent_cache.c | 594 +++++++++++++++++++++++++++++++++++++++++++++++++
 fs/f2fs/f2fs.h         |  22 +-
 4 files changed, 610 insertions(+), 586 deletions(-)
 create mode 100644 fs/f2fs/extent_cache.c

(limited to 'fs/f2fs')

diff --git a/fs/f2fs/Makefile b/fs/f2fs/Makefile
index 005251b8d459..08e101ed914c 100644
--- a/fs/f2fs/Makefile
+++ b/fs/f2fs/Makefile
@@ -2,7 +2,7 @@ obj-$(CONFIG_F2FS_FS) += f2fs.o
 
 f2fs-y		:= dir.o file.o inode.o namei.o hash.o super.o inline.o
 f2fs-y		+= checkpoint.o gc.o data.o node.o segment.o recovery.o
-f2fs-y		+= shrinker.o
+f2fs-y		+= shrinker.o extent_cache.o
 f2fs-$(CONFIG_F2FS_STAT_FS) += debug.o
 f2fs-$(CONFIG_F2FS_FS_XATTR) += xattr.o
 f2fs-$(CONFIG_F2FS_FS_POSIX_ACL) += acl.o
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index ce0d5ec8e770..ef30b59756c6 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -26,9 +26,6 @@
 #include "trace.h"
 #include <trace/events/f2fs.h>
 
-static struct kmem_cache *extent_tree_slab;
-static struct kmem_cache *extent_node_slab;
-
 static void f2fs_read_end_io(struct bio *bio, int err)
 {
 	struct bio_vec *bvec;
@@ -266,548 +263,6 @@ int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index)
 	return err;
 }
 
-static struct extent_node *__attach_extent_node(struct f2fs_sb_info *sbi,
-				struct extent_tree *et, struct extent_info *ei,
-				struct rb_node *parent, struct rb_node **p)
-{
-	struct extent_node *en;
-
-	en = kmem_cache_alloc(extent_node_slab, GFP_ATOMIC);
-	if (!en)
-		return NULL;
-
-	en->ei = *ei;
-	INIT_LIST_HEAD(&en->list);
-
-	rb_link_node(&en->rb_node, parent, p);
-	rb_insert_color(&en->rb_node, &et->root);
-	et->count++;
-	atomic_inc(&sbi->total_ext_node);
-	return en;
-}
-
-static void __detach_extent_node(struct f2fs_sb_info *sbi,
-				struct extent_tree *et, struct extent_node *en)
-{
-	rb_erase(&en->rb_node, &et->root);
-	et->count--;
-	atomic_dec(&sbi->total_ext_node);
-
-	if (et->cached_en == en)
-		et->cached_en = NULL;
-}
-
-static struct extent_tree *__grab_extent_tree(struct inode *inode)
-{
-	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
-	struct extent_tree *et;
-	nid_t ino = inode->i_ino;
-
-	down_write(&sbi->extent_tree_lock);
-	et = radix_tree_lookup(&sbi->extent_tree_root, ino);
-	if (!et) {
-		et = f2fs_kmem_cache_alloc(extent_tree_slab, GFP_NOFS);
-		f2fs_radix_tree_insert(&sbi->extent_tree_root, ino, et);
-		memset(et, 0, sizeof(struct extent_tree));
-		et->ino = ino;
-		et->root = RB_ROOT;
-		et->cached_en = NULL;
-		rwlock_init(&et->lock);
-		atomic_set(&et->refcount, 0);
-		et->count = 0;
-		sbi->total_ext_tree++;
-	}
-	atomic_inc(&et->refcount);
-	up_write(&sbi->extent_tree_lock);
-
-	/* never died untill evict_inode */
-	F2FS_I(inode)->extent_tree = et;
-
-	return et;
-}
-
-static struct extent_node *__lookup_extent_tree(struct extent_tree *et,
-							unsigned int fofs)
-{
-	struct rb_node *node = et->root.rb_node;
-	struct extent_node *en;
-
-	if (et->cached_en) {
-		struct extent_info *cei = &et->cached_en->ei;
-
-		if (cei->fofs <= fofs && cei->fofs + cei->len > fofs)
-			return et->cached_en;
-	}
-
-	while (node) {
-		en = rb_entry(node, struct extent_node, rb_node);
-
-		if (fofs < en->ei.fofs)
-			node = node->rb_left;
-		else if (fofs >= en->ei.fofs + en->ei.len)
-			node = node->rb_right;
-		else
-			return en;
-	}
-	return NULL;
-}
-
-static struct extent_node *__try_back_merge(struct f2fs_sb_info *sbi,
-				struct extent_tree *et, struct extent_node *en)
-{
-	struct extent_node *prev;
-	struct rb_node *node;
-
-	node = rb_prev(&en->rb_node);
-	if (!node)
-		return NULL;
-
-	prev = rb_entry(node, struct extent_node, rb_node);
-	if (__is_back_mergeable(&en->ei, &prev->ei)) {
-		en->ei.fofs = prev->ei.fofs;
-		en->ei.blk = prev->ei.blk;
-		en->ei.len += prev->ei.len;
-		__detach_extent_node(sbi, et, prev);
-		return prev;
-	}
-	return NULL;
-}
-
-static struct extent_node *__try_front_merge(struct f2fs_sb_info *sbi,
-				struct extent_tree *et, struct extent_node *en)
-{
-	struct extent_node *next;
-	struct rb_node *node;
-
-	node = rb_next(&en->rb_node);
-	if (!node)
-		return NULL;
-
-	next = rb_entry(node, struct extent_node, rb_node);
-	if (__is_front_mergeable(&en->ei, &next->ei)) {
-		en->ei.len += next->ei.len;
-		__detach_extent_node(sbi, et, next);
-		return next;
-	}
-	return NULL;
-}
-
-static struct extent_node *__insert_extent_tree(struct f2fs_sb_info *sbi,
-				struct extent_tree *et, struct extent_info *ei,
-				struct extent_node **den)
-{
-	struct rb_node **p = &et->root.rb_node;
-	struct rb_node *parent = NULL;
-	struct extent_node *en;
-
-	while (*p) {
-		parent = *p;
-		en = rb_entry(parent, struct extent_node, rb_node);
-
-		if (ei->fofs < en->ei.fofs) {
-			if (__is_front_mergeable(ei, &en->ei)) {
-				f2fs_bug_on(sbi, !den);
-				en->ei.fofs = ei->fofs;
-				en->ei.blk = ei->blk;
-				en->ei.len += ei->len;
-				*den = __try_back_merge(sbi, et, en);
-				goto update_out;
-			}
-			p = &(*p)->rb_left;
-		} else if (ei->fofs >= en->ei.fofs + en->ei.len) {
-			if (__is_back_mergeable(ei, &en->ei)) {
-				f2fs_bug_on(sbi, !den);
-				en->ei.len += ei->len;
-				*den = __try_front_merge(sbi, et, en);
-				goto update_out;
-			}
-			p = &(*p)->rb_right;
-		} else {
-			f2fs_bug_on(sbi, 1);
-		}
-	}
-
-	en = __attach_extent_node(sbi, et, ei, parent, p);
-	if (!en)
-		return NULL;
-update_out:
-	if (en->ei.len > et->largest.len)
-		et->largest = en->ei;
-	et->cached_en = en;
-	return en;
-}
-
-static unsigned int __free_extent_tree(struct f2fs_sb_info *sbi,
-					struct extent_tree *et, bool free_all)
-{
-	struct rb_node *node, *next;
-	struct extent_node *en;
-	unsigned int count = et->count;
-
-	node = rb_first(&et->root);
-	while (node) {
-		next = rb_next(node);
-		en = rb_entry(node, struct extent_node, rb_node);
-
-		if (free_all) {
-			spin_lock(&sbi->extent_lock);
-			if (!list_empty(&en->list))
-				list_del_init(&en->list);
-			spin_unlock(&sbi->extent_lock);
-		}
-
-		if (free_all || list_empty(&en->list)) {
-			__detach_extent_node(sbi, et, en);
-			kmem_cache_free(extent_node_slab, en);
-		}
-		node = next;
-	}
-
-	return count - et->count;
-}
-
-static void __drop_largest_extent(struct inode *inode, pgoff_t fofs)
-{
-	struct extent_info *largest = &F2FS_I(inode)->extent_tree->largest;
-
-	if (largest->fofs <= fofs && largest->fofs + largest->len > fofs)
-		largest->len = 0;
-}
-
-void f2fs_init_extent_tree(struct inode *inode, struct f2fs_extent *i_ext)
-{
-	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
-	struct extent_tree *et;
-	struct extent_node *en;
-	struct extent_info ei;
-
-	if (!f2fs_may_extent_tree(inode))
-		return;
-
-	et = __grab_extent_tree(inode);
-
-	if (!i_ext || le32_to_cpu(i_ext->len) < F2FS_MIN_EXTENT_LEN)
-		return;
-
-	set_extent_info(&ei, le32_to_cpu(i_ext->fofs),
-		le32_to_cpu(i_ext->blk), le32_to_cpu(i_ext->len));
-
-	write_lock(&et->lock);
-	if (et->count)
-		goto out;
-
-	en = __insert_extent_tree(sbi, et, &ei, NULL);
-	if (en) {
-		spin_lock(&sbi->extent_lock);
-		list_add_tail(&en->list, &sbi->extent_list);
-		spin_unlock(&sbi->extent_lock);
-	}
-out:
-	write_unlock(&et->lock);
-}
-
-static bool f2fs_lookup_extent_tree(struct inode *inode, pgoff_t pgofs,
-							struct extent_info *ei)
-{
-	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
-	struct extent_tree *et = F2FS_I(inode)->extent_tree;
-	struct extent_node *en;
-	bool ret = false;
-
-	f2fs_bug_on(sbi, !et);
-
-	trace_f2fs_lookup_extent_tree_start(inode, pgofs);
-
-	read_lock(&et->lock);
-
-	if (et->largest.fofs <= pgofs &&
-			et->largest.fofs + et->largest.len > pgofs) {
-		*ei = et->largest;
-		ret = true;
-		stat_inc_read_hit(sbi->sb);
-		goto out;
-	}
-
-	en = __lookup_extent_tree(et, pgofs);
-	if (en) {
-		*ei = en->ei;
-		spin_lock(&sbi->extent_lock);
-		if (!list_empty(&en->list))
-			list_move_tail(&en->list, &sbi->extent_list);
-		et->cached_en = en;
-		spin_unlock(&sbi->extent_lock);
-		ret = true;
-		stat_inc_read_hit(sbi->sb);
-	}
-out:
-	stat_inc_total_hit(sbi->sb);
-	read_unlock(&et->lock);
-
-	trace_f2fs_lookup_extent_tree_end(inode, pgofs, ei);
-	return ret;
-}
-
-/* return true, if on-disk extent should be updated */
-static bool f2fs_update_extent_tree(struct inode *inode, pgoff_t fofs,
-							block_t blkaddr)
-{
-	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
-	struct extent_tree *et = F2FS_I(inode)->extent_tree;
-	struct extent_node *en = NULL, *en1 = NULL, *en2 = NULL, *en3 = NULL;
-	struct extent_node *den = NULL;
-	struct extent_info ei, dei, prev;
-	unsigned int endofs;
-
-	if (!et)
-		return false;
-
-	trace_f2fs_update_extent_tree(inode, fofs, blkaddr);
-
-	write_lock(&et->lock);
-
-	if (is_inode_flag_set(F2FS_I(inode), FI_NO_EXTENT)) {
-		write_unlock(&et->lock);
-		return false;
-	}
-
-	prev = et->largest;
-	dei.len = 0;
-
-	/* we do not guarantee that the largest extent is cached all the time */
-	__drop_largest_extent(inode, fofs);
-
-	/* 1. lookup and remove existing extent info in cache */
-	en = __lookup_extent_tree(et, fofs);
-	if (!en)
-		goto update_extent;
-
-	dei = en->ei;
-	__detach_extent_node(sbi, et, en);
-
-	/* 2. if extent can be split more, split and insert the left part */
-	if (dei.len > F2FS_MIN_EXTENT_LEN) {
-		/*  insert left part of split extent into cache */
-		if (fofs - dei.fofs >= F2FS_MIN_EXTENT_LEN) {
-			set_extent_info(&ei, dei.fofs, dei.blk,
-							fofs - dei.fofs);
-			en1 = __insert_extent_tree(sbi, et, &ei, NULL);
-		}
-
-		/* insert right part of split extent into cache */
-		endofs = dei.fofs + dei.len - 1;
-		if (endofs - fofs >= F2FS_MIN_EXTENT_LEN) {
-			set_extent_info(&ei, fofs + 1,
-				fofs - dei.fofs + dei.blk + 1, endofs - fofs);
-			en2 = __insert_extent_tree(sbi, et, &ei, NULL);
-		}
-	}
-
-update_extent:
-	/* 3. update extent in extent cache */
-	if (blkaddr) {
-		set_extent_info(&ei, fofs, blkaddr, 1);
-		en3 = __insert_extent_tree(sbi, et, &ei, &den);
-
-		/* give up extent_cache, if split and small updates happen */
-		if (dei.len >= 1 &&
-				prev.len < F2FS_MIN_EXTENT_LEN &&
-				et->largest.len < F2FS_MIN_EXTENT_LEN) {
-			et->largest.len = 0;
-			set_inode_flag(F2FS_I(inode), FI_NO_EXTENT);
-		}
-	}
-
-	/* 4. update in global extent list */
-	spin_lock(&sbi->extent_lock);
-	if (en && !list_empty(&en->list))
-		list_del(&en->list);
-	/*
-	 * en1 and en2 split from en, they will become more and more smaller
-	 * fragments after splitting several times. So if the length is smaller
-	 * than F2FS_MIN_EXTENT_LEN, we will not add them into extent tree.
-	 */
-	if (en1)
-		list_add_tail(&en1->list, &sbi->extent_list);
-	if (en2)
-		list_add_tail(&en2->list, &sbi->extent_list);
-	if (en3) {
-		if (list_empty(&en3->list))
-			list_add_tail(&en3->list, &sbi->extent_list);
-		else
-			list_move_tail(&en3->list, &sbi->extent_list);
-	}
-	if (den && !list_empty(&den->list))
-		list_del(&den->list);
-	spin_unlock(&sbi->extent_lock);
-
-	/* 5. release extent node */
-	if (en)
-		kmem_cache_free(extent_node_slab, en);
-	if (den)
-		kmem_cache_free(extent_node_slab, den);
-
-	if (is_inode_flag_set(F2FS_I(inode), FI_NO_EXTENT))
-		__free_extent_tree(sbi, et, true);
-
-	write_unlock(&et->lock);
-
-	return !__is_extent_same(&prev, &et->largest);
-}
-
-unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink)
-{
-	struct extent_tree *treevec[EXT_TREE_VEC_SIZE];
-	struct extent_node *en, *tmp;
-	unsigned long ino = F2FS_ROOT_INO(sbi);
-	struct radix_tree_root *root = &sbi->extent_tree_root;
-	unsigned int found;
-	unsigned int node_cnt = 0, tree_cnt = 0;
-	int remained;
-
-	if (!test_opt(sbi, EXTENT_CACHE))
-		return 0;
-
-	if (!down_write_trylock(&sbi->extent_tree_lock))
-		goto out;
-
-	/* 1. remove unreferenced extent tree */
-	while ((found = radix_tree_gang_lookup(root,
-				(void **)treevec, ino, EXT_TREE_VEC_SIZE))) {
-		unsigned i;
-
-		ino = treevec[found - 1]->ino + 1;
-		for (i = 0; i < found; i++) {
-			struct extent_tree *et = treevec[i];
-
-			if (!atomic_read(&et->refcount)) {
-				write_lock(&et->lock);
-				node_cnt += __free_extent_tree(sbi, et, true);
-				write_unlock(&et->lock);
-
-				radix_tree_delete(root, et->ino);
-				kmem_cache_free(extent_tree_slab, et);
-				sbi->total_ext_tree--;
-				tree_cnt++;
-
-				if (node_cnt + tree_cnt >= nr_shrink)
-					goto unlock_out;
-			}
-		}
-	}
-	up_write(&sbi->extent_tree_lock);
-
-	/* 2. remove LRU extent entries */
-	if (!down_write_trylock(&sbi->extent_tree_lock))
-		goto out;
-
-	remained = nr_shrink - (node_cnt + tree_cnt);
-
-	spin_lock(&sbi->extent_lock);
-	list_for_each_entry_safe(en, tmp, &sbi->extent_list, list) {
-		if (!remained--)
-			break;
-		list_del_init(&en->list);
-	}
-	spin_unlock(&sbi->extent_lock);
-
-	while ((found = radix_tree_gang_lookup(root,
-				(void **)treevec, ino, EXT_TREE_VEC_SIZE))) {
-		unsigned i;
-
-		ino = treevec[found - 1]->ino + 1;
-		for (i = 0; i < found; i++) {
-			struct extent_tree *et = treevec[i];
-
-			write_lock(&et->lock);
-			node_cnt += __free_extent_tree(sbi, et, false);
-			write_unlock(&et->lock);
-
-			if (node_cnt + tree_cnt >= nr_shrink)
-				break;
-		}
-	}
-unlock_out:
-	up_write(&sbi->extent_tree_lock);
-out:
-	trace_f2fs_shrink_extent_tree(sbi, node_cnt, tree_cnt);
-
-	return node_cnt + tree_cnt;
-}
-
-unsigned int f2fs_destroy_extent_node(struct inode *inode)
-{
-	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
-	struct extent_tree *et = F2FS_I(inode)->extent_tree;
-	unsigned int node_cnt = 0;
-
-	if (!et)
-		return 0;
-
-	write_lock(&et->lock);
-	node_cnt = __free_extent_tree(sbi, et, true);
-	write_unlock(&et->lock);
-
-	return node_cnt;
-}
-
-void f2fs_destroy_extent_tree(struct inode *inode)
-{
-	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
-	struct extent_tree *et = F2FS_I(inode)->extent_tree;
-	unsigned int node_cnt = 0;
-
-	if (!et)
-		return;
-
-	if (inode->i_nlink && !is_bad_inode(inode) && et->count) {
-		atomic_dec(&et->refcount);
-		return;
-	}
-
-	/* free all extent info belong to this extent tree */
-	node_cnt = f2fs_destroy_extent_node(inode);
-
-	/* delete extent tree entry in radix tree */
-	down_write(&sbi->extent_tree_lock);
-	atomic_dec(&et->refcount);
-	f2fs_bug_on(sbi, atomic_read(&et->refcount) || et->count);
-	radix_tree_delete(&sbi->extent_tree_root, inode->i_ino);
-	kmem_cache_free(extent_tree_slab, et);
-	sbi->total_ext_tree--;
-	up_write(&sbi->extent_tree_lock);
-
-	F2FS_I(inode)->extent_tree = NULL;
-
-	trace_f2fs_destroy_extent_tree(inode, node_cnt);
-	return;
-}
-
-static bool f2fs_lookup_extent_cache(struct inode *inode, pgoff_t pgofs,
-							struct extent_info *ei)
-{
-	if (!f2fs_may_extent_tree(inode))
-		return false;
-
-	return f2fs_lookup_extent_tree(inode, pgofs, ei);
-}
-
-void f2fs_update_extent_cache(struct dnode_of_data *dn)
-{
-	struct f2fs_inode_info *fi = F2FS_I(dn->inode);
-	pgoff_t fofs;
-
-	if (!f2fs_may_extent_tree(dn->inode))
-		return;
-
-	f2fs_bug_on(F2FS_I_SB(dn->inode), dn->data_blkaddr == NEW_ADDR);
-
-	fofs = start_bidx_of_node(ofs_of_node(dn->node_page), fi) +
-							dn->ofs_in_node;
-
-	if (f2fs_update_extent_tree(dn->inode, fofs, dn->data_blkaddr))
-		sync_inode_page(dn);
-}
-
 struct page *get_read_data_page(struct inode *inode, pgoff_t index, int rw)
 {
 	struct address_space *mapping = inode->i_mapping;
@@ -1017,7 +472,7 @@ alloc:
 		i_size_write(dn->inode, ((fofs + 1) << PAGE_CACHE_SHIFT));
 
 	/* direct IO doesn't use extent cache to maximize the performance */
-	__drop_largest_extent(dn->inode, fofs);
+	f2fs_drop_largest_extent(dn->inode, fofs);
 
 	return 0;
 }
@@ -1997,37 +1452,6 @@ static sector_t f2fs_bmap(struct address_space *mapping, sector_t block)
 	return generic_block_bmap(mapping, block, get_data_block);
 }
 
-void init_extent_cache_info(struct f2fs_sb_info *sbi)
-{
-	INIT_RADIX_TREE(&sbi->extent_tree_root, GFP_NOIO);
-	init_rwsem(&sbi->extent_tree_lock);
-	INIT_LIST_HEAD(&sbi->extent_list);
-	spin_lock_init(&sbi->extent_lock);
-	sbi->total_ext_tree = 0;
-	atomic_set(&sbi->total_ext_node, 0);
-}
-
-int __init create_extent_cache(void)
-{
-	extent_tree_slab = f2fs_kmem_cache_create("f2fs_extent_tree",
-			sizeof(struct extent_tree));
-	if (!extent_tree_slab)
-		return -ENOMEM;
-	extent_node_slab = f2fs_kmem_cache_create("f2fs_extent_node",
-			sizeof(struct extent_node));
-	if (!extent_node_slab) {
-		kmem_cache_destroy(extent_tree_slab);
-		return -ENOMEM;
-	}
-	return 0;
-}
-
-void destroy_extent_cache(void)
-{
-	kmem_cache_destroy(extent_node_slab);
-	kmem_cache_destroy(extent_tree_slab);
-}
-
 const struct address_space_operations f2fs_dblock_aops = {
 	.readpage	= f2fs_read_data_page,
 	.readpages	= f2fs_read_data_pages,
diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c
new file mode 100644
index 000000000000..5f78fc1e818a
--- /dev/null
+++ b/fs/f2fs/extent_cache.c
@@ -0,0 +1,594 @@
+/*
+ * f2fs extent cache support
+ *
+ * Copyright (c) 2015 Motorola Mobility
+ * Copyright (c) 2015 Samsung Electronics
+ * Authors: Jaegeuk Kim <jaegeuk@kernel.org>
+ *          Chao Yu <chao2.yu@samsung.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/fs.h>
+#include <linux/f2fs_fs.h>
+
+#include "f2fs.h"
+#include "node.h"
+#include <trace/events/f2fs.h>
+
+static struct kmem_cache *extent_tree_slab;
+static struct kmem_cache *extent_node_slab;
+
+static struct extent_node *__attach_extent_node(struct f2fs_sb_info *sbi,
+				struct extent_tree *et, struct extent_info *ei,
+				struct rb_node *parent, struct rb_node **p)
+{
+	struct extent_node *en;
+
+	en = kmem_cache_alloc(extent_node_slab, GFP_ATOMIC);
+	if (!en)
+		return NULL;
+
+	en->ei = *ei;
+	INIT_LIST_HEAD(&en->list);
+
+	rb_link_node(&en->rb_node, parent, p);
+	rb_insert_color(&en->rb_node, &et->root);
+	et->count++;
+	atomic_inc(&sbi->total_ext_node);
+	return en;
+}
+
+static void __detach_extent_node(struct f2fs_sb_info *sbi,
+				struct extent_tree *et, struct extent_node *en)
+{
+	rb_erase(&en->rb_node, &et->root);
+	et->count--;
+	atomic_dec(&sbi->total_ext_node);
+
+	if (et->cached_en == en)
+		et->cached_en = NULL;
+}
+
+static struct extent_tree *__grab_extent_tree(struct inode *inode)
+{
+	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+	struct extent_tree *et;
+	nid_t ino = inode->i_ino;
+
+	down_write(&sbi->extent_tree_lock);
+	et = radix_tree_lookup(&sbi->extent_tree_root, ino);
+	if (!et) {
+		et = f2fs_kmem_cache_alloc(extent_tree_slab, GFP_NOFS);
+		f2fs_radix_tree_insert(&sbi->extent_tree_root, ino, et);
+		memset(et, 0, sizeof(struct extent_tree));
+		et->ino = ino;
+		et->root = RB_ROOT;
+		et->cached_en = NULL;
+		rwlock_init(&et->lock);
+		atomic_set(&et->refcount, 0);
+		et->count = 0;
+		sbi->total_ext_tree++;
+	}
+	atomic_inc(&et->refcount);
+	up_write(&sbi->extent_tree_lock);
+
+	/* never died until evict_inode */
+	F2FS_I(inode)->extent_tree = et;
+
+	return et;
+}
+
+static struct extent_node *__lookup_extent_tree(struct extent_tree *et,
+							unsigned int fofs)
+{
+	struct rb_node *node = et->root.rb_node;
+	struct extent_node *en;
+
+	if (et->cached_en) {
+		struct extent_info *cei = &et->cached_en->ei;
+
+		if (cei->fofs <= fofs && cei->fofs + cei->len > fofs)
+			return et->cached_en;
+	}
+
+	while (node) {
+		en = rb_entry(node, struct extent_node, rb_node);
+
+		if (fofs < en->ei.fofs)
+			node = node->rb_left;
+		else if (fofs >= en->ei.fofs + en->ei.len)
+			node = node->rb_right;
+		else
+			return en;
+	}
+	return NULL;
+}
+
+static struct extent_node *__try_back_merge(struct f2fs_sb_info *sbi,
+				struct extent_tree *et, struct extent_node *en)
+{
+	struct extent_node *prev;
+	struct rb_node *node;
+
+	node = rb_prev(&en->rb_node);
+	if (!node)
+		return NULL;
+
+	prev = rb_entry(node, struct extent_node, rb_node);
+	if (__is_back_mergeable(&en->ei, &prev->ei)) {
+		en->ei.fofs = prev->ei.fofs;
+		en->ei.blk = prev->ei.blk;
+		en->ei.len += prev->ei.len;
+		__detach_extent_node(sbi, et, prev);
+		return prev;
+	}
+	return NULL;
+}
+
+static struct extent_node *__try_front_merge(struct f2fs_sb_info *sbi,
+				struct extent_tree *et, struct extent_node *en)
+{
+	struct extent_node *next;
+	struct rb_node *node;
+
+	node = rb_next(&en->rb_node);
+	if (!node)
+		return NULL;
+
+	next = rb_entry(node, struct extent_node, rb_node);
+	if (__is_front_mergeable(&en->ei, &next->ei)) {
+		en->ei.len += next->ei.len;
+		__detach_extent_node(sbi, et, next);
+		return next;
+	}
+	return NULL;
+}
+
+static struct extent_node *__insert_extent_tree(struct f2fs_sb_info *sbi,
+				struct extent_tree *et, struct extent_info *ei,
+				struct extent_node **den)
+{
+	struct rb_node **p = &et->root.rb_node;
+	struct rb_node *parent = NULL;
+	struct extent_node *en;
+
+	while (*p) {
+		parent = *p;
+		en = rb_entry(parent, struct extent_node, rb_node);
+
+		if (ei->fofs < en->ei.fofs) {
+			if (__is_front_mergeable(ei, &en->ei)) {
+				f2fs_bug_on(sbi, !den);
+				en->ei.fofs = ei->fofs;
+				en->ei.blk = ei->blk;
+				en->ei.len += ei->len;
+				*den = __try_back_merge(sbi, et, en);
+				goto update_out;
+			}
+			p = &(*p)->rb_left;
+		} else if (ei->fofs >= en->ei.fofs + en->ei.len) {
+			if (__is_back_mergeable(ei, &en->ei)) {
+				f2fs_bug_on(sbi, !den);
+				en->ei.len += ei->len;
+				*den = __try_front_merge(sbi, et, en);
+				goto update_out;
+			}
+			p = &(*p)->rb_right;
+		} else {
+			f2fs_bug_on(sbi, 1);
+		}
+	}
+
+	en = __attach_extent_node(sbi, et, ei, parent, p);
+	if (!en)
+		return NULL;
+update_out:
+	if (en->ei.len > et->largest.len)
+		et->largest = en->ei;
+	et->cached_en = en;
+	return en;
+}
+
+static unsigned int __free_extent_tree(struct f2fs_sb_info *sbi,
+					struct extent_tree *et, bool free_all)
+{
+	struct rb_node *node, *next;
+	struct extent_node *en;
+	unsigned int count = et->count;
+
+	node = rb_first(&et->root);
+	while (node) {
+		next = rb_next(node);
+		en = rb_entry(node, struct extent_node, rb_node);
+
+		if (free_all) {
+			spin_lock(&sbi->extent_lock);
+			if (!list_empty(&en->list))
+				list_del_init(&en->list);
+			spin_unlock(&sbi->extent_lock);
+		}
+
+		if (free_all || list_empty(&en->list)) {
+			__detach_extent_node(sbi, et, en);
+			kmem_cache_free(extent_node_slab, en);
+		}
+		node = next;
+	}
+
+	return count - et->count;
+}
+
+void f2fs_drop_largest_extent(struct inode *inode, pgoff_t fofs)
+{
+	struct extent_info *largest = &F2FS_I(inode)->extent_tree->largest;
+
+	if (largest->fofs <= fofs && largest->fofs + largest->len > fofs)
+		largest->len = 0;
+}
+
+void f2fs_init_extent_tree(struct inode *inode, struct f2fs_extent *i_ext)
+{
+	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+	struct extent_tree *et;
+	struct extent_node *en;
+	struct extent_info ei;
+
+	if (!f2fs_may_extent_tree(inode))
+		return;
+
+	et = __grab_extent_tree(inode);
+
+	if (!i_ext || le32_to_cpu(i_ext->len) < F2FS_MIN_EXTENT_LEN)
+		return;
+
+	set_extent_info(&ei, le32_to_cpu(i_ext->fofs),
+		le32_to_cpu(i_ext->blk), le32_to_cpu(i_ext->len));
+
+	write_lock(&et->lock);
+	if (et->count)
+		goto out;
+
+	en = __insert_extent_tree(sbi, et, &ei, NULL);
+	if (en) {
+		spin_lock(&sbi->extent_lock);
+		list_add_tail(&en->list, &sbi->extent_list);
+		spin_unlock(&sbi->extent_lock);
+	}
+out:
+	write_unlock(&et->lock);
+}
+
+static bool f2fs_lookup_extent_tree(struct inode *inode, pgoff_t pgofs,
+							struct extent_info *ei)
+{
+	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+	struct extent_tree *et = F2FS_I(inode)->extent_tree;
+	struct extent_node *en;
+	bool ret = false;
+
+	f2fs_bug_on(sbi, !et);
+
+	trace_f2fs_lookup_extent_tree_start(inode, pgofs);
+
+	read_lock(&et->lock);
+
+	if (et->largest.fofs <= pgofs &&
+			et->largest.fofs + et->largest.len > pgofs) {
+		*ei = et->largest;
+		ret = true;
+		stat_inc_read_hit(sbi->sb);
+		goto out;
+	}
+
+	en = __lookup_extent_tree(et, pgofs);
+	if (en) {
+		*ei = en->ei;
+		spin_lock(&sbi->extent_lock);
+		if (!list_empty(&en->list))
+			list_move_tail(&en->list, &sbi->extent_list);
+		et->cached_en = en;
+		spin_unlock(&sbi->extent_lock);
+		ret = true;
+		stat_inc_read_hit(sbi->sb);
+	}
+out:
+	stat_inc_total_hit(sbi->sb);
+	read_unlock(&et->lock);
+
+	trace_f2fs_lookup_extent_tree_end(inode, pgofs, ei);
+	return ret;
+}
+
+/* return true, if on-disk extent should be updated */
+static bool f2fs_update_extent_tree(struct inode *inode, pgoff_t fofs,
+							block_t blkaddr)
+{
+	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+	struct extent_tree *et = F2FS_I(inode)->extent_tree;
+	struct extent_node *en = NULL, *en1 = NULL, *en2 = NULL, *en3 = NULL;
+	struct extent_node *den = NULL;
+	struct extent_info ei, dei, prev;
+	unsigned int endofs;
+
+	if (!et)
+		return false;
+
+	trace_f2fs_update_extent_tree(inode, fofs, blkaddr);
+
+	write_lock(&et->lock);
+
+	if (is_inode_flag_set(F2FS_I(inode), FI_NO_EXTENT)) {
+		write_unlock(&et->lock);
+		return false;
+	}
+
+	prev = et->largest;
+	dei.len = 0;
+
+	/* we do not guarantee that the largest extent is cached all the time */
+	f2fs_drop_largest_extent(inode, fofs);
+
+	/* 1. lookup and remove existing extent info in cache */
+	en = __lookup_extent_tree(et, fofs);
+	if (!en)
+		goto update_extent;
+
+	dei = en->ei;
+	__detach_extent_node(sbi, et, en);
+
+	/* 2. if extent can be split more, split and insert the left part */
+	if (dei.len > F2FS_MIN_EXTENT_LEN) {
+		/*  insert left part of split extent into cache */
+		if (fofs - dei.fofs >= F2FS_MIN_EXTENT_LEN) {
+			set_extent_info(&ei, dei.fofs, dei.blk,
+							fofs - dei.fofs);
+			en1 = __insert_extent_tree(sbi, et, &ei, NULL);
+		}
+
+		/* insert right part of split extent into cache */
+		endofs = dei.fofs + dei.len - 1;
+		if (endofs - fofs >= F2FS_MIN_EXTENT_LEN) {
+			set_extent_info(&ei, fofs + 1,
+				fofs - dei.fofs + dei.blk + 1, endofs - fofs);
+			en2 = __insert_extent_tree(sbi, et, &ei, NULL);
+		}
+	}
+
+update_extent:
+	/* 3. update extent in extent cache */
+	if (blkaddr) {
+		set_extent_info(&ei, fofs, blkaddr, 1);
+		en3 = __insert_extent_tree(sbi, et, &ei, &den);
+
+		/* give up extent_cache, if split and small updates happen */
+		if (dei.len >= 1 &&
+				prev.len < F2FS_MIN_EXTENT_LEN &&
+				et->largest.len < F2FS_MIN_EXTENT_LEN) {
+			et->largest.len = 0;
+			set_inode_flag(F2FS_I(inode), FI_NO_EXTENT);
+		}
+	}
+
+	/* 4. update in global extent list */
+	spin_lock(&sbi->extent_lock);
+	if (en && !list_empty(&en->list))
+		list_del(&en->list);
+	/*
+	 * en1 and en2 are split from en; they will become smaller and smaller
+	 * fragments after splitting several times. So if the length is smaller
+	 * than F2FS_MIN_EXTENT_LEN, we will not add them into extent tree.
+	 */
+	if (en1)
+		list_add_tail(&en1->list, &sbi->extent_list);
+	if (en2)
+		list_add_tail(&en2->list, &sbi->extent_list);
+	if (en3) {
+		if (list_empty(&en3->list))
+			list_add_tail(&en3->list, &sbi->extent_list);
+		else
+			list_move_tail(&en3->list, &sbi->extent_list);
+	}
+	if (den && !list_empty(&den->list))
+		list_del(&den->list);
+	spin_unlock(&sbi->extent_lock);
+
+	/* 5. release extent node */
+	if (en)
+		kmem_cache_free(extent_node_slab, en);
+	if (den)
+		kmem_cache_free(extent_node_slab, den);
+
+	if (is_inode_flag_set(F2FS_I(inode), FI_NO_EXTENT))
+		__free_extent_tree(sbi, et, true);
+
+	write_unlock(&et->lock);
+
+	return !__is_extent_same(&prev, &et->largest);
+}
+
+unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink)
+{
+	struct extent_tree *treevec[EXT_TREE_VEC_SIZE];
+	struct extent_node *en, *tmp;
+	unsigned long ino = F2FS_ROOT_INO(sbi);
+	struct radix_tree_root *root = &sbi->extent_tree_root;
+	unsigned int found;
+	unsigned int node_cnt = 0, tree_cnt = 0;
+	int remained;
+
+	if (!test_opt(sbi, EXTENT_CACHE))
+		return 0;
+
+	if (!down_write_trylock(&sbi->extent_tree_lock))
+		goto out;
+
+	/* 1. remove unreferenced extent tree */
+	while ((found = radix_tree_gang_lookup(root,
+				(void **)treevec, ino, EXT_TREE_VEC_SIZE))) {
+		unsigned i;
+
+		ino = treevec[found - 1]->ino + 1;
+		for (i = 0; i < found; i++) {
+			struct extent_tree *et = treevec[i];
+
+			if (!atomic_read(&et->refcount)) {
+				write_lock(&et->lock);
+				node_cnt += __free_extent_tree(sbi, et, true);
+				write_unlock(&et->lock);
+
+				radix_tree_delete(root, et->ino);
+				kmem_cache_free(extent_tree_slab, et);
+				sbi->total_ext_tree--;
+				tree_cnt++;
+
+				if (node_cnt + tree_cnt >= nr_shrink)
+					goto unlock_out;
+			}
+		}
+	}
+	up_write(&sbi->extent_tree_lock);
+
+	/* 2. remove LRU extent entries */
+	if (!down_write_trylock(&sbi->extent_tree_lock))
+		goto out;
+
+	remained = nr_shrink - (node_cnt + tree_cnt);
+
+	spin_lock(&sbi->extent_lock);
+	list_for_each_entry_safe(en, tmp, &sbi->extent_list, list) {
+		if (!remained--)
+			break;
+		list_del_init(&en->list);
+	}
+	spin_unlock(&sbi->extent_lock);
+
+	while ((found = radix_tree_gang_lookup(root,
+				(void **)treevec, ino, EXT_TREE_VEC_SIZE))) {
+		unsigned i;
+
+		ino = treevec[found - 1]->ino + 1;
+		for (i = 0; i < found; i++) {
+			struct extent_tree *et = treevec[i];
+
+			write_lock(&et->lock);
+			node_cnt += __free_extent_tree(sbi, et, false);
+			write_unlock(&et->lock);
+
+			if (node_cnt + tree_cnt >= nr_shrink)
+				break;
+		}
+	}
+unlock_out:
+	up_write(&sbi->extent_tree_lock);
+out:
+	trace_f2fs_shrink_extent_tree(sbi, node_cnt, tree_cnt);
+
+	return node_cnt + tree_cnt;
+}
+
+unsigned int f2fs_destroy_extent_node(struct inode *inode)
+{
+	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+	struct extent_tree *et = F2FS_I(inode)->extent_tree;
+	unsigned int node_cnt = 0;
+
+	if (!et)
+		return 0;
+
+	write_lock(&et->lock);
+	node_cnt = __free_extent_tree(sbi, et, true);
+	write_unlock(&et->lock);
+
+	return node_cnt;
+}
+
+void f2fs_destroy_extent_tree(struct inode *inode)
+{
+	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+	struct extent_tree *et = F2FS_I(inode)->extent_tree;
+	unsigned int node_cnt = 0;
+
+	if (!et)
+		return;
+
+	if (inode->i_nlink && !is_bad_inode(inode) && et->count) {
+		atomic_dec(&et->refcount);
+		return;
+	}
+
+	/* free all extent info belong to this extent tree */
+	node_cnt = f2fs_destroy_extent_node(inode);
+
+	/* delete extent tree entry in radix tree */
+	down_write(&sbi->extent_tree_lock);
+	atomic_dec(&et->refcount);
+	f2fs_bug_on(sbi, atomic_read(&et->refcount) || et->count);
+	radix_tree_delete(&sbi->extent_tree_root, inode->i_ino);
+	kmem_cache_free(extent_tree_slab, et);
+	sbi->total_ext_tree--;
+	up_write(&sbi->extent_tree_lock);
+
+	F2FS_I(inode)->extent_tree = NULL;
+
+	trace_f2fs_destroy_extent_tree(inode, node_cnt);
+}
+
+bool f2fs_lookup_extent_cache(struct inode *inode, pgoff_t pgofs,
+					struct extent_info *ei)
+{
+	if (!f2fs_may_extent_tree(inode))
+		return false;
+
+	return f2fs_lookup_extent_tree(inode, pgofs, ei);
+}
+
+void f2fs_update_extent_cache(struct dnode_of_data *dn)
+{
+	struct f2fs_inode_info *fi = F2FS_I(dn->inode);
+	pgoff_t fofs;
+
+	if (!f2fs_may_extent_tree(dn->inode))
+		return;
+
+	f2fs_bug_on(F2FS_I_SB(dn->inode), dn->data_blkaddr == NEW_ADDR);
+
+	fofs = start_bidx_of_node(ofs_of_node(dn->node_page), fi) +
+							dn->ofs_in_node;
+
+	if (f2fs_update_extent_tree(dn->inode, fofs, dn->data_blkaddr))
+		sync_inode_page(dn);
+}
+
+void init_extent_cache_info(struct f2fs_sb_info *sbi)
+{
+	INIT_RADIX_TREE(&sbi->extent_tree_root, GFP_NOIO);
+	init_rwsem(&sbi->extent_tree_lock);
+	INIT_LIST_HEAD(&sbi->extent_list);
+	spin_lock_init(&sbi->extent_lock);
+	sbi->total_ext_tree = 0;
+	atomic_set(&sbi->total_ext_node, 0);
+}
+
+int __init create_extent_cache(void)
+{
+	extent_tree_slab = f2fs_kmem_cache_create("f2fs_extent_tree",
+			sizeof(struct extent_tree));
+	if (!extent_tree_slab)
+		return -ENOMEM;
+	extent_node_slab = f2fs_kmem_cache_create("f2fs_extent_node",
+			sizeof(struct extent_node));
+	if (!extent_node_slab) {
+		kmem_cache_destroy(extent_tree_slab);
+		return -ENOMEM;
+	}
+	return 0;
+}
+
+void destroy_extent_cache(void)
+{
+	kmem_cache_destroy(extent_node_slab);
+	kmem_cache_destroy(extent_tree_slab);
+}
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 1e6f54d8b464..88b05cba3d4a 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -1766,20 +1766,12 @@ void f2fs_submit_page_mbio(struct f2fs_io_info *);
 void set_data_blkaddr(struct dnode_of_data *);
 int reserve_new_block(struct dnode_of_data *);
 int f2fs_reserve_block(struct dnode_of_data *, pgoff_t);
-unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *, int);
-void f2fs_init_extent_tree(struct inode *, struct f2fs_extent *);
-unsigned int f2fs_destroy_extent_node(struct inode *);
-void f2fs_destroy_extent_tree(struct inode *);
-void f2fs_update_extent_cache(struct dnode_of_data *);
 struct page *get_read_data_page(struct inode *, pgoff_t, int);
 struct page *find_data_page(struct inode *, pgoff_t);
 struct page *get_lock_data_page(struct inode *, pgoff_t);
 struct page *get_new_data_page(struct inode *, struct page *, pgoff_t, bool);
 int do_write_data_page(struct f2fs_io_info *);
 int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *, u64, u64);
-void init_extent_cache_info(struct f2fs_sb_info *);
-int __init create_extent_cache(void);
-void destroy_extent_cache(void);
 void f2fs_invalidate_page(struct page *, unsigned int, unsigned int);
 int f2fs_release_page(struct page *, gfp_t);
 
@@ -1976,6 +1968,20 @@ unsigned long f2fs_shrink_scan(struct shrinker *, struct shrink_control *);
 void f2fs_join_shrinker(struct f2fs_sb_info *);
 void f2fs_leave_shrinker(struct f2fs_sb_info *);
 
+/*
+ * extent_cache.c
+ */
+unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *, int);
+void f2fs_drop_largest_extent(struct inode *, pgoff_t);
+void f2fs_init_extent_tree(struct inode *, struct f2fs_extent *);
+unsigned int f2fs_destroy_extent_node(struct inode *);
+void f2fs_destroy_extent_tree(struct inode *);
+bool f2fs_lookup_extent_cache(struct inode *, pgoff_t, struct extent_info *);
+void f2fs_update_extent_cache(struct dnode_of_data *);
+void init_extent_cache_info(struct f2fs_sb_info *);
+int __init create_extent_cache(void);
+void destroy_extent_cache(void);
+
 /*
  * crypto support
  */
-- 
cgit v1.2.3


From c1c1b58359d45e1a9f236ce5a40d50720c07c70e Mon Sep 17 00:00:00 2001
From: Chao Yu <chao2.yu@samsung.com>
Date: Fri, 10 Jul 2015 18:08:10 +0800
Subject: f2fs: add new ioctl F2FS_IOC_GARBAGE_COLLECT

When background gc is off, the only way to trigger gc is the forced gc
executed by operations that need to grab space on disk.

The condition for executing it is limited: forced gc runs only when
there are almost no free sections left for LFS allocation. This is not
reasonable for users who want to control the triggering of gc
themselves.

This patch introduces F2FS_IOC_GARBAGE_COLLECT interface for
triggering garbage collection by using ioctl. It provides our users
one more option to trigger gc.

Signed-off-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/f2fs.h |  1 +
 fs/f2fs/file.c | 32 ++++++++++++++++++++++++++++++++
 fs/f2fs/gc.h   |  6 ++++++
 3 files changed, 39 insertions(+)

(limited to 'fs/f2fs')

diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 88b05cba3d4a..673623b36901 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -228,6 +228,7 @@ static inline bool __has_cursum_space(struct f2fs_summary_block *sum, int size,
 #define F2FS_IOC_START_VOLATILE_WRITE	_IO(F2FS_IOCTL_MAGIC, 3)
 #define F2FS_IOC_RELEASE_VOLATILE_WRITE	_IO(F2FS_IOCTL_MAGIC, 4)
 #define F2FS_IOC_ABORT_VOLATILE_WRITE	_IO(F2FS_IOCTL_MAGIC, 5)
+#define F2FS_IOC_GARBAGE_COLLECT	_IO(F2FS_IOCTL_MAGIC, 6)
 
 #define F2FS_IOC_SET_ENCRYPTION_POLICY					\
 		_IOR('f', 19, struct f2fs_encryption_policy)
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index fe8398f1d627..dcc01137fca0 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -27,6 +27,7 @@
 #include "segment.h"
 #include "xattr.h"
 #include "acl.h"
+#include "gc.h"
 #include "trace.h"
 #include <trace/events/f2fs.h>
 
@@ -1558,6 +1559,35 @@ got_it:
 	return 0;
 }
 
+static int f2fs_ioc_gc(struct file *filp, unsigned long arg)
+{
+	struct inode *inode = file_inode(filp);
+	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+	__u32 i, count;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if (get_user(count, (__u32 __user *)arg))
+		return -EFAULT;
+
+	if (!count || count > F2FS_BATCH_GC_MAX_NUM)
+		return -EINVAL;
+
+	for (i = 0; i < count; i++) {
+		if (!mutex_trylock(&sbi->gc_mutex))
+			break;
+
+		if (f2fs_gc(sbi))
+			break;
+	}
+
+	if (put_user(i, (__u32 __user *)arg))
+		return -EFAULT;
+
+	return 0;
+}
+
 long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 {
 	switch (cmd) {
@@ -1587,6 +1617,8 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 		return f2fs_ioc_get_encryption_policy(filp, arg);
 	case F2FS_IOC_GET_ENCRYPTION_PWSALT:
 		return f2fs_ioc_get_encryption_pwsalt(filp, arg);
+	case F2FS_IOC_GARBAGE_COLLECT:
+		return f2fs_ioc_gc(filp, arg);
 	default:
 		return -ENOTTY;
 	}
diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h
index b4a65be9f7d3..c5a055b3376e 100644
--- a/fs/f2fs/gc.h
+++ b/fs/f2fs/gc.h
@@ -19,6 +19,12 @@
 #define LIMIT_INVALID_BLOCK	40 /* percentage over total user space */
 #define LIMIT_FREE_BLOCK	40 /* percentage over invalid + free space */
 
+/*
+ * with this macro, we can control the max time we do garbage collection,
+ * when user triggers batch mode gc by ioctl.
+ */
+#define F2FS_BATCH_GC_MAX_NUM		16
+
 /* Search max. number of dirty segments to select a victim segment */
 #define DEF_MAX_VICTIM_SEARCH 4096 /* covers 8GB */
 
-- 
cgit v1.2.3


From 5b3391244d1c89bb4c8e1b4e4916fb4965fb71f9 Mon Sep 17 00:00:00 2001
From: Chao Yu <chao2.yu@samsung.com>
Date: Mon, 13 Jul 2015 17:43:19 +0800
Subject: f2fs: warm up cold page after mmaped write

With the cost-benefit method, background gc considers old sections with
fewer valid blocks as candidate victims; the old blocks in such a section
are treated as cold data and are later moved into a cold segment.

But if the page being gced is attached by the user through a buffered or
mmaped write, we should reset the page to a non-cold one, because this
page is likely to be updated again.

So add the clearing code for the missed 'mmap' case.

Signed-off-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/file.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs/f2fs')

diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index dcc01137fca0..9c40f8cfb77c 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -86,6 +86,8 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma,
 mapped:
 	/* fill the page */
 	f2fs_wait_on_page_writeback(page, DATA);
+	/* if gced page is attached, don't write to cold segment */
+	clear_cold_data(page);
 out:
 	sb_end_pagefault(inode->i_sb);
 	return block_page_mkwrite_return(err);
-- 
cgit v1.2.3


From bd936f840779366b61300c0f4f752dd1b52b1ca3 Mon Sep 17 00:00:00 2001
From: Chao Yu <chao2.yu@samsung.com>
Date: Mon, 13 Jul 2015 17:44:25 +0800
Subject: f2fs: cleanup write_orphan_inodes

Since 'commit 4531929e3922 ("f2fs: move grabing orphan
pages out of protection region")' was committed, write_orphan_inodes()
has grabbed all meta pages in a batch before using them under the
spinlock, so that we avoid the long delay of grabbing meta pages while
holding the spinlock.

Now that 'commit d6c67a4fee86 ("f2fs: revmove spin_lock for
write_orphan_inodes")' has removed the spinlock from write_orphan_inodes(),
the issue described above no longer exists, so move the grab operation
back to its original place for readability.

Signed-off-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/checkpoint.c | 11 ++---------
 1 file changed, 2 insertions(+), 9 deletions(-)

(limited to 'fs/f2fs')

diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index de7a0d6a371a..60327027137f 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -504,7 +504,7 @@ static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t start_blk)
 	struct list_head *head;
 	struct f2fs_orphan_block *orphan_blk = NULL;
 	unsigned int nentries = 0;
-	unsigned short index;
+	unsigned short index = 1;
 	unsigned short orphan_blocks;
 	struct page *page = NULL;
 	struct ino_entry *orphan = NULL;
@@ -512,11 +512,6 @@ static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t start_blk)
 
 	orphan_blocks = GET_ORPHAN_BLOCKS(im->ino_num);
 
-	for (index = 0; index < orphan_blocks; index++)
-		grab_meta_page(sbi, start_blk + index);
-
-	index = 1;
-
 	/*
 	 * we don't need to do spin_lock(&im->ino_lock) here, since all the
 	 * orphan inode operations are covered under f2fs_lock_op().
@@ -527,12 +522,10 @@ static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t start_blk)
 	/* loop for each orphan inode entry and write them in Jornal block */
 	list_for_each_entry(orphan, head, list) {
 		if (!page) {
-			page = find_get_page(META_MAPPING(sbi), start_blk++);
-			f2fs_bug_on(sbi, !page);
+			page = grab_meta_page(sbi, start_blk++);
 			orphan_blk =
 				(struct f2fs_orphan_block *)page_address(page);
 			memset(orphan_blk, 0, sizeof(*orphan_blk));
-			f2fs_put_page(page, 0);
 		}
 
 		orphan_blk->ino[nentries++] = cpu_to_le32(orphan->ino);
-- 
cgit v1.2.3


From 037fe70c9a6cebe11ae13402994b844e907ebe0c Mon Sep 17 00:00:00 2001
From: Chao Yu <chao2.yu@samsung.com>
Date: Mon, 13 Jul 2015 17:45:19 +0800
Subject: f2fs: correct return value of ->setxattr

This patch fixes ->setxattr to return the correct error number; the
mismatch is reported by xfstest tests/generic/026 as below:

generic/026      - output mismatch
    --- tests/generic/026.out
    +++ results/generic/026.out.bad
    @@ -4,6 +4,6 @@
     1 below acl max
     acl max
     1 above acl max
    -chacl: cannot set access acl on "largeaclfile": Argument list too long
    +chacl: cannot set access acl on "largeaclfile": Numerical result out of range
     use 16 aces
     use 17 aces
    ...
Ran: generic/026
Failures: generic/026
Failed 1 of 1 tests

Signed-off-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/xattr.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'fs/f2fs')

diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c
index 07449b980acb..4de2286c0e4d 100644
--- a/fs/f2fs/xattr.c
+++ b/fs/f2fs/xattr.c
@@ -499,9 +499,12 @@ static int __f2fs_setxattr(struct inode *inode, int index,
 
 	len = strlen(name);
 
-	if (len > F2FS_NAME_LEN || size > MAX_VALUE_LEN(inode))
+	if (len > F2FS_NAME_LEN)
 		return -ERANGE;
 
+	if (size > MAX_VALUE_LEN(inode))
+		return -E2BIG;
+
 	base_addr = read_all_xattrs(inode, ipage);
 	if (!base_addr)
 		goto exit;
-- 
cgit v1.2.3


From 8f46dcaea8d9d1552f4071f1ddeeca4427c1d83a Mon Sep 17 00:00:00 2001
From: Chao Yu <chao2.yu@samsung.com>
Date: Tue, 14 Jul 2015 18:56:10 +0800
Subject: f2fs: expose f2fs_write_cache_pages

If there are gced dirty pages and normal dirty pages in the mapping
of one inode, we might write them back alternately with discontinuous
block addresses, resulting in low performance.

This patch introduces f2fs_write_cache_pages with codes copied from
write_cache_pages in mm/page-writeback.c.

In this function, we refactor flow with two steps:
1) writeback all cold type pages.
2) writeback all non-cold type pages.

By using this method, f2fs writes back dirty pages of the same
temperature in batches. This gives the written-out blocks more
contiguous addresses, so they can be merged as much as possible
in the f2fs bio cache, and it also reduces the chance of submitting
small IOs from the block layer.

Test environment: 8g nokia sd card (very old sd card, but it shows
better effect when testing with this patch, and with a 32g kingston
sd card, I didn't see much more improvement).

Test step:
1. touch testfile;
2. truncate -s 512K testfile;
3. write all pages with odd index;
4. trigger gc by ioctl;
5. write all pages with even index;
6. time fsync testfile.

before:
real	0m0.402s
user	0m0.000s
sys	0m0.000s

after:
real	0m0.143s
user	0m0.004s
sys	0m0.004s

Signed-off-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/data.c | 136 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 135 insertions(+), 1 deletion(-)

(limited to 'fs/f2fs')

diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index ef30b59756c6..e58562e70da0 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -14,6 +14,7 @@
 #include <linux/mpage.h>
 #include <linux/writeback.h>
 #include <linux/backing-dev.h>
+#include <linux/pagevec.h>
 #include <linux/blkdev.h>
 #include <linux/bio.h>
 #include <linux/prefetch.h>
@@ -1127,6 +1128,139 @@ static int __f2fs_writepage(struct page *page, struct writeback_control *wbc,
 	return ret;
 }
 
+/*
+ * This function was copied from write_cache_pages in mm/page-writeback.c.
+ * The major change is making write step of cold data page separately from
+ * warm/hot data page.
+ */
+static int f2fs_write_cache_pages(struct address_space *mapping,
+			struct writeback_control *wbc, writepage_t writepage,
+			void *data)
+{
+	int ret = 0;
+	int done = 0;
+	struct pagevec pvec;
+	int nr_pages;
+	pgoff_t uninitialized_var(writeback_index);
+	pgoff_t index;
+	pgoff_t end;		/* Inclusive */
+	pgoff_t done_index;
+	int cycled;
+	int range_whole = 0;
+	int tag;
+	int step = 0;
+
+	pagevec_init(&pvec, 0);
+next:
+	if (wbc->range_cyclic) {
+		writeback_index = mapping->writeback_index; /* prev offset */
+		index = writeback_index;
+		if (index == 0)
+			cycled = 1;
+		else
+			cycled = 0;
+		end = -1;
+	} else {
+		index = wbc->range_start >> PAGE_CACHE_SHIFT;
+		end = wbc->range_end >> PAGE_CACHE_SHIFT;
+		if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
+			range_whole = 1;
+		cycled = 1; /* ignore range_cyclic tests */
+	}
+	if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
+		tag = PAGECACHE_TAG_TOWRITE;
+	else
+		tag = PAGECACHE_TAG_DIRTY;
+retry:
+	if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
+		tag_pages_for_writeback(mapping, index, end);
+	done_index = index;
+	while (!done && (index <= end)) {
+		int i;
+
+		nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
+			      min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1);
+		if (nr_pages == 0)
+			break;
+
+		for (i = 0; i < nr_pages; i++) {
+			struct page *page = pvec.pages[i];
+
+			if (page->index > end) {
+				done = 1;
+				break;
+			}
+
+			done_index = page->index;
+
+			lock_page(page);
+
+			if (unlikely(page->mapping != mapping)) {
+continue_unlock:
+				unlock_page(page);
+				continue;
+			}
+
+			if (!PageDirty(page)) {
+				/* someone wrote it for us */
+				goto continue_unlock;
+			}
+
+			if (step == 0 && !is_cold_data(page))
+				goto continue_unlock;
+			if (step == 1 && is_cold_data(page))
+				goto continue_unlock;
+
+			if (PageWriteback(page)) {
+				if (wbc->sync_mode != WB_SYNC_NONE)
+					f2fs_wait_on_page_writeback(page, DATA);
+				else
+					goto continue_unlock;
+			}
+
+			BUG_ON(PageWriteback(page));
+			if (!clear_page_dirty_for_io(page))
+				goto continue_unlock;
+
+			ret = (*writepage)(page, wbc, data);
+			if (unlikely(ret)) {
+				if (ret == AOP_WRITEPAGE_ACTIVATE) {
+					unlock_page(page);
+					ret = 0;
+				} else {
+					done_index = page->index + 1;
+					done = 1;
+					break;
+				}
+			}
+
+			if (--wbc->nr_to_write <= 0 &&
+			    wbc->sync_mode == WB_SYNC_NONE) {
+				done = 1;
+				break;
+			}
+		}
+		pagevec_release(&pvec);
+		cond_resched();
+	}
+
+	if (step < 1) {
+		step++;
+		goto next;
+	}
+
+	if (!cycled && !done) {
+		cycled = 1;
+		index = 0;
+		end = writeback_index - 1;
+		goto retry;
+	}
+	if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
+		mapping->writeback_index = done_index;
+
+	return ret;
+}
+
 static int f2fs_write_data_pages(struct address_space *mapping,
 			    struct writeback_control *wbc)
 {
@@ -1157,7 +1291,7 @@ static int f2fs_write_data_pages(struct address_space *mapping,
 		mutex_lock(&sbi->writepages);
 		locked = true;
 	}
-	ret = write_cache_pages(mapping, wbc, __f2fs_writepage, mapping);
+	ret = f2fs_write_cache_pages(mapping, wbc, __f2fs_writepage, mapping);
 	f2fs_submit_merged_bio(sbi, DATA, WRITE);
 	if (locked)
 		mutex_unlock(&sbi->writepages);
-- 
cgit v1.2.3


From 1b77c416e7dfe317277057c32baa67ea9e486ae7 Mon Sep 17 00:00:00 2001
From: Jaegeuk Kim <jaegeuk@kernel.org>
Date: Mon, 13 Jul 2015 18:31:24 -0700
Subject: f2fs: use a page temporarily for encrypted gced page

That encrypted page is used temporarily, so we don't need to mark it accessed.

Reviewed-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/gc.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'fs/f2fs')

diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index 2701e05af991..fcb263af58b3 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -552,7 +552,10 @@ static void move_encrypted_block(struct inode *inode, block_t bidx)
 	fio.page = page;
 	fio.blk_addr = dn.data_blkaddr;
 
-	fio.encrypted_page = grab_cache_page(META_MAPPING(fio.sbi), fio.blk_addr);
+	fio.encrypted_page = pagecache_get_page(META_MAPPING(fio.sbi),
+					fio.blk_addr,
+					FGP_LOCK|FGP_CREAT,
+					GFP_NOFS);
 	if (!fio.encrypted_page)
 		goto put_out;
 
-- 
cgit v1.2.3


From d5e8f6c9800c382cc55d8df801775d51311f8f21 Mon Sep 17 00:00:00 2001
From: Chao Yu <chao2.yu@samsung.com>
Date: Wed, 15 Jul 2015 17:28:53 +0800
Subject: f2fs: stat inline xattr inode number

This patch adds a stat counting the number of inline xattr inodes, to be
shown in debugfs.

Signed-off-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/debug.c |  4 ++++
 fs/f2fs/f2fs.h  | 16 +++++++++++++++-
 fs/f2fs/inode.c |  2 ++
 fs/f2fs/namei.c |  1 +
 4 files changed, 22 insertions(+), 1 deletion(-)

(limited to 'fs/f2fs')

diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c
index 75176e0dd6c8..2aeaf4e214db 100644
--- a/fs/f2fs/debug.c
+++ b/fs/f2fs/debug.c
@@ -49,6 +49,7 @@ static void update_general_status(struct f2fs_sb_info *sbi)
 	si->valid_count = valid_user_blocks(sbi);
 	si->valid_node_count = valid_node_count(sbi);
 	si->valid_inode_count = valid_inode_count(sbi);
+	si->inline_xattr = atomic_read(&sbi->inline_xattr);
 	si->inline_inode = atomic_read(&sbi->inline_inode);
 	si->inline_dir = atomic_read(&sbi->inline_dir);
 	si->utilization = utilization(sbi);
@@ -226,6 +227,8 @@ static int stat_show(struct seq_file *s, void *v)
 		seq_printf(s, "Other: %u)\n  - Data: %u\n",
 			   si->valid_node_count - si->valid_inode_count,
 			   si->valid_count - si->valid_node_count);
+		seq_printf(s, "  - Inline_xattr Inode: %u\n",
+			   si->inline_xattr);
 		seq_printf(s, "  - Inline_data Inode: %u\n",
 			   si->inline_inode);
 		seq_printf(s, "  - Inline_dentry Inode: %u\n",
@@ -366,6 +369,7 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi)
 	si->sbi = sbi;
 	sbi->stat_info = si;
 
+	atomic_set(&sbi->inline_xattr, 0);
 	atomic_set(&sbi->inline_inode, 0);
 	atomic_set(&sbi->inline_dir, 0);
 	atomic_set(&sbi->inplace_count, 0);
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 673623b36901..b18b85267711 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -782,6 +782,7 @@ struct f2fs_sb_info {
 	unsigned int block_count[2];		/* # of allocated blocks */
 	atomic_t inplace_count;		/* # of inplace update */
 	int total_hit_ext, read_hit_ext;	/* extent cache hit ratio */
+	atomic_t inline_xattr;			/* # of inline_xattr inodes */
 	atomic_t inline_inode;			/* # of inline_data inodes */
 	atomic_t inline_dir;			/* # of inline_dentry inodes */
 	int bg_gc;				/* background gc calls */
@@ -1804,7 +1805,8 @@ struct f2fs_stat_info {
 	int ndirty_node, ndirty_dent, ndirty_dirs, ndirty_meta;
 	int nats, dirty_nats, sits, dirty_sits, fnids;
 	int total_count, utilization;
-	int bg_gc, inline_inode, inline_dir, inmem_pages, wb_pages;
+	int bg_gc, inmem_pages, wb_pages;
+	int inline_xattr, inline_inode, inline_dir;
 	unsigned int valid_count, valid_node_count, valid_inode_count;
 	unsigned int bimodal, avg_vblocks;
 	int util_free, util_valid, util_invalid;
@@ -1837,6 +1839,16 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi)
 #define stat_dec_dirty_dir(sbi)		((sbi)->n_dirty_dirs--)
 #define stat_inc_total_hit(sb)		((F2FS_SB(sb))->total_hit_ext++)
 #define stat_inc_read_hit(sb)		((F2FS_SB(sb))->read_hit_ext++)
+#define stat_inc_inline_xattr(inode)					\
+	do {								\
+		if (f2fs_has_inline_xattr(inode))			\
+			(atomic_inc(&F2FS_I_SB(inode)->inline_xattr));	\
+	} while (0)
+#define stat_dec_inline_xattr(inode)					\
+	do {								\
+		if (f2fs_has_inline_xattr(inode))			\
+			(atomic_dec(&F2FS_I_SB(inode)->inline_xattr));	\
+	} while (0)
 #define stat_inc_inline_inode(inode)					\
 	do {								\
 		if (f2fs_has_inline_data(inode))			\
@@ -1907,6 +1919,8 @@ void f2fs_destroy_root_stats(void);
 #define stat_dec_dirty_dir(sbi)
 #define stat_inc_total_hit(sb)
 #define stat_inc_read_hit(sb)
+#define stat_inc_inline_xattr(inode)
+#define stat_dec_inline_xattr(inode)
 #define stat_inc_inline_inode(inode)
 #define stat_dec_inline_inode(inode)
 #define stat_inc_inline_dir(inode)
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index 978a7261a791..5b7547f0bdea 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -155,6 +155,7 @@ static int do_read_inode(struct inode *inode)
 
 	f2fs_put_page(node_page, 1);
 
+	stat_inc_inline_xattr(inode);
 	stat_inc_inline_inode(inode);
 	stat_inc_inline_dir(inode);
 
@@ -350,6 +351,7 @@ void f2fs_evict_inode(struct inode *inode)
 
 	sb_end_intwrite(inode->i_sb);
 no_delete:
+	stat_dec_inline_xattr(inode);
 	stat_dec_inline_dir(inode);
 	stat_dec_inline_inode(inode);
 
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index 1856d5ecd809..97e97c41b979 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -67,6 +67,7 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode)
 
 	f2fs_init_extent_tree(inode, NULL);
 
+	stat_inc_inline_xattr(inode);
 	stat_inc_inline_inode(inode);
 	stat_inc_inline_dir(inode);
 
-- 
cgit v1.2.3


From 727edac572034557d207b293a47de25145e3d58c Mon Sep 17 00:00:00 2001
From: Chao Yu <chao2.yu@samsung.com>
Date: Wed, 15 Jul 2015 17:29:49 +0800
Subject: f2fs: use atomic_t to record hit ratio info of extent cache

The variables recording extent cache hit ratio info were updated without
protection; this patch changes them to atomic_t type for more accurate
stats.

Signed-off-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/debug.c        | 7 +++++--
 fs/f2fs/extent_cache.c | 6 +++---
 fs/f2fs/f2fs.h         | 7 ++++---
 3 files changed, 12 insertions(+), 8 deletions(-)

(limited to 'fs/f2fs')

diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c
index 2aeaf4e214db..bc215fd6c402 100644
--- a/fs/f2fs/debug.c
+++ b/fs/f2fs/debug.c
@@ -33,8 +33,8 @@ static void update_general_status(struct f2fs_sb_info *sbi)
 	int i;
 
 	/* validation check of the segment numbers */
-	si->hit_ext = sbi->read_hit_ext;
-	si->total_ext = sbi->total_hit_ext;
+	si->hit_ext = atomic_read(&sbi->read_hit_ext);
+	si->total_ext = atomic_read(&sbi->total_hit_ext);
 	si->ext_tree = sbi->total_ext_tree;
 	si->ext_node = atomic_read(&sbi->total_ext_node);
 	si->ndirty_node = get_pages(sbi, F2FS_DIRTY_NODES);
@@ -369,6 +369,9 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi)
 	si->sbi = sbi;
 	sbi->stat_info = si;
 
+	atomic_set(&sbi->total_hit_ext, 0);
+	atomic_set(&sbi->read_hit_ext, 0);
+
 	atomic_set(&sbi->inline_xattr, 0);
 	atomic_set(&sbi->inline_inode, 0);
 	atomic_set(&sbi->inline_dir, 0);
diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c
index 5f78fc1e818a..362df8cd54d4 100644
--- a/fs/f2fs/extent_cache.c
+++ b/fs/f2fs/extent_cache.c
@@ -279,7 +279,7 @@ static bool f2fs_lookup_extent_tree(struct inode *inode, pgoff_t pgofs,
 			et->largest.fofs + et->largest.len > pgofs) {
 		*ei = et->largest;
 		ret = true;
-		stat_inc_read_hit(sbi->sb);
+		stat_inc_read_hit(sbi);
 		goto out;
 	}
 
@@ -292,10 +292,10 @@ static bool f2fs_lookup_extent_tree(struct inode *inode, pgoff_t pgofs,
 		et->cached_en = en;
 		spin_unlock(&sbi->extent_lock);
 		ret = true;
-		stat_inc_read_hit(sbi->sb);
+		stat_inc_read_hit(sbi);
 	}
 out:
-	stat_inc_total_hit(sbi->sb);
+	stat_inc_total_hit(sbi);
 	read_unlock(&et->lock);
 
 	trace_f2fs_lookup_extent_tree_end(inode, pgofs, ei);
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index b18b85267711..38ba525c3d6f 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -781,7 +781,8 @@ struct f2fs_sb_info {
 	unsigned int segment_count[2];		/* # of allocated segments */
 	unsigned int block_count[2];		/* # of allocated blocks */
 	atomic_t inplace_count;		/* # of inplace update */
-	int total_hit_ext, read_hit_ext;	/* extent cache hit ratio */
+	atomic_t total_hit_ext;			/* # of lookup extent cache */
+	atomic_t read_hit_ext;			/* # of hit extent cache */
 	atomic_t inline_xattr;			/* # of inline_xattr inodes */
 	atomic_t inline_inode;			/* # of inline_data inodes */
 	atomic_t inline_dir;			/* # of inline_dentry inodes */
@@ -1837,8 +1838,8 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi)
 #define stat_inc_bggc_count(sbi)	((sbi)->bg_gc++)
 #define stat_inc_dirty_dir(sbi)		((sbi)->n_dirty_dirs++)
 #define stat_dec_dirty_dir(sbi)		((sbi)->n_dirty_dirs--)
-#define stat_inc_total_hit(sb)		((F2FS_SB(sb))->total_hit_ext++)
-#define stat_inc_read_hit(sb)		((F2FS_SB(sb))->read_hit_ext++)
+#define stat_inc_total_hit(sbi)		(atomic_inc(&(sbi)->total_hit_ext))
+#define stat_inc_read_hit(sbi)		(atomic_inc(&(sbi)->read_hit_ext))
 #define stat_inc_inline_xattr(inode)					\
 	do {								\
 		if (f2fs_has_inline_xattr(inode))			\
-- 
cgit v1.2.3


From 86531d6b84bc096d5d9dbc23333df0ab8d347763 Mon Sep 17 00:00:00 2001
From: Jaegeuk Kim <jaegeuk@kernel.org>
Date: Wed, 15 Jul 2015 13:08:21 -0700
Subject: f2fs: callers take care of the page from bio error

This patch makes the caller handle the page after its bio gets an error.

Reviewed-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/checkpoint.c |  4 +++-
 fs/f2fs/data.c       | 27 +++++++++++++--------------
 fs/f2fs/node.c       | 21 ++++++++++-----------
 3 files changed, 26 insertions(+), 26 deletions(-)

(limited to 'fs/f2fs')

diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index 60327027137f..6fb696da42e8 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -69,8 +69,10 @@ repeat:
 
 	fio.page = page;
 
-	if (f2fs_submit_page_bio(&fio))
+	if (f2fs_submit_page_bio(&fio)) {
+		f2fs_put_page(page, 1);
 		goto repeat;
+	}
 
 	lock_page(page);
 	if (unlikely(page->mapping != mapping)) {
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index e58562e70da0..7f51296fbbf6 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -156,7 +156,6 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio)
 
 	if (bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) < PAGE_CACHE_SIZE) {
 		bio_put(bio);
-		f2fs_put_page(page, 1);
 		return -EFAULT;
 	}
 
@@ -292,15 +291,13 @@ struct page *get_read_data_page(struct inode *inode, pgoff_t index, int rw)
 
 	set_new_dnode(&dn, inode, NULL, NULL, 0);
 	err = get_dnode_of_data(&dn, index, LOOKUP_NODE);
-	if (err) {
-		f2fs_put_page(page, 1);
-		return ERR_PTR(err);
-	}
+	if (err)
+		goto put_err;
 	f2fs_put_dnode(&dn);
 
 	if (unlikely(dn.data_blkaddr == NULL_ADDR)) {
-		f2fs_put_page(page, 1);
-		return ERR_PTR(-ENOENT);
+		err = -ENOENT;
+		goto put_err;
 	}
 got_it:
 	if (PageUptodate(page)) {
@@ -325,8 +322,12 @@ got_it:
 	fio.page = page;
 	err = f2fs_submit_page_bio(&fio);
 	if (err)
-		return ERR_PTR(err);
+		goto put_err;
 	return page;
+
+put_err:
+	f2fs_put_page(page, 1);
+	return ERR_PTR(err);
 }
 
 struct page *find_data_page(struct inode *inode, pgoff_t index)
@@ -1322,7 +1323,8 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping,
 {
 	struct inode *inode = mapping->host;
 	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
-	struct page *page, *ipage;
+	struct page *page = NULL;
+	struct page *ipage;
 	pgoff_t index = ((unsigned long long) pos) >> PAGE_CACHE_SHIFT;
 	struct dnode_of_data dn;
 	int err = 0;
@@ -1412,7 +1414,6 @@ put_next:
 
 		lock_page(page);
 		if (unlikely(!PageUptodate(page))) {
-			f2fs_put_page(page, 1);
 			err = -EIO;
 			goto fail;
 		}
@@ -1424,10 +1425,8 @@ put_next:
 		/* avoid symlink page */
 		if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) {
 			err = f2fs_decrypt_one(inode, page);
-			if (err) {
-				f2fs_put_page(page, 1);
+			if (err)
 				goto fail;
-			}
 		}
 	}
 out_update:
@@ -1440,8 +1439,8 @@ put_fail:
 	f2fs_put_dnode(&dn);
 unlock_fail:
 	f2fs_unlock_op(sbi);
-	f2fs_put_page(page, 1);
 fail:
+	f2fs_put_page(page, 1);
 	f2fs_write_failed(mapping, pos + len);
 	return err;
 }
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index a05eb35a372c..7dd2b9d78a45 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -991,8 +991,7 @@ fail:
 /*
  * Caller should do after getting the following values.
  * 0: f2fs_put_page(page, 0)
- * LOCKED_PAGE: f2fs_put_page(page, 1)
- * error: nothing
+ * LOCKED_PAGE or error: f2fs_put_page(page, 1)
  */
 static int read_node_page(struct page *page, int rw)
 {
@@ -1010,7 +1009,6 @@ static int read_node_page(struct page *page, int rw)
 
 	if (unlikely(ni.blk_addr == NULL_ADDR)) {
 		ClearPageUptodate(page);
-		f2fs_put_page(page, 1);
 		return -ENOENT;
 	}
 
@@ -1041,10 +1039,7 @@ void ra_node_page(struct f2fs_sb_info *sbi, nid_t nid)
 		return;
 
 	err = read_node_page(apage, READA);
-	if (err == 0)
-		f2fs_put_page(apage, 0);
-	else if (err == LOCKED_PAGE)
-		f2fs_put_page(apage, 1);
+	f2fs_put_page(apage, err ? 1 : 0);
 }
 
 struct page *get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid)
@@ -1057,10 +1052,12 @@ repeat:
 		return ERR_PTR(-ENOMEM);
 
 	err = read_node_page(page, READ_SYNC);
-	if (err < 0)
+	if (err < 0) {
+		f2fs_put_page(page, 1);
 		return ERR_PTR(err);
-	else if (err != LOCKED_PAGE)
+	} else if (err != LOCKED_PAGE) {
 		lock_page(page);
+	}
 
 	if (unlikely(!PageUptodate(page) || nid != nid_of_node(page))) {
 		ClearPageUptodate(page);
@@ -1096,10 +1093,12 @@ repeat:
 		return ERR_PTR(-ENOMEM);
 
 	err = read_node_page(page, READ_SYNC);
-	if (err < 0)
+	if (err < 0) {
+		f2fs_put_page(page, 1);
 		return ERR_PTR(err);
-	else if (err == LOCKED_PAGE)
+	} else if (err == LOCKED_PAGE) {
 		goto page_hit;
+	}
 
 	blk_start_plug(&plug);
 
-- 
cgit v1.2.3


From 0f825ee6e873ac0daf5394c5ec76ca2f3d540370 Mon Sep 17 00:00:00 2001
From: Fan Li <fanofcode.li@samsung.com>
Date: Wed, 15 Jul 2015 18:05:17 +0800
Subject: f2fs: add new interfaces for extent tree

Add a lookup and an insertion interface for the extent tree.
When no match is found, the new lookup returns the insert position and
the prev/next extents closest to the offset being looked up.
The new insertion uses these parameters to improve performance.

There are three possible insertions after the lookup in
f2fs_update_extent_tree. Two of them insert parts of the removed extent
back into the tree; since no merge happens during this process, the new
insertion skips the merge check in this scenario. The other insertion
inserts a new extent into the tree; the new insertion uses the prev/next
extents and the insert position to insert this extent directly, saving
the time of searching down the tree.

As long as the tree remains unchanged between lookup and insertion, this
works fine. The new lookup will also be useful when adding multi-block
extent support to the insertion interface.

Signed-off-by: Fan li <fanofcode.li@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/extent_cache.c | 139 ++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 132 insertions(+), 7 deletions(-)

(limited to 'fs/f2fs')

diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c
index 362df8cd54d4..32fae8ad5b7e 100644
--- a/fs/f2fs/extent_cache.c
+++ b/fs/f2fs/extent_cache.c
@@ -302,6 +302,126 @@ out:
 	return ret;
 }
 
+
+/*
+ * lookup extent at @fofs, if hit, return the extent
+ * if not, return NULL and
+ * @prev_ex: extent before fofs
+ * @next_ex: extent after fofs
+ * @insert_p: insert point for new extent at fofs
+ * in order to simpfy the insertion after.
+ * tree must stay unchanged between lookup and insertion.
+ */
+static struct extent_node *__lookup_extent_tree_ret(struct extent_tree *et,
+				unsigned int fofs, struct extent_node **prev_ex,
+				struct extent_node **next_ex,
+				struct rb_node ***insert_p,
+				struct rb_node **insert_parent)
+{
+	struct rb_node **pnode = &et->root.rb_node;
+	struct rb_node *parent = NULL, *tmp_node;
+	struct extent_node *en;
+
+	if (et->cached_en) {
+		struct extent_info *cei = &et->cached_en->ei;
+
+		if (cei->fofs <= fofs && cei->fofs + cei->len > fofs)
+			return et->cached_en;
+	}
+
+	while (*pnode) {
+		parent = *pnode;
+		en = rb_entry(*pnode, struct extent_node, rb_node);
+
+		if (fofs < en->ei.fofs)
+			pnode = &(*pnode)->rb_left;
+		else if (fofs >= en->ei.fofs + en->ei.len)
+			pnode = &(*pnode)->rb_right;
+		else
+			return en;
+	}
+
+	*insert_p = pnode;
+	*insert_parent = parent;
+
+	en = rb_entry(parent, struct extent_node, rb_node);
+	tmp_node = parent;
+	if (parent && fofs > en->ei.fofs)
+		tmp_node = rb_next(parent);
+	*next_ex = tmp_node ?
+		rb_entry(tmp_node, struct extent_node, rb_node) : NULL;
+
+	tmp_node = parent;
+	if (parent && fofs < en->ei.fofs)
+		tmp_node = rb_prev(parent);
+	*prev_ex = tmp_node ?
+		rb_entry(tmp_node, struct extent_node, rb_node) : NULL;
+
+	return NULL;
+}
+
+static struct extent_node *__insert_extent_tree_ret(struct f2fs_sb_info *sbi,
+				struct extent_tree *et, struct extent_info *ei,
+				struct extent_node **den,
+				struct extent_node *prev_ex,
+				struct extent_node *next_ex,
+				struct rb_node **insert_p,
+				struct rb_node *insert_parent)
+{
+	struct rb_node **p = &et->root.rb_node;
+	struct rb_node *parent = NULL;
+	struct extent_node *en = NULL;
+	int merged = 0;
+
+	if (prev_ex && __is_back_mergeable(ei, &prev_ex->ei)) {
+		f2fs_bug_on(sbi, !den);
+		merged = 1;
+		prev_ex->ei.len += ei->len;
+		ei = &prev_ex->ei;
+		en = prev_ex;
+	}
+	if (next_ex && __is_front_mergeable(ei, &next_ex->ei)) {
+		f2fs_bug_on(sbi, !den);
+		if (merged++) {
+			__detach_extent_node(sbi, et, prev_ex);
+			*den = prev_ex;
+		}
+		next_ex->ei.fofs = ei->fofs;
+		next_ex->ei.blk = ei->blk;
+		next_ex->ei.len += ei->len;
+		en = next_ex;
+	}
+	if (merged)
+		goto update_out;
+
+	if (insert_p && insert_parent) {
+		parent = insert_parent;
+		p = insert_p;
+		goto do_insert;
+	}
+
+	while (*p) {
+		parent = *p;
+		en = rb_entry(parent, struct extent_node, rb_node);
+
+		if (ei->fofs < en->ei.fofs)
+			p = &(*p)->rb_left;
+		else if (ei->fofs >= en->ei.fofs + en->ei.len)
+			p = &(*p)->rb_right;
+		else
+			f2fs_bug_on(sbi, 1);
+	}
+do_insert:
+	en = __attach_extent_node(sbi, et, ei, parent, p);
+	if (!en)
+		return NULL;
+update_out:
+	if (en->ei.len > et->largest.len)
+		et->largest = en->ei;
+	et->cached_en = en;
+	return en;
+}
+
 /* return true, if on-disk extent should be updated */
 static bool f2fs_update_extent_tree(struct inode *inode, pgoff_t fofs,
 							block_t blkaddr)
@@ -309,8 +429,9 @@ static bool f2fs_update_extent_tree(struct inode *inode, pgoff_t fofs,
 	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
 	struct extent_tree *et = F2FS_I(inode)->extent_tree;
 	struct extent_node *en = NULL, *en1 = NULL, *en2 = NULL, *en3 = NULL;
-	struct extent_node *den = NULL;
+	struct extent_node *den = NULL, *prev_ex = NULL, *next_ex = NULL;
 	struct extent_info ei, dei, prev;
+	struct rb_node **insert_p = NULL, *insert_parent = NULL;
 	unsigned int endofs;
 
 	if (!et)
@@ -332,20 +453,22 @@ static bool f2fs_update_extent_tree(struct inode *inode, pgoff_t fofs,
 	f2fs_drop_largest_extent(inode, fofs);
 
 	/* 1. lookup and remove existing extent info in cache */
-	en = __lookup_extent_tree(et, fofs);
+	en = __lookup_extent_tree_ret(et, fofs, &prev_ex, &next_ex,
+					&insert_p, &insert_parent);
 	if (!en)
 		goto update_extent;
 
 	dei = en->ei;
 	__detach_extent_node(sbi, et, en);
 
-	/* 2. if extent can be split more, split and insert the left part */
+	/* 2. if extent can be split, try to split it */
 	if (dei.len > F2FS_MIN_EXTENT_LEN) {
 		/*  insert left part of split extent into cache */
 		if (fofs - dei.fofs >= F2FS_MIN_EXTENT_LEN) {
 			set_extent_info(&ei, dei.fofs, dei.blk,
-							fofs - dei.fofs);
-			en1 = __insert_extent_tree(sbi, et, &ei, NULL);
+						fofs - dei.fofs);
+			en1 = __insert_extent_tree_ret(sbi, et, &ei, NULL,
+						NULL, NULL, NULL, NULL);
 		}
 
 		/* insert right part of split extent into cache */
@@ -353,7 +476,8 @@ static bool f2fs_update_extent_tree(struct inode *inode, pgoff_t fofs,
 		if (endofs - fofs >= F2FS_MIN_EXTENT_LEN) {
 			set_extent_info(&ei, fofs + 1,
 				fofs - dei.fofs + dei.blk + 1, endofs - fofs);
-			en2 = __insert_extent_tree(sbi, et, &ei, NULL);
+			en2 = __insert_extent_tree_ret(sbi, et, &ei, NULL,
+						NULL, NULL, NULL, NULL);
 		}
 	}
 
@@ -361,7 +485,8 @@ update_extent:
 	/* 3. update extent in extent cache */
 	if (blkaddr) {
 		set_extent_info(&ei, fofs, blkaddr, 1);
-		en3 = __insert_extent_tree(sbi, et, &ei, &den);
+		en3 = __insert_extent_tree_ret(sbi, et, &ei, &den,
+				prev_ex, next_ex, insert_p, insert_parent);
 
 		/* give up extent_cache, if split and small updates happen */
 		if (dei.len >= 1 &&
-- 
cgit v1.2.3


From ecbaa4068f88f96a8ffde37d532e618508394b53 Mon Sep 17 00:00:00 2001
From: Chao Yu <chao2.yu@samsung.com>
Date: Thu, 16 Jul 2015 18:18:11 +0800
Subject: f2fs: reduce region of cp_rwsem covered in f2fs_do_collapse

In f2fs_do_collapse, the region covered by cp_rwsem is large, since the
lock is held until all blocks are left-shifted. So if we try to collapse a
small area at the beginning of a large file, a checkpoint that wants to grab
the writer's lock of cp_rwsem will be delayed for a long time.

To avoid this condition, lock/unlock cp_rwsem around each shift
operation instead.

Signed-off-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/file.c | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

(limited to 'fs/f2fs')

diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 9c40f8cfb77c..d0114710648e 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -800,11 +800,11 @@ static int f2fs_do_collapse(struct inode *inode, pgoff_t start, pgoff_t end)
 	pgoff_t nrpages = (i_size_read(inode) + PAGE_SIZE - 1) / PAGE_SIZE;
 	int ret = 0;
 
-	f2fs_lock_op(sbi);
-
 	for (; end < nrpages; start++, end++) {
 		block_t new_addr, old_addr;
 
+		f2fs_lock_op(sbi);
+
 		set_new_dnode(&dn, inode, NULL, NULL, 0);
 		ret = get_dnode_of_data(&dn, end, LOOKUP_NODE_RA);
 		if (ret && ret != -ENOENT) {
@@ -820,13 +820,16 @@ static int f2fs_do_collapse(struct inode *inode, pgoff_t start, pgoff_t end)
 		if (new_addr == NULL_ADDR) {
 			set_new_dnode(&dn, inode, NULL, NULL, 0);
 			ret = get_dnode_of_data(&dn, start, LOOKUP_NODE_RA);
-			if (ret && ret != -ENOENT)
+			if (ret && ret != -ENOENT) {
 				goto out;
-			else if (ret == -ENOENT)
+			} else if (ret == -ENOENT) {
+				f2fs_unlock_op(sbi);
 				continue;
+			}
 
 			if (dn.data_blkaddr == NULL_ADDR) {
 				f2fs_put_dnode(&dn);
+				f2fs_unlock_op(sbi);
 				continue;
 			} else {
 				truncate_data_blocks_range(&dn, 1);
@@ -865,8 +868,9 @@ static int f2fs_do_collapse(struct inode *inode, pgoff_t start, pgoff_t end)
 
 			f2fs_put_dnode(&dn);
 		}
+		f2fs_unlock_op(sbi);
 	}
-	ret = 0;
+	return 0;
 out:
 	f2fs_unlock_op(sbi);
 	return ret;
-- 
cgit v1.2.3


From 55f57d2c4259a9a4048cf4629a2c6ba53729188d Mon Sep 17 00:00:00 2001
From: Chao Yu <chao2.yu@samsung.com>
Date: Thu, 16 Jul 2015 18:19:02 +0800
Subject: f2fs: fix double lock in handle_failed_inode

In handle_failed_inode, there is a potential deadlock which can happen
in below call path:

- f2fs_create
 - f2fs_lock_op   down_read(cp_rwsem)
 - f2fs_add_link
  - __f2fs_add_link
   - init_inode_metadata
    - f2fs_init_security    failed
    - truncate_blocks    failed
 - handle_failed_inode
  - f2fs_truncate
   - truncate_blocks(..,true)
					- write_checkpoint
					 - block_operations
					  - f2fs_lock_all  down_write(cp_rwsem)
    - f2fs_lock_op   down_read(cp_rwsem)

So in this path, we pass parameter to f2fs_truncate to make sure
cp_rwsem in truncate_blocks will not be locked again.

Signed-off-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/f2fs.h  | 2 +-
 fs/f2fs/file.c  | 6 +++---
 fs/f2fs/inode.c | 4 ++--
 fs/f2fs/super.c | 2 +-
 4 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'fs/f2fs')

diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 38ba525c3d6f..e73f2e2453f9 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -1579,7 +1579,7 @@ static inline bool f2fs_may_extent_tree(struct inode *inode)
 int f2fs_sync_file(struct file *, loff_t, loff_t, int);
 void truncate_data_blocks(struct dnode_of_data *);
 int truncate_blocks(struct inode *, u64, bool);
-void f2fs_truncate(struct inode *);
+void f2fs_truncate(struct inode *, bool);
 int f2fs_getattr(struct vfsmount *, struct dentry *, struct kstat *);
 int f2fs_setattr(struct dentry *, struct iattr *);
 int truncate_hole(struct inode *, pgoff_t, pgoff_t);
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index d0114710648e..15df014aadc7 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -579,7 +579,7 @@ out:
 	return err;
 }
 
-void f2fs_truncate(struct inode *inode)
+void f2fs_truncate(struct inode *inode, bool lock)
 {
 	if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
 				S_ISLNK(inode->i_mode)))
@@ -593,7 +593,7 @@ void f2fs_truncate(struct inode *inode)
 			return;
 	}
 
-	if (!truncate_blocks(inode, i_size_read(inode), true)) {
+	if (!truncate_blocks(inode, i_size_read(inode), lock)) {
 		inode->i_mtime = inode->i_ctime = CURRENT_TIME;
 		mark_inode_dirty(inode);
 	}
@@ -656,7 +656,7 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr)
 
 		if (attr->ia_size <= i_size_read(inode)) {
 			truncate_setsize(inode, attr->ia_size);
-			f2fs_truncate(inode);
+			f2fs_truncate(inode, true);
 			f2fs_balance_fs(F2FS_I_SB(inode));
 		} else {
 			/*
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index 5b7547f0bdea..cc4f1082419a 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -343,7 +343,7 @@ void f2fs_evict_inode(struct inode *inode)
 	i_size_write(inode, 0);
 
 	if (F2FS_HAS_BLOCKS(inode))
-		f2fs_truncate(inode);
+		f2fs_truncate(inode, true);
 
 	f2fs_lock_op(sbi);
 	remove_inode_page(inode);
@@ -385,7 +385,7 @@ void handle_failed_inode(struct inode *inode)
 
 	i_size_write(inode, 0);
 	if (F2FS_HAS_BLOCKS(inode))
-		f2fs_truncate(inode);
+		f2fs_truncate(inode, false);
 
 	remove_inode_page(inode);
 
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 0083b8559c9b..12eb69dd38af 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -467,7 +467,7 @@ static int f2fs_drop_inode(struct inode *inode)
 			i_size_write(inode, 0);
 
 			if (F2FS_HAS_BLOCKS(inode))
-				f2fs_truncate(inode);
+				f2fs_truncate(inode, true);
 
 			sb_end_intwrite(inode->i_sb);
 
-- 
cgit v1.2.3


From 737f18992ee81cab897336e84c5c7f4e179dfd61 Mon Sep 17 00:00:00 2001
From: Tiezhu Yang <kernelpatch@126.com>
Date: Fri, 17 Jul 2015 12:56:00 +0800
Subject: f2fs: optimize f2fs_write_cache_pages

The two if statements both execute "goto continue_unlock", and together
their conditions are true exactly when the value of "step" equals the
value of "is_cold_data(page)" (both 0 or both 1). So they can be merged
into a single comparison, with "goto continue_unlock" appearing only
once, to remove the duplicated code.

Signed-off-by: Tiezhu Yang <kernelpatch@126.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/data.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'fs/f2fs')

diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 7f51296fbbf6..801b0b0b08f4 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -1207,9 +1207,7 @@ continue_unlock:
 				goto continue_unlock;
 			}
 
-			if (step == 0 && !is_cold_data(page))
-				goto continue_unlock;
-			if (step == 1 && is_cold_data(page))
+			if (step == is_cold_data(page))
 				goto continue_unlock;
 
 			if (PageWriteback(page)) {
-- 
cgit v1.2.3


From 6a2905443cf27f9c14889428f14fccfb98ed97f4 Mon Sep 17 00:00:00 2001
From: Chao Yu <chao2.yu@samsung.com>
Date: Fri, 17 Jul 2015 18:02:39 +0800
Subject: f2fs: skip writing in ->writepages when no dirty pages exist

When flushing comes from the background and there is no dirty page in the
mapping of the inode, we had better skip searching the mapping for dirty
pages to write back.

Signed-off-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/data.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'fs/f2fs')

diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 801b0b0b08f4..e4081fc91012 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -1275,6 +1275,10 @@ static int f2fs_write_data_pages(struct address_space *mapping,
 	if (!mapping->a_ops->writepage)
 		return 0;
 
+	/* skip writing if there is no dirty page in this inode */
+	if (!get_dirty_pages(inode) && wbc->sync_mode == WB_SYNC_NONE)
+		return 0;
+
 	if (S_ISDIR(inode->i_mode) && wbc->sync_mode == WB_SYNC_NONE &&
 			get_dirty_pages(inode) < nr_pages_to_skip(sbi, DATA) &&
 			available_free_memory(sbi, DIRTY_DENTS))
-- 
cgit v1.2.3


From a5f64b6aa69b5cc05e198291811a2f3faf95b463 Mon Sep 17 00:00:00 2001
From: Chao Yu <chao2.yu@samsung.com>
Date: Fri, 17 Jul 2015 18:05:21 +0800
Subject: f2fs: fix to wait all atomic written pages writeback

This patch fixes the incorrect range (0, LONG_MAX) which is used
in ranged fsync. If we use LONG_MAX as the parameter indicating
the end of the file we want to synchronize, then on a 32-bit architecture
machine, data after the 4GB offset may not be persisted to
storage after ->fsync returns.

Here, we change LONG_MAX to LLONG_MAX to fix this issue.

Signed-off-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/file.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs/f2fs')

diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 15df014aadc7..d4da7fec757d 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -1357,7 +1357,7 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp)
 		commit_inmem_pages(inode, false);
 	}
 
-	ret = f2fs_sync_file(filp, 0, LONG_MAX, 0);
+	ret = f2fs_sync_file(filp, 0, LLONG_MAX, 0);
 	mnt_drop_write_file(filp);
 	return ret;
 }
-- 
cgit v1.2.3


From f4c9c743acedc2f083e6a1d4e186df6a2c12b2fd Mon Sep 17 00:00:00 2001
From: Chao Yu <chao2.yu@samsung.com>
Date: Fri, 17 Jul 2015 18:06:35 +0800
Subject: f2fs: convert inline data before set atomic/volatile flag

In f2fs_ioc_start_{atomic,volatile}_write, if we fail to convert
inline data, we report the error to the user but still leave the
atomic/volatile flag set in the inode, which impacts further writes to this
file. Fix it.

Signed-off-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/file.c | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

(limited to 'fs/f2fs')

diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index d4da7fec757d..25d1a2f501dc 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -1323,6 +1323,7 @@ static int f2fs_ioc_getversion(struct file *filp, unsigned long arg)
 static int f2fs_ioc_start_atomic_write(struct file *filp)
 {
 	struct inode *inode = file_inode(filp);
+	int ret;
 
 	if (!inode_owner_or_capable(inode))
 		return -EACCES;
@@ -1332,9 +1333,12 @@ static int f2fs_ioc_start_atomic_write(struct file *filp)
 	if (f2fs_is_atomic_file(inode))
 		return 0;
 
-	set_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
+	ret = f2fs_convert_inline_inode(inode);
+	if (ret)
+		return ret;
 
-	return f2fs_convert_inline_inode(inode);
+	set_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
+	return 0;
 }
 
 static int f2fs_ioc_commit_atomic_write(struct file *filp)
@@ -1365,6 +1369,7 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp)
 static int f2fs_ioc_start_volatile_write(struct file *filp)
 {
 	struct inode *inode = file_inode(filp);
+	int ret;
 
 	if (!inode_owner_or_capable(inode))
 		return -EACCES;
@@ -1372,9 +1377,12 @@ static int f2fs_ioc_start_volatile_write(struct file *filp)
 	if (f2fs_is_volatile_file(inode))
 		return 0;
 
-	set_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE);
+	ret = f2fs_convert_inline_inode(inode);
+	if (ret)
+		return ret;
 
-	return f2fs_convert_inline_inode(inode);
+	set_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE);
+	return 0;
 }
 
 static int f2fs_ioc_release_volatile_write(struct file *filp)
-- 
cgit v1.2.3


From e4e762723a90109c968c6c58f7d9bf4541c22928 Mon Sep 17 00:00:00 2001
From: Chao Yu <chao2.yu@samsung.com>
Date: Fri, 24 Jul 2015 18:24:45 +0800
Subject: f2fs: fix inline data/dentry stat number leak

If we clear the inline data/dentry flag in handle_failed_inode, we will fail
to decrement the stat count of inline data/dentry in f2fs_evict_inode because
the flag is no longer set in the inode. So remove the wrong clearing.

Signed-off-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/inode.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'fs/f2fs')

diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index cc4f1082419a..83354433d4d1 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -390,8 +390,6 @@ void handle_failed_inode(struct inode *inode)
 	remove_inode_page(inode);
 
 	set_inode_flag(F2FS_I(inode), FI_FREE_NID);
-	clear_inode_flag(F2FS_I(inode), FI_INLINE_DATA);
-	clear_inode_flag(F2FS_I(inode), FI_INLINE_DENTRY);
 	f2fs_unlock_op(sbi);
 
 	/* iput will drop the inode object */
-- 
cgit v1.2.3


From a6d494b6d84697f954aaade204e8a5843078a94f Mon Sep 17 00:00:00 2001
From: Chao Yu <chao2.yu@samsung.com>
Date: Fri, 24 Jul 2015 18:26:26 +0800
Subject: f2fs: fix to build free nids from readaheaded nat pages

When there are not enough free nids in the free nid cache, we try to
readahead FREE_NID_PAGES (4) nat pages into the page cache of meta_inode,
and then read the nat entries in those nat pages to add free nids to the
free nid cache.

But when traversing all the nat pages we read ahead in a loop,
our exit condition is not set right: one more nat page will be scanned
without readahead, resulting in worse read performance.

This patch fixes it to read the correct number of nat pages to avoid bad
performance.

Signed-off-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/node.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs/f2fs')

diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index 7dd2b9d78a45..ac9110788b17 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -1532,7 +1532,7 @@ static void build_free_nids(struct f2fs_sb_info *sbi)
 		if (unlikely(nid >= nm_i->max_nid))
 			nid = 0;
 
-		if (i++ == FREE_NID_PAGES)
+		if (++i >= FREE_NID_PAGES)
 			break;
 	}
 
-- 
cgit v1.2.3


From edb27deea7cabfff8feb8c62aae647b7673be734 Mon Sep 17 00:00:00 2001
From: Jaegeuk Kim <jaegeuk@kernel.org>
Date: Sat, 25 Jul 2015 00:52:52 -0700
Subject: f2fs: handle error cases in commit_inmem_pages

This patch adds handling of error cases in commit_inmem_pages.
If an error occurs, it stops writing the pages and returns the error right
away.

Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/f2fs.h    |  2 +-
 fs/f2fs/file.c    |  7 +++++--
 fs/f2fs/segment.c | 10 ++++++++--
 3 files changed, 14 insertions(+), 5 deletions(-)

(limited to 'fs/f2fs')

diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index e73f2e2453f9..58b05b541a4e 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -1697,7 +1697,7 @@ void destroy_node_manager_caches(void);
  * segment.c
  */
 void register_inmem_page(struct inode *, struct page *);
-void commit_inmem_pages(struct inode *, bool);
+int commit_inmem_pages(struct inode *, bool);
 void f2fs_balance_fs(struct f2fs_sb_info *);
 void f2fs_balance_fs_bg(struct f2fs_sb_info *);
 int f2fs_issue_flush(struct f2fs_sb_info *);
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 25d1a2f501dc..be69a01060a6 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -1358,10 +1358,13 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp)
 
 	if (f2fs_is_atomic_file(inode)) {
 		clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
-		commit_inmem_pages(inode, false);
+		ret = commit_inmem_pages(inode, false);
+		if (ret)
+			goto err_out;
 	}
 
 	ret = f2fs_sync_file(filp, 0, LLONG_MAX, 0);
+err_out:
 	mnt_drop_write_file(filp);
 	return ret;
 }
@@ -1418,7 +1421,7 @@ static int f2fs_ioc_abort_volatile_write(struct file *filp)
 
 	if (f2fs_is_atomic_file(inode)) {
 		clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
-		commit_inmem_pages(inode, false);
+		commit_inmem_pages(inode, true);
 	}
 
 	if (f2fs_is_volatile_file(inode))
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index f7bfc3b7d934..509a2c4bb7d3 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -227,7 +227,7 @@ retry:
 	trace_f2fs_register_inmem_page(page, INMEM);
 }
 
-void commit_inmem_pages(struct inode *inode, bool abort)
+int commit_inmem_pages(struct inode *inode, bool abort)
 {
 	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
 	struct f2fs_inode_info *fi = F2FS_I(inode);
@@ -239,6 +239,7 @@ void commit_inmem_pages(struct inode *inode, bool abort)
 		.rw = WRITE_SYNC | REQ_PRIO,
 		.encrypted_page = NULL,
 	};
+	int err = 0;
 
 	/*
 	 * The abort is true only when f2fs_evict_inode is called.
@@ -263,8 +264,12 @@ void commit_inmem_pages(struct inode *inode, bool abort)
 					inode_dec_dirty_pages(inode);
 				trace_f2fs_commit_inmem_page(cur->page, INMEM);
 				fio.page = cur->page;
-				do_write_data_page(&fio);
+				err = do_write_data_page(&fio);
 				submit_bio = true;
+				if (err) {
+					unlock_page(cur->page);
+					break;
+				}
 			}
 			f2fs_put_page(cur->page, 1);
 		} else {
@@ -283,6 +288,7 @@ void commit_inmem_pages(struct inode *inode, bool abort)
 		if (submit_bio)
 			f2fs_submit_merged_bio(sbi, DATA, WRITE);
 	}
+	return err;
 }
 
 /*
-- 
cgit v1.2.3


From 5768dcdd7f7675f9540e648428c8a1cd7208a0fe Mon Sep 17 00:00:00 2001
From: Fan Li <fanofcode.li@samsung.com>
Date: Tue, 4 Aug 2015 13:27:51 +0800
Subject: f2fs: change the timing of f2fs_wait_on_page_writeback

Some backing devices need pages to be stable during writeback. It doesn't
matter whether the page is completely overwritten or already uptodate; it
needs to wait before the write.

Signed-off-by: Fan li <fanofcode.li@samsung.com>
Reviewed-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/data.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs/f2fs')

diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index e4081fc91012..2692848e7f75 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -1383,13 +1383,13 @@ put_next:
 	f2fs_put_dnode(&dn);
 	f2fs_unlock_op(sbi);
 
+	f2fs_wait_on_page_writeback(page, DATA);
+
 	if (len == PAGE_CACHE_SIZE)
 		goto out_update;
 	if (PageUptodate(page))
 		goto out_clear;
 
-	f2fs_wait_on_page_writeback(page, DATA);
-
 	if ((pos & PAGE_CACHE_MASK) >= i_size_read(inode)) {
 		unsigned start = pos & (PAGE_CACHE_SIZE - 1);
 		unsigned end = start + len;
-- 
cgit v1.2.3


From f3f338caad3428fbc4bb563828efc6ecce4d956b Mon Sep 17 00:00:00 2001
From: Chao Yu <chao2.yu@samsung.com>
Date: Wed, 29 Jul 2015 17:33:13 +0800
Subject: f2fs: freeze filesystem when fail to update meta page due to IO error

In get_meta_page, we guarantee no failure for the returned page,
but sometimes an IO error from the device will result in returning a
non-uptodate page.

If we then still use this page as an uptodate one, exceptions could happen
when using this kind of page.

So in this condition, we had better freeze the fs by making it readonly
and stop doing checkpoints.

Signed-off-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/checkpoint.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'fs/f2fs')

diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index 6fb696da42e8..9c1acf69bfbb 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -79,6 +79,14 @@ repeat:
 		f2fs_put_page(page, 1);
 		goto repeat;
 	}
+
+	/*
+	 * if there is any IO error when accessing device, make our filesystem
+	 * readonly and make sure do not write checkpoint with non-uptodate
+	 * meta page.
+	 */
+	if (unlikely(!PageUptodate(page)))
+		f2fs_stop_checkpoint(sbi);
 out:
 	return page;
 }
-- 
cgit v1.2.3


From 7a04f64d4d5367ade827d75388d66054b535e201 Mon Sep 17 00:00:00 2001
From: Liu Xue <liuxueliu.liu@huawei.com>
Date: Mon, 27 Jul 2015 10:17:59 +0000
Subject: f2fs: unify f2fs_bug_on when check blocks and segment

Replace BUG_ON with f2fs_bug_on to handle failed
block and segment validity checks.

Signed-off-by: Xue Liu <liuxueliu.liu@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/segment.h | 45 ++++++++-------------------------------------
 1 file changed, 8 insertions(+), 37 deletions(-)

(limited to 'fs/f2fs')

diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
index 79e7b879a753..230f9cd9fa2a 100644
--- a/fs/f2fs/segment.h
+++ b/fs/f2fs/segment.h
@@ -555,16 +555,15 @@ static inline unsigned short curseg_blkoff(struct f2fs_sb_info *sbi, int type)
 	return curseg->next_blkoff;
 }
 
-#ifdef CONFIG_F2FS_CHECK_FS
 static inline void check_seg_range(struct f2fs_sb_info *sbi, unsigned int segno)
 {
-	BUG_ON(segno > TOTAL_SEGS(sbi) - 1);
+	f2fs_bug_on(sbi, segno > TOTAL_SEGS(sbi) - 1);
 }
 
 static inline void verify_block_addr(struct f2fs_sb_info *sbi, block_t blk_addr)
 {
-	BUG_ON(blk_addr < SEG0_BLKADDR(sbi));
-	BUG_ON(blk_addr >= MAX_BLKADDR(sbi));
+	f2fs_bug_on(sbi, blk_addr < SEG0_BLKADDR(sbi)
+					|| blk_addr >= MAX_BLKADDR(sbi));
 }
 
 /*
@@ -577,12 +576,11 @@ static inline void check_block_count(struct f2fs_sb_info *sbi,
 	int valid_blocks = 0;
 	int cur_pos = 0, next_pos;
 
-	/* check segment usage */
-	BUG_ON(GET_SIT_VBLOCKS(raw_sit) > sbi->blocks_per_seg);
-
-	/* check boundary of a given segment number */
-	BUG_ON(segno > TOTAL_SEGS(sbi) - 1);
+	/* check segment usage, and check boundary of a given segment number */
+	f2fs_bug_on(sbi, GET_SIT_VBLOCKS(raw_sit) > sbi->blocks_per_seg
+					|| segno > TOTAL_SEGS(sbi) - 1);
 
+#ifdef CONFIG_F2FS_CHECK_FS
 	/* check bitmap with valid block count */
 	do {
 		if (is_valid) {
@@ -598,35 +596,8 @@ static inline void check_block_count(struct f2fs_sb_info *sbi,
 		is_valid = !is_valid;
 	} while (cur_pos < sbi->blocks_per_seg);
 	BUG_ON(GET_SIT_VBLOCKS(raw_sit) != valid_blocks);
-}
-#else
-static inline void check_seg_range(struct f2fs_sb_info *sbi, unsigned int segno)
-{
-	if (segno > TOTAL_SEGS(sbi) - 1)
-		set_sbi_flag(sbi, SBI_NEED_FSCK);
-}
-
-static inline void verify_block_addr(struct f2fs_sb_info *sbi, block_t blk_addr)
-{
-	if (blk_addr < SEG0_BLKADDR(sbi) || blk_addr >= MAX_BLKADDR(sbi))
-		set_sbi_flag(sbi, SBI_NEED_FSCK);
-}
-
-/*
- * Summary block is always treated as an invalid block
- */
-static inline void check_block_count(struct f2fs_sb_info *sbi,
-		int segno, struct f2fs_sit_entry *raw_sit)
-{
-	/* check segment usage */
-	if (GET_SIT_VBLOCKS(raw_sit) > sbi->blocks_per_seg)
-		set_sbi_flag(sbi, SBI_NEED_FSCK);
-
-	/* check boundary of a given segment number */
-	if (segno > TOTAL_SEGS(sbi) - 1)
-		set_sbi_flag(sbi, SBI_NEED_FSCK);
-}
 #endif
+}
 
 static inline pgoff_t current_sit_addr(struct f2fs_sb_info *sbi,
 						unsigned int start)
-- 
cgit v1.2.3


From 470f00e9686f0b338a457568229fe7b7d44b8e6a Mon Sep 17 00:00:00 2001
From: Chao Yu <chao2.yu@samsung.com>
Date: Tue, 14 Jul 2015 18:14:06 +0800
Subject: f2fs: fix to release inode page correctly

In following call path, we will pass a locked and referenced ipage
pointer to get_new_data_page:
 - init_inode_metadata
  - make_empty_dir
   - get_new_data_page

There are two exit paths in get_new_data_page when an error occurs:
1) grab_cache_page fails, ipage will not be released;
2) f2fs_reserve_block fails, ipage will be released in the callee.

So, the error handling in get_new_data_page is not consistent.

For f2fs_reserve_block, it's not very easy to change the rule
of error handling, since it's already complicated.

Here we decide to choose an easy way to fix this issue:
if any error occurs in get_new_data_page, we ensure that ipage is
released in this function.

The same issue is in f2fs_convert_inline_dir, fix that too.

Signed-off-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/data.c   | 11 +++++++++--
 fs/f2fs/inline.c | 13 ++++++++++---
 2 files changed, 19 insertions(+), 5 deletions(-)

(limited to 'fs/f2fs')

diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 2692848e7f75..f8f93db437ce 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -388,7 +388,8 @@ repeat:
  *
  * Also, caller should grab and release a rwsem by calling f2fs_lock_op() and
  * f2fs_unlock_op().
- * Note that, ipage is set only by make_empty_dir.
+ * Note that, ipage is set only by make_empty_dir, and if any error occur,
+ * ipage should be released by this function.
  */
 struct page *get_new_data_page(struct inode *inode,
 		struct page *ipage, pgoff_t index, bool new_i_size)
@@ -399,8 +400,14 @@ struct page *get_new_data_page(struct inode *inode,
 	int err;
 repeat:
 	page = grab_cache_page(mapping, index);
-	if (!page)
+	if (!page) {
+		/*
+		 * before exiting, we should make sure ipage will be released
+		 * if any error occur.
+		 */
+		f2fs_put_page(ipage, 1);
 		return ERR_PTR(-ENOMEM);
+	}
 
 	set_new_dnode(&dn, inode, ipage, NULL, 0);
 	err = f2fs_reserve_block(&dn, index);
diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c
index a13ffcc32992..79d18d5c1fae 100644
--- a/fs/f2fs/inline.c
+++ b/fs/f2fs/inline.c
@@ -360,6 +360,10 @@ int make_empty_inline_dir(struct inode *inode, struct inode *parent,
 	return 0;
 }
 
+/*
+ * NOTE: ipage is grabbed by caller, but if any error occurs, we should
+ * release ipage in this function.
+ */
 static int f2fs_convert_inline_dir(struct inode *dir, struct page *ipage,
 				struct f2fs_inline_dentry *inline_dentry)
 {
@@ -369,8 +373,10 @@ static int f2fs_convert_inline_dir(struct inode *dir, struct page *ipage,
 	int err;
 
 	page = grab_cache_page(dir->i_mapping, 0);
-	if (!page)
+	if (!page) {
+		f2fs_put_page(ipage, 1);
 		return -ENOMEM;
+	}
 
 	set_new_dnode(&dn, dir, ipage, NULL, 0);
 	err = f2fs_reserve_block(&dn, 0);
@@ -434,8 +440,9 @@ int f2fs_add_inline_entry(struct inode *dir, const struct qstr *name,
 						slots, NR_INLINE_DENTRY);
 	if (bit_pos >= NR_INLINE_DENTRY) {
 		err = f2fs_convert_inline_dir(dir, ipage, dentry_blk);
-		if (!err)
-			err = -EAGAIN;
+		if (err)
+			return err;
+		err = -EAGAIN;
 		goto out;
 	}
 
-- 
cgit v1.2.3


From e90c2d2850d9d034e814a328725a4b15878f0357 Mon Sep 17 00:00:00 2001
From: Chao Yu <chao2.yu@samsung.com>
Date: Tue, 28 Jul 2015 18:36:47 +0800
Subject: f2fs: invalidate temporary meta page

To avoid meeting garbage data in next free node block at the end of warm
node chain when doing recovery, we will try to zero out that invalid block.

If the device does not support discard, our way of zeroing out the block is:
grabbing a temporary zeroed page in the meta inode, then issuing a write
request with this page.

But we forgot to release that temporary page, so our memory usage will
increase without gaining any hit ratio benefit; it's better to free it
to save memory.

Signed-off-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/checkpoint.c | 13 ++++++++++++-
 fs/f2fs/f2fs.h       |  2 +-
 fs/f2fs/recovery.c   | 11 ++++++++++-
 fs/f2fs/segment.c    |  9 ++++++---
 4 files changed, 29 insertions(+), 6 deletions(-)

(limited to 'fs/f2fs')

diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index 9c1acf69bfbb..c3111769d382 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -896,12 +896,15 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
 	__u32 crc32 = 0;
 	int i;
 	int cp_payload_blks = __cp_payload(sbi);
+	block_t discard_blk = NEXT_FREE_BLKADDR(sbi, curseg);
+	bool invalidate = false;
 
 	/*
 	 * This avoids to conduct wrong roll-forward operations and uses
 	 * metapages, so should be called prior to sync_meta_pages below.
 	 */
-	discard_next_dnode(sbi, NEXT_FREE_BLKADDR(sbi, curseg));
+	if (discard_next_dnode(sbi, discard_blk))
+		invalidate = true;
 
 	/* Flush all the NAT/SIT pages */
 	while (get_pages(sbi, F2FS_DIRTY_META)) {
@@ -1030,6 +1033,14 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
 	/* wait for previous submitted meta pages writeback */
 	wait_on_all_pages_writeback(sbi);
 
+	/*
+	 * invalidate meta page which is used temporarily for zeroing out
+	 * block at the end of warm node chain.
+	 */
+	if (invalidate)
+		invalidate_mapping_pages(META_MAPPING(sbi), discard_blk,
+								discard_blk);
+
 	release_dirty_inode(sbi);
 
 	if (unlikely(f2fs_cp_error(sbi)))
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 58b05b541a4e..34a524d007ec 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -1707,7 +1707,7 @@ void invalidate_blocks(struct f2fs_sb_info *, block_t);
 void refresh_sit_entry(struct f2fs_sb_info *, block_t, block_t);
 void clear_prefree_segments(struct f2fs_sb_info *, struct cp_control *);
 void release_discard_addrs(struct f2fs_sb_info *);
-void discard_next_dnode(struct f2fs_sb_info *, block_t);
+bool discard_next_dnode(struct f2fs_sb_info *, block_t);
 int npages_for_summary_flush(struct f2fs_sb_info *, bool);
 void allocate_new_segments(struct f2fs_sb_info *);
 int f2fs_trim_fs(struct f2fs_sb_info *, struct fstrim_range *);
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index 24a8c1d4f45f..07a36e413ace 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -561,11 +561,20 @@ out:
 
 	clear_sbi_flag(sbi, SBI_POR_DOING);
 	if (err) {
-		discard_next_dnode(sbi, blkaddr);
+		bool invalidate = false;
+
+		if (discard_next_dnode(sbi, blkaddr))
+			invalidate = true;
 
 		/* Flush all the NAT/SIT pages */
 		while (get_pages(sbi, F2FS_DIRTY_META))
 			sync_meta_pages(sbi, META, LONG_MAX);
+
+		/* invalidate temporary meta page */
+		if (invalidate)
+			invalidate_mapping_pages(META_MAPPING(sbi),
+							blkaddr, blkaddr);
+
 		set_ckpt_flags(sbi->ckpt, CP_ERROR_FLAG);
 		mutex_unlock(&sbi->cp_mutex);
 	} else if (need_writecp) {
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 509a2c4bb7d3..1f1200487c44 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -514,7 +514,7 @@ static int f2fs_issue_discard(struct f2fs_sb_info *sbi,
 	return blkdev_issue_discard(sbi->sb->s_bdev, start, len, GFP_NOFS, 0);
 }
 
-void discard_next_dnode(struct f2fs_sb_info *sbi, block_t blkaddr)
+bool discard_next_dnode(struct f2fs_sb_info *sbi, block_t blkaddr)
 {
 	int err = -ENOTSUPP;
 
@@ -524,13 +524,16 @@ void discard_next_dnode(struct f2fs_sb_info *sbi, block_t blkaddr)
 		unsigned int offset = GET_BLKOFF_FROM_SEG0(sbi, blkaddr);
 
 		if (f2fs_test_bit(offset, se->discard_map))
-			return;
+			return false;
 
 		err = f2fs_issue_discard(sbi, blkaddr, 1);
 	}
 
-	if (err)
+	if (err) {
 		update_meta_page(sbi, NULL, blkaddr);
+		return true;
+	}
+	return false;
 }
 
 static void __add_discard_entry(struct f2fs_sb_info *sbi,
-- 
cgit v1.2.3


From 759af1c9c16fec5323111b799ce25a3d8864df7e Mon Sep 17 00:00:00 2001
From: Fan Li <fanofcode.li@samsung.com>
Date: Wed, 5 Aug 2015 15:52:16 +0800
Subject: f2fs: use extent cache to optimize f2fs_reserve_block

In some cases, we only need the block address when we call
f2fs_reserve_block; the other fields of struct dnode_of_data aren't
necessary. We can try the extent cache first in such cases in order to
speed up the process.

Signed-off-by: Fan li <fanofcode.li@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/data.c | 16 +++++++++++++++-
 fs/f2fs/f2fs.h |  1 +
 2 files changed, 16 insertions(+), 1 deletion(-)

(limited to 'fs/f2fs')

diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index f8f93db437ce..4fabdd47490a 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -263,6 +263,19 @@ int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index)
 	return err;
 }
 
+int f2fs_get_block(struct dnode_of_data *dn, pgoff_t index)
+{
+	struct extent_info ei;
+	struct inode *inode = dn->inode;
+
+	if (f2fs_lookup_extent_cache(inode, index, &ei)) {
+		dn->data_blkaddr = ei.blk + index - ei.fofs;
+		return 0;
+	}
+
+	return f2fs_reserve_block(dn, index);
+}
+
 struct page *get_read_data_page(struct inode *inode, pgoff_t index, int rw)
 {
 	struct address_space *mapping = inode->i_mapping;
@@ -1383,7 +1396,8 @@ repeat:
 		if (err)
 			goto put_fail;
 	}
-	err = f2fs_reserve_block(&dn, index);
+
+	err = f2fs_get_block(&dn, index);
 	if (err)
 		goto put_fail;
 put_next:
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 34a524d007ec..09cb365a07cc 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -1768,6 +1768,7 @@ int f2fs_submit_page_bio(struct f2fs_io_info *);
 void f2fs_submit_page_mbio(struct f2fs_io_info *);
 void set_data_blkaddr(struct dnode_of_data *);
 int reserve_new_block(struct dnode_of_data *);
+int f2fs_get_block(struct dnode_of_data *, pgoff_t);
 int f2fs_reserve_block(struct dnode_of_data *, pgoff_t);
 struct page *get_read_data_page(struct inode *, pgoff_t, int);
 struct page *find_data_page(struct inode *, pgoff_t);
-- 
cgit v1.2.3


From 12a8343e99a8af50b2a1cd8da72d34b6e860da0f Mon Sep 17 00:00:00 2001
From: Chao Yu <chao2.yu@samsung.com>
Date: Wed, 5 Aug 2015 17:23:54 +0800
Subject: f2fs: recover invalid/reserved block address for fsynced file

When testing with generic/101 in xfstests, the following error is output:

    --- tests/generic/101.out
    +++ results//generic/101.out.bad
    @@ -10,10 +10,14 @@
     File foo content after log replay:
     0000000 aa aa aa aa aa aa aa aa aa aa aa aa aa aa aa aa
     *
    -0200000 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
    +0200000 bb bb bb bb bb bb bb bb bb bb bb bb bb bb bb bb
     *
     0372000
    ...
    (Run 'diff -u tests/generic/101.out results/generic/101.out.bad'  to see the entire diff)

The test flow is like below:
1. pwrite foo -S 0xaa 0 64K
2. pwrite foo -S 0xbb 64K 61K
3. sync
4. truncate foo 64K
5. truncate foo 125K
6. fsync foo
7. flakey drop writes
8. umount

After this test, we expect the recovered file to have its first 64k of
data filled with value 0xaa and the next 61k of data filled with value
0x00, because we fsynced it before dropping writes in dm.

In f2fs, during recovery, we only recover a valid block address in a
direct node page if it is marked as a fsynced dnode, but a block address
which means invalid/reserved (with value NULL_ADDR/NEW_ADDR) is not
recovered. So, the recovered file shows the incorrect data 0xbb in the
range [64k, 125k].

In this patch, we also recover invalid/reserved blocks during the recovery
flow.

Signed-off-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/recovery.c | 28 ++++++++++++++++++++++++----
 1 file changed, 24 insertions(+), 4 deletions(-)

(limited to 'fs/f2fs')

diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index 07a36e413ace..d2ef0c9f53e7 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -399,14 +399,35 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
 	f2fs_bug_on(sbi, ni.ino != ino_of_node(page));
 	f2fs_bug_on(sbi, ofs_of_node(dn.node_page) != ofs_of_node(page));
 
-	for (; start < end; start++) {
+	for (; start < end; start++, dn.ofs_in_node++) {
 		block_t src, dest;
 
 		src = datablock_addr(dn.node_page, dn.ofs_in_node);
 		dest = datablock_addr(page, dn.ofs_in_node);
 
-		if (src != dest && dest != NEW_ADDR && dest != NULL_ADDR &&
-			is_valid_blkaddr(sbi, dest, META_POR)) {
+		/* skip recovering if dest is the same as src */
+		if (src == dest)
+			continue;
+
+		/* dest is invalid, just invalidate src block */
+		if (dest == NULL_ADDR) {
+			truncate_data_blocks_range(&dn, 1);
+			continue;
+		}
+
+		/*
+		 * dest is reserved block, invalidate src block
+		 * and then reserve one new block in dnode page.
+		 */
+		if (dest == NEW_ADDR) {
+			truncate_data_blocks_range(&dn, 1);
+			err = reserve_new_block(&dn);
+			f2fs_bug_on(sbi, err);
+			continue;
+		}
+
+		/* dest is valid block, try to recover from src to dest */
+		if (is_valid_blkaddr(sbi, dest, META_POR)) {
 
 			if (src == NULL_ADDR) {
 				err = reserve_new_block(&dn);
@@ -424,7 +445,6 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
 							ni.version, false);
 			recovered++;
 		}
-		dn.ofs_in_node++;
 	}
 
 	if (IS_INODE(dn.node_page))
-- 
cgit v1.2.3


From 6394328ab8a2ab6b127ae85f716943d92595878d Mon Sep 17 00:00:00 2001
From: Chao Yu <chao2.yu@samsung.com>
Date: Fri, 7 Aug 2015 18:36:06 +0800
Subject: f2fs: report error of fill_zero

fill_zero can fail for a number of reasons, but previously its failure was
not reported, so its callers such as punch_hole/f2fs_zero_range could
report success even though an error had occurred inside fill_zero.

This patch makes fill_zero return its error code and makes the callers
report it.

Signed-off-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/file.c | 56 ++++++++++++++++++++++++++++++++++++++------------------
 1 file changed, 38 insertions(+), 18 deletions(-)

(limited to 'fs/f2fs')

diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index be69a01060a6..016ed3ba2ca4 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -695,14 +695,14 @@ const struct inode_operations f2fs_file_inode_operations = {
 	.fiemap		= f2fs_fiemap,
 };
 
-static void fill_zero(struct inode *inode, pgoff_t index,
+static int fill_zero(struct inode *inode, pgoff_t index,
 					loff_t start, loff_t len)
 {
 	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
 	struct page *page;
 
 	if (!len)
-		return;
+		return 0;
 
 	f2fs_balance_fs(sbi);
 
@@ -710,12 +710,14 @@ static void fill_zero(struct inode *inode, pgoff_t index,
 	page = get_new_data_page(inode, NULL, index, false);
 	f2fs_unlock_op(sbi);
 
-	if (!IS_ERR(page)) {
-		f2fs_wait_on_page_writeback(page, DATA);
-		zero_user(page, start, len);
-		set_page_dirty(page);
-		f2fs_put_page(page, 1);
-	}
+	if (IS_ERR(page))
+		return PTR_ERR(page);
+
+	f2fs_wait_on_page_writeback(page, DATA);
+	zero_user(page, start, len);
+	set_page_dirty(page);
+	f2fs_put_page(page, 1);
+	return 0;
 }
 
 int truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end)
@@ -763,14 +765,22 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len)
 	off_end = (offset + len) & (PAGE_CACHE_SIZE - 1);
 
 	if (pg_start == pg_end) {
-		fill_zero(inode, pg_start, off_start,
+		ret = fill_zero(inode, pg_start, off_start,
 						off_end - off_start);
+		if (ret)
+			return ret;
 	} else {
-		if (off_start)
-			fill_zero(inode, pg_start++, off_start,
-					PAGE_CACHE_SIZE - off_start);
-		if (off_end)
-			fill_zero(inode, pg_end, 0, off_end);
+		if (off_start) {
+			ret = fill_zero(inode, pg_start++, off_start,
+						PAGE_CACHE_SIZE - off_start);
+			if (ret)
+				return ret;
+		}
+		if (off_end) {
+			ret = fill_zero(inode, pg_end, 0, off_end);
+			if (ret)
+				return ret;
+		}
 
 		if (pg_start < pg_end) {
 			struct address_space *mapping = inode->i_mapping;
@@ -961,14 +971,21 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len,
 	off_end = (offset + len) & (PAGE_CACHE_SIZE - 1);
 
 	if (pg_start == pg_end) {
-		fill_zero(inode, pg_start, off_start, off_end - off_start);
+		ret = fill_zero(inode, pg_start, off_start,
+						off_end - off_start);
+		if (ret)
+			return ret;
+
 		if (offset + len > new_size)
 			new_size = offset + len;
 		new_size = max_t(loff_t, new_size, offset + len);
 	} else {
 		if (off_start) {
-			fill_zero(inode, pg_start++, off_start,
-					PAGE_CACHE_SIZE - off_start);
+			ret = fill_zero(inode, pg_start++, off_start,
+						PAGE_CACHE_SIZE - off_start);
+			if (ret)
+				return ret;
+
 			new_size = max_t(loff_t, new_size,
 						pg_start << PAGE_CACHE_SHIFT);
 		}
@@ -1010,7 +1027,10 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len,
 		}
 
 		if (off_end) {
-			fill_zero(inode, pg_end, 0, off_end);
+			ret = fill_zero(inode, pg_end, 0, off_end);
+			if (ret)
+				goto out;
+
 			new_size = max_t(loff_t, new_size, offset + len);
 		}
 	}
-- 
cgit v1.2.3


From c15e8599ffe1b4f866691424d07037c467c23a2f Mon Sep 17 00:00:00 2001
From: Chao Yu <chao2.yu@samsung.com>
Date: Fri, 7 Aug 2015 18:39:32 +0800
Subject: f2fs: report EINVAL for unalignment direct IO

We ran the ltp testcase on f2fs and obtained a TFAIL in diotest4; the
detailed result is as follows:

dio04

<<<test_start>>>
tag=dio04 stime=1432278894
cmdline="diotest4"
contacts=""
analysis=exit
<<<test_output>>>
diotest4    1  TPASS  :  Negative Offset
diotest4    2  TPASS  :  removed
diotest4    3  TFAIL  :  diotest4.c:129: write allows odd count.returns 1: Success
diotest4    4  TFAIL  :  diotest4.c:183: Odd count of read and write
diotest4    5  TPASS  :  Read beyond the file size
......

the result of ext4 with same environment:

dio04

<<<test_start>>>
tag=dio04 stime=1432259643
cmdline="diotest4"
contacts=""
analysis=exit
<<<test_output>>>
diotest4    1  TPASS  :  Negative Offset
diotest4    2  TPASS  :  removed
diotest4    3  TPASS  :  Odd count of read and write
diotest4    4  TPASS  :  Read beyond the file size
......

The reason is that when triggering DIO in f2fs, we return zero from
->direct_IO if the writer's buffer offset, file offset or transfer size is
not aligned to the block size of the filesystem, which results in falling
back to buffered write instead of returning -EINVAL.

This patch fixes that problem by returning the correct error number in the
above case, and by removing the read check in check_direct_IO so that the
verification is applied to direct reads too.

Besides, Jaegeuk Kim pointed out that there are exceptional cases in which
we should always make direct IO fall back to buffered write, such as DIO
on an encrypted file.

Signed-off-by: Yunlei He <heyunlei@huawei.com>
[Chao Yu make small change and add detail description in commit message]
Signed-off-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/data.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

(limited to 'fs/f2fs')

diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 4fabdd47490a..7ea8eda8f137 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -1494,9 +1494,6 @@ static int check_direct_IO(struct inode *inode, struct iov_iter *iter,
 {
 	unsigned blocksize_mask = inode->i_sb->s_blocksize - 1;
 
-	if (iov_iter_rw(iter) == READ)
-		return 0;
-
 	if (offset & blocksize_mask)
 		return -EINVAL;
 
@@ -1525,8 +1522,9 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
 	if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode))
 		return 0;
 
-	if (check_direct_IO(inode, iter, offset))
-		return 0;
+	err = check_direct_IO(inode, iter, offset);
+	if (err)
+		return err;
 
 	trace_f2fs_direct_IO_enter(inode, offset, count, iov_iter_rw(iter));
 
-- 
cgit v1.2.3


From decd36b6c43a1051bab97571cf4c0ec8450268b0 Mon Sep 17 00:00:00 2001
From: Chao Yu <chao2.yu@samsung.com>
Date: Fri, 7 Aug 2015 18:42:09 +0800
Subject: f2fs: remove inmem radix tree

Previously, we used a radix tree to index all registered page entries for
an atomic file, but now we only use the radix tree to see whether the
current page is indexed or not, since the other user of the radix tree is
gone in commit
042b7816aaeb ("f2fs: remove unnecessary call to invalidate inmemory pages").

So in this patch, we switch to a more efficient way:
introduce a macro ATOMIC_WRITTEN_PAGE, and set it as the page private
value to indicate the page's indexing status. This way, we can save
memory and lookup time.

Signed-off-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/data.c    | 20 ++++++++++++++++++--
 fs/f2fs/f2fs.h    |  1 -
 fs/f2fs/segment.c | 25 +++++++++----------------
 fs/f2fs/segment.h |  9 +++++++++
 fs/f2fs/super.c   |  1 -
 5 files changed, 36 insertions(+), 20 deletions(-)

(limited to 'fs/f2fs')

diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 7ea8eda8f137..cad9ebe45692 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -1558,6 +1558,11 @@ void f2fs_invalidate_page(struct page *page, unsigned int offset,
 		else
 			inode_dec_dirty_pages(inode);
 	}
+
+	/* This is atomic written page, keep Private */
+	if (IS_ATOMIC_WRITTEN_PAGE(page))
+		return;
+
 	ClearPagePrivate(page);
 }
 
@@ -1567,6 +1572,10 @@ int f2fs_release_page(struct page *page, gfp_t wait)
 	if (PageDirty(page))
 		return 0;
 
+	/* This is atomic written page, keep Private */
+	if (IS_ATOMIC_WRITTEN_PAGE(page))
+		return 0;
+
 	ClearPagePrivate(page);
 	return 1;
 }
@@ -1581,8 +1590,15 @@ static int f2fs_set_data_page_dirty(struct page *page)
 	SetPageUptodate(page);
 
 	if (f2fs_is_atomic_file(inode)) {
-		register_inmem_page(inode, page);
-		return 1;
+		if (!IS_ATOMIC_WRITTEN_PAGE(page)) {
+			register_inmem_page(inode, page);
+			return 1;
+		}
+		/*
+		 * Previously, this page has been registered, we just
+		 * return here.
+		 */
+		return 0;
 	}
 
 	if (!PageDirty(page)) {
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 09cb365a07cc..38847942edeb 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -424,7 +424,6 @@ struct f2fs_inode_info {
 	unsigned long long xattr_ver;	/* cp version of xattr modification */
 	struct inode_entry *dirty_dir;	/* the pointer of dirty dir */
 
-	struct radix_tree_root inmem_root;	/* radix tree for inmem pages */
 	struct list_head inmem_pages;	/* inmemory pages managed by f2fs */
 	struct mutex inmem_lock;	/* lock for inmemory pages */
 
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 1f1200487c44..7d53cb44c617 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -197,28 +197,20 @@ void register_inmem_page(struct inode *inode, struct page *page)
 {
 	struct f2fs_inode_info *fi = F2FS_I(inode);
 	struct inmem_pages *new;
-	int err;
 
-	SetPagePrivate(page);
 	f2fs_trace_pid(page);
 
+	set_page_private(page, (unsigned long)ATOMIC_WRITTEN_PAGE);
+	SetPagePrivate(page);
+
 	new = f2fs_kmem_cache_alloc(inmem_entry_slab, GFP_NOFS);
 
 	/* add atomic page indices to the list */
 	new->page = page;
 	INIT_LIST_HEAD(&new->list);
-retry:
+
 	/* increase reference count with clean state */
 	mutex_lock(&fi->inmem_lock);
-	err = radix_tree_insert(&fi->inmem_root, page->index, new);
-	if (err == -EEXIST) {
-		mutex_unlock(&fi->inmem_lock);
-		kmem_cache_free(inmem_entry_slab, new);
-		return;
-	} else if (err) {
-		mutex_unlock(&fi->inmem_lock);
-		goto retry;
-	}
 	get_page(page);
 	list_add_tail(&new->list, &fi->inmem_pages);
 	inc_page_count(F2FS_I_SB(inode), F2FS_INMEM_PAGES);
@@ -255,8 +247,8 @@ int commit_inmem_pages(struct inode *inode, bool abort)
 
 	mutex_lock(&fi->inmem_lock);
 	list_for_each_entry_safe(cur, tmp, &fi->inmem_pages, list) {
+		lock_page(cur->page);
 		if (!abort) {
-			lock_page(cur->page);
 			if (cur->page->mapping == inode->i_mapping) {
 				set_page_dirty(cur->page);
 				f2fs_wait_on_page_writeback(cur->page, DATA);
@@ -271,12 +263,13 @@ int commit_inmem_pages(struct inode *inode, bool abort)
 					break;
 				}
 			}
-			f2fs_put_page(cur->page, 1);
 		} else {
 			trace_f2fs_commit_inmem_page(cur->page, INMEM_DROP);
-			put_page(cur->page);
 		}
-		radix_tree_delete(&fi->inmem_root, cur->page->index);
+		set_page_private(cur->page, 0);
+		ClearPagePrivate(cur->page);
+		f2fs_put_page(cur->page, 1);
+
 		list_del(&cur->list);
 		kmem_cache_free(inmem_entry_slab, cur);
 		dec_page_count(F2FS_I_SB(inode), F2FS_INMEM_PAGES);
diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
index 230f9cd9fa2a..d0bd952b7065 100644
--- a/fs/f2fs/segment.h
+++ b/fs/f2fs/segment.h
@@ -177,6 +177,15 @@ struct segment_allocation {
 	void (*allocate_segment)(struct f2fs_sb_info *, int, bool);
 };
 
+/*
+ * this value is set in page as a private data which indicate that
+ * the page is atomically written, and it is in inmem_pages list.
+ */
+#define ATOMIC_WRITTEN_PAGE		0x0000ffff
+
+#define IS_ATOMIC_WRITTEN_PAGE(page)			\
+		(page_private(page) == (unsigned long)ATOMIC_WRITTEN_PAGE)
+
 struct inmem_pages {
 	struct list_head list;
 	struct page *page;
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 12eb69dd38af..a79b6b5a4eeb 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -423,7 +423,6 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb)
 	fi->i_current_depth = 1;
 	fi->i_advise = 0;
 	init_rwsem(&fi->i_sem);
-	INIT_RADIX_TREE(&fi->inmem_root, GFP_NOFS);
 	INIT_LIST_HEAD(&fi->inmem_pages);
 	mutex_init(&fi->inmem_lock);
 
-- 
cgit v1.2.3


From 47e70ca46f9074efe6573263c0de5bef0af829de Mon Sep 17 00:00:00 2001
From: Jaegeuk Kim <jaegeuk@kernel.org>
Date: Tue, 11 Aug 2015 10:17:27 -0700
Subject: f2fs: do not assign a new segment for dio under space shortage

If there are not enough free segments, we should not assign a new segment
explicitly. Otherwise, we can run out of free segments.

Reviewed-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/segment.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs/f2fs')

diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 7d53cb44c617..bf1605dbce93 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -1225,7 +1225,8 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
 	mutex_lock(&sit_i->sentry_lock);
 
 	/* direct_io'ed data is aligned to the segment for better performance */
-	if (direct_io && curseg->next_blkoff)
+	if (direct_io && curseg->next_blkoff &&
+				!has_not_enough_free_secs(sbi, 0))
 		__allocate_new_segments(sbi, type);
 
 	*new_blkaddr = NEXT_FREE_BLKADDR(sbi, curseg);
-- 
cgit v1.2.3


From 8c14bfadeac2a01b305ef4434907295b81b58db2 Mon Sep 17 00:00:00 2001
From: Chao Yu <chao2.yu@samsung.com>
Date: Fri, 7 Aug 2015 17:58:43 +0800
Subject: f2fs: handle error of f2fs_iget correctly

In recover_orphan_inode, whenever f2fs_iget fails, we make the kernel panic,
but that is not reasonable, because f2fs_iget can fail for many reasons,
including running out of memory.

So we change error handling method as below:
a) when finding no entry for the orphan inode, bug_on for catching bugs;
b) for other reasons, report it to caller.

Signed-off-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/checkpoint.c | 31 ++++++++++++++++++++++++-------
 fs/f2fs/f2fs.h       |  2 +-
 fs/f2fs/super.c      |  4 +++-
 3 files changed, 28 insertions(+), 9 deletions(-)

(limited to 'fs/f2fs')

diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index c3111769d382..0958c8399d8e 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -468,22 +468,34 @@ void remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
 	__remove_ino_entry(sbi, ino, ORPHAN_INO);
 }
 
-static void recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
+static int recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
 {
-	struct inode *inode = f2fs_iget(sbi->sb, ino);
-	f2fs_bug_on(sbi, IS_ERR(inode));
+	struct inode *inode;
+
+	inode = f2fs_iget(sbi->sb, ino);
+	if (IS_ERR(inode)) {
+		/*
+		 * there should be a bug that we can't find the entry
+		 * to orphan inode.
+		 */
+		f2fs_bug_on(sbi, PTR_ERR(inode) == -ENOENT);
+		return PTR_ERR(inode);
+	}
+
 	clear_nlink(inode);
 
 	/* truncate all the data during iput */
 	iput(inode);
+	return 0;
 }
 
-void recover_orphan_inodes(struct f2fs_sb_info *sbi)
+int recover_orphan_inodes(struct f2fs_sb_info *sbi)
 {
 	block_t start_blk, orphan_blocks, i, j;
+	int err;
 
 	if (!is_set_ckpt_flags(F2FS_CKPT(sbi), CP_ORPHAN_PRESENT_FLAG))
-		return;
+		return 0;
 
 	set_sbi_flag(sbi, SBI_POR_DOING);
 
@@ -499,14 +511,19 @@ void recover_orphan_inodes(struct f2fs_sb_info *sbi)
 		orphan_blk = (struct f2fs_orphan_block *)page_address(page);
 		for (j = 0; j < le32_to_cpu(orphan_blk->entry_count); j++) {
 			nid_t ino = le32_to_cpu(orphan_blk->ino[j]);
-			recover_orphan_inode(sbi, ino);
+			err = recover_orphan_inode(sbi, ino);
+			if (err) {
+				f2fs_put_page(page, 1);
+				clear_sbi_flag(sbi, SBI_POR_DOING);
+				return err;
+			}
 		}
 		f2fs_put_page(page, 1);
 	}
 	/* clear Orphan Flag */
 	clear_ckpt_flags(F2FS_CKPT(sbi), CP_ORPHAN_PRESENT_FLAG);
 	clear_sbi_flag(sbi, SBI_POR_DOING);
-	return;
+	return 0;
 }
 
 static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t start_blk)
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 38847942edeb..cc07b1595a92 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -1748,7 +1748,7 @@ int acquire_orphan_inode(struct f2fs_sb_info *);
 void release_orphan_inode(struct f2fs_sb_info *);
 void add_orphan_inode(struct f2fs_sb_info *, nid_t);
 void remove_orphan_inode(struct f2fs_sb_info *, nid_t);
-void recover_orphan_inodes(struct f2fs_sb_info *);
+int recover_orphan_inodes(struct f2fs_sb_info *);
 int get_valid_checkpoint(struct f2fs_sb_info *);
 void update_dirty_page(struct inode *, struct page *);
 void add_dirty_dir_inode(struct inode *);
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index a79b6b5a4eeb..4db5cd9fb4b9 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -1245,7 +1245,9 @@ try_onemore:
 	f2fs_join_shrinker(sbi);
 
 	/* if there are nt orphan nodes free them */
-	recover_orphan_inodes(sbi);
+	err = recover_orphan_inodes(sbi);
+	if (err)
+		goto free_node_inode;
 
 	/* read root inode and dentry */
 	root = f2fs_iget(sb, F2FS_ROOT_INO(sbi));
-- 
cgit v1.2.3


From 4c278394b0feb7aadc538be12ab0474b106a7255 Mon Sep 17 00:00:00 2001
From: Jaegeuk Kim <jaegeuk@kernel.org>
Date: Tue, 11 Aug 2015 16:01:30 -0700
Subject: f2fs: avoid a build warning

If F2FS_CHECK_FS is turned off, we can get a build warning for unused variable.

Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/segment.h | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

(limited to 'fs/f2fs')

diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
index d0bd952b7065..b6e4ed15c698 100644
--- a/fs/f2fs/segment.h
+++ b/fs/f2fs/segment.h
@@ -581,15 +581,11 @@ static inline void verify_block_addr(struct f2fs_sb_info *sbi, block_t blk_addr)
 static inline void check_block_count(struct f2fs_sb_info *sbi,
 		int segno, struct f2fs_sit_entry *raw_sit)
 {
+#ifdef CONFIG_F2FS_CHECK_FS
 	bool is_valid  = test_bit_le(0, raw_sit->valid_map) ? true : false;
 	int valid_blocks = 0;
 	int cur_pos = 0, next_pos;
 
-	/* check segment usage, and check boundary of a given segment number */
-	f2fs_bug_on(sbi, GET_SIT_VBLOCKS(raw_sit) > sbi->blocks_per_seg
-					|| segno > TOTAL_SEGS(sbi) - 1);
-
-#ifdef CONFIG_F2FS_CHECK_FS
 	/* check bitmap with valid block count */
 	do {
 		if (is_valid) {
@@ -606,6 +602,9 @@ static inline void check_block_count(struct f2fs_sb_info *sbi,
 	} while (cur_pos < sbi->blocks_per_seg);
 	BUG_ON(GET_SIT_VBLOCKS(raw_sit) != valid_blocks);
 #endif
+	/* check segment usage, and check boundary of a given segment number */
+	f2fs_bug_on(sbi, GET_SIT_VBLOCKS(raw_sit) > sbi->blocks_per_seg
+					|| segno > TOTAL_SEGS(sbi) - 1);
 }
 
 static inline pgoff_t current_sit_addr(struct f2fs_sb_info *sbi,
-- 
cgit v1.2.3


From 315df8398e36360c0be62e6fdd3f2708fc3a2567 Mon Sep 17 00:00:00 2001
From: Jaegeuk Kim <jaegeuk@kernel.org>
Date: Tue, 11 Aug 2015 12:45:39 -0700
Subject: f2fs: do not write any node pages related to orphan inodes

We should not write node pages when deleting orphan inodes.
In order to do that, we can easily set the POR_DOING flag earlier, before
entering the orphan inode routine.

Reviewed-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/checkpoint.c | 4 ----
 fs/f2fs/recovery.c   | 4 +---
 fs/f2fs/super.c      | 6 +++++-
 3 files changed, 6 insertions(+), 8 deletions(-)

(limited to 'fs/f2fs')

diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index 0958c8399d8e..890e4d4c39d7 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -497,8 +497,6 @@ int recover_orphan_inodes(struct f2fs_sb_info *sbi)
 	if (!is_set_ckpt_flags(F2FS_CKPT(sbi), CP_ORPHAN_PRESENT_FLAG))
 		return 0;
 
-	set_sbi_flag(sbi, SBI_POR_DOING);
-
 	start_blk = __start_cp_addr(sbi) + 1 + __cp_payload(sbi);
 	orphan_blocks = __start_sum_addr(sbi) - 1 - __cp_payload(sbi);
 
@@ -514,7 +512,6 @@ int recover_orphan_inodes(struct f2fs_sb_info *sbi)
 			err = recover_orphan_inode(sbi, ino);
 			if (err) {
 				f2fs_put_page(page, 1);
-				clear_sbi_flag(sbi, SBI_POR_DOING);
 				return err;
 			}
 		}
@@ -522,7 +519,6 @@ int recover_orphan_inodes(struct f2fs_sb_info *sbi)
 	}
 	/* clear Orphan Flag */
 	clear_ckpt_flags(F2FS_CKPT(sbi), CP_ORPHAN_PRESENT_FLAG);
-	clear_sbi_flag(sbi, SBI_POR_DOING);
 	return 0;
 }
 
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index d2ef0c9f53e7..faec2ca004b9 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -545,14 +545,12 @@ int recover_fsync_data(struct f2fs_sb_info *sbi)
 
 	INIT_LIST_HEAD(&inode_list);
 
-	/* step #1: find fsynced inode numbers */
-	set_sbi_flag(sbi, SBI_POR_DOING);
-
 	/* prevent checkpoint */
 	mutex_lock(&sbi->cp_mutex);
 
 	blkaddr = NEXT_FREE_BLKADDR(sbi, curseg);
 
+	/* step #1: find fsynced inode numbers */
 	err = find_fsync_dnodes(sbi, &inode_list);
 	if (err)
 		goto out;
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 4db5cd9fb4b9..cfe3f9579934 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -1165,7 +1165,9 @@ try_onemore:
 	mutex_init(&sbi->writepages);
 	mutex_init(&sbi->cp_mutex);
 	init_rwsem(&sbi->node_write);
-	clear_sbi_flag(sbi, SBI_POR_DOING);
+
+	/* disallow all the data/node/meta page writes */
+	set_sbi_flag(sbi, SBI_POR_DOING);
 	spin_lock_init(&sbi->stat_lock);
 
 	init_rwsem(&sbi->read_io.io_rwsem);
@@ -1309,6 +1311,8 @@ try_onemore:
 			goto free_kobj;
 		}
 	}
+	/* recover_fsync_data() cleared this already */
+	clear_sbi_flag(sbi, SBI_POR_DOING);
 
 	/*
 	 * If filesystem is not mounted as read-only then
-- 
cgit v1.2.3


From 206e61be29624499af46546076e835da93e6bde5 Mon Sep 17 00:00:00 2001
From: Chao Yu <chao2.yu@samsung.com>
Date: Wed, 12 Aug 2015 17:48:21 +0800
Subject: f2fs: avoid clear valid page

In f2fs_delete_entry, if the last dirent is removed from the dentry page,
we will try to punch that page since it has no valid data in it.

But truncate_hole, which is used for punching, could fail because of
no memory or an IO error. If that happens, we'd better skip clearing
this valid dentry page.

Signed-off-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/dir.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs/f2fs')

diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
index a34ebd8312ab..8f15fc134040 100644
--- a/fs/f2fs/dir.c
+++ b/fs/f2fs/dir.c
@@ -718,8 +718,8 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
 	if (inode)
 		f2fs_drop_nlink(dir, inode, NULL);
 
-	if (bit_pos == NR_DENTRY_IN_BLOCK) {
-		truncate_hole(dir, page->index, page->index + 1);
+	if (bit_pos == NR_DENTRY_IN_BLOCK &&
+			!truncate_hole(dir, page->index, page->index + 1)) {
 		clear_page_dirty_for_io(page);
 		ClearPagePrivate(page);
 		ClearPageUptodate(page);
-- 
cgit v1.2.3


From 31696580bf4c042a0f7b06d855e04441488d18b1 Mon Sep 17 00:00:00 2001
From: Chao Yu <chao2.yu@samsung.com>
Date: Tue, 28 Jul 2015 18:33:46 +0800
Subject: f2fs: shrink free_nids entries

This patch introduces __count_free_nids/try_to_free_nids and registers
them in slab shrinker for shrinking under memory pressure.

Signed-off-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/f2fs.h     |  1 +
 fs/f2fs/node.c     | 28 ++++++++++++++++++++++++++++
 fs/f2fs/segment.c  |  3 +++
 fs/f2fs/shrinker.c | 14 ++++++++++++++
 4 files changed, 46 insertions(+)

(limited to 'fs/f2fs')

diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index cc07b1595a92..23bfc0ccaf10 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -1681,6 +1681,7 @@ int sync_node_pages(struct f2fs_sb_info *, nid_t, struct writeback_control *);
 bool alloc_nid(struct f2fs_sb_info *, nid_t *);
 void alloc_nid_done(struct f2fs_sb_info *, nid_t);
 void alloc_nid_failed(struct f2fs_sb_info *, nid_t);
+int try_to_free_nids(struct f2fs_sb_info *, int);
 void recover_inline_xattr(struct inode *, struct page *);
 void recover_xattr_data(struct inode *, struct page *, block_t);
 int recover_inode_page(struct f2fs_sb_info *, struct page *);
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index ac9110788b17..6e10c2a08ec6 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -1635,6 +1635,34 @@ void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid)
 		kmem_cache_free(free_nid_slab, i);
 }
 
+int try_to_free_nids(struct f2fs_sb_info *sbi, int nr_shrink)
+{
+	struct f2fs_nm_info *nm_i = NM_I(sbi);
+	struct free_nid *i, *next;
+	int nr = nr_shrink;
+
+	if (!mutex_trylock(&nm_i->build_lock))
+		return 0;
+
+	spin_lock(&nm_i->free_nid_list_lock);
+	list_for_each_entry_safe(i, next, &nm_i->free_nid_list, list) {
+		if (nr_shrink <= 0 || nm_i->fcnt <= NAT_ENTRY_PER_BLOCK)
+			break;
+		if (i->state == NID_ALLOC)
+			continue;
+		__del_from_free_nid_list(nm_i, i);
+		nm_i->fcnt--;
+		spin_unlock(&nm_i->free_nid_list_lock);
+		kmem_cache_free(free_nid_slab, i);
+		nr_shrink--;
+		spin_lock(&nm_i->free_nid_list_lock);
+	}
+	spin_unlock(&nm_i->free_nid_list_lock);
+	mutex_unlock(&nm_i->build_lock);
+
+	return nr - nr_shrink;
+}
+
 void recover_inline_xattr(struct inode *inode, struct page *page)
 {
 	void *src_addr, *dst_addr;
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index bf1605dbce93..1b4265639f07 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -310,6 +310,9 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi)
 	if (!available_free_memory(sbi, NAT_ENTRIES))
 		try_to_free_nats(sbi, NAT_ENTRY_PER_BLOCK);
 
+	if (!available_free_memory(sbi, FREE_NIDS))
+		try_to_free_nids(sbi, NAT_ENTRY_PER_BLOCK * FREE_NID_PAGES);
+
 	/* checkpoint is the only way to shrink partial cached entries */
 	if (!available_free_memory(sbi, NAT_ENTRIES) ||
 			excess_prefree_segs(sbi) ||
diff --git a/fs/f2fs/shrinker.c b/fs/f2fs/shrinker.c
index 9aa4235cd304..da0d8e0b55a5 100644
--- a/fs/f2fs/shrinker.c
+++ b/fs/f2fs/shrinker.c
@@ -23,6 +23,13 @@ static unsigned long __count_nat_entries(struct f2fs_sb_info *sbi)
 	return NM_I(sbi)->nat_cnt - NM_I(sbi)->dirty_nat_cnt;
 }
 
+static unsigned long __count_free_nids(struct f2fs_sb_info *sbi)
+{
+	if (NM_I(sbi)->fcnt > NAT_ENTRY_PER_BLOCK)
+		return NM_I(sbi)->fcnt - NAT_ENTRY_PER_BLOCK;
+	return 0;
+}
+
 static unsigned long __count_extent_cache(struct f2fs_sb_info *sbi)
 {
 	return sbi->total_ext_tree + atomic_read(&sbi->total_ext_node);
@@ -53,6 +60,9 @@ unsigned long f2fs_shrink_count(struct shrinker *shrink,
 		/* shrink clean nat cache entries */
 		count += __count_nat_entries(sbi);
 
+		/* count free nids cache entries */
+		count += __count_free_nids(sbi);
+
 		spin_lock(&f2fs_list_lock);
 		p = p->next;
 		mutex_unlock(&sbi->umount_mutex);
@@ -97,6 +107,10 @@ unsigned long f2fs_shrink_scan(struct shrinker *shrink,
 		if (freed < nr)
 			freed += try_to_free_nats(sbi, nr - freed);
 
+		/* shrink free nids cache entries */
+		if (freed < nr)
+			freed += try_to_free_nids(sbi, nr - freed);
+
 		spin_lock(&f2fs_list_lock);
 		p = p->next;
 		list_move_tail(&sbi->s_list, &f2fs_list);
-- 
cgit v1.2.3


From 798c1b16d1a6171587ff46c74ede8092e66f72f7 Mon Sep 17 00:00:00 2001
From: Jaegeuk Kim <jaegeuk@kernel.org>
Date: Tue, 11 Aug 2015 21:59:49 -0700
Subject: f2fs: skip checkpoint if there is no dirty and prefree segments

We should avoid needless checkpoints when there are no dirty or prefree segments.

Reviewed-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/gc.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'fs/f2fs')

diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index fcb263af58b3..81de28d8326f 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -792,7 +792,8 @@ static void do_garbage_collect(struct f2fs_sb_info *sbi, unsigned int segno,
 
 int f2fs_gc(struct f2fs_sb_info *sbi)
 {
-	unsigned int segno, i;
+	unsigned int segno = NULL_SEGNO;
+	unsigned int i;
 	int gc_type = BG_GC;
 	int nfree = 0;
 	int ret = -1;
@@ -811,10 +812,11 @@ gc_more:
 
 	if (gc_type == BG_GC && has_not_enough_free_secs(sbi, nfree)) {
 		gc_type = FG_GC;
-		write_checkpoint(sbi, &cpc);
+		if (__get_victim(sbi, &segno, gc_type) || prefree_segments(sbi))
+			write_checkpoint(sbi, &cpc);
 	}
 
-	if (!__get_victim(sbi, &segno, gc_type))
+	if (segno == NULL_SEGNO && !__get_victim(sbi, &segno, gc_type))
 		goto stop;
 	ret = 0;
 
-- 
cgit v1.2.3


From a6db67f06fd9f6b1ddb11bcf4d7e8e8a86908d01 Mon Sep 17 00:00:00 2001
From: Jaegeuk Kim <jaegeuk@kernel.org>
Date: Mon, 10 Aug 2015 15:01:12 -0700
Subject: f2fs: increase the number of max hard links

This patch increases the number of maximum hard links for one file.

Reviewed-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/f2fs.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs/f2fs')

diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 23bfc0ccaf10..830848836da5 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -321,7 +321,7 @@ enum {
 					 */
 };
 
-#define F2FS_LINK_MAX		32000	/* maximum link count per file */
+#define F2FS_LINK_MAX	0xffffffff	/* maximum link count per file */
 
 #define MAX_DIR_RA_PAGES	4	/* maximum ra pages of dir */
 
-- 
cgit v1.2.3


From 740432f835608d11b5386321ab5aa8f61e07fb27 Mon Sep 17 00:00:00 2001
From: Jaegeuk Kim <jaegeuk@kernel.org>
Date: Fri, 14 Aug 2015 11:43:56 -0700
Subject: f2fs: handle failed bio allocation

As the comment on bio_alloc_bioset quoted below explains, f2fs can allocate
multiple bios at the same time, so we can't guarantee that a bio is always
allocated.

"
 *   When @bs is not NULL, if %__GFP_WAIT is set then bio_alloc will always be
 *   able to allocate a bio. This is due to the mempool guarantees. To make this
 *   work, callers must never allocate more than 1 bio at a time from this pool.
 *   Callers that need to allocate more than 1 bio must always submit the
 *   previously allocated bio for IO before attempting to allocate a new one.
 *   Failure to do so can cause deadlocks under memory pressure.
"

Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/data.c    |  3 +--
 fs/f2fs/f2fs.h    | 15 +++++++++++++++
 fs/f2fs/segment.c | 15 ++++++++++++---
 3 files changed, 28 insertions(+), 5 deletions(-)

(limited to 'fs/f2fs')

diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index cad9ebe45692..726e58b76295 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -90,8 +90,7 @@ static struct bio *__bio_alloc(struct f2fs_sb_info *sbi, block_t blk_addr,
 {
 	struct bio *bio;
 
-	/* No failure on bio allocation */
-	bio = bio_alloc(GFP_NOIO, npages);
+	bio = f2fs_bio_alloc(npages);
 
 	bio->bi_bdev = sbi->sb->s_bdev;
 	bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(blk_addr);
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 830848836da5..00591f725744 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -19,6 +19,7 @@
 #include <linux/magic.h>
 #include <linux/kobject.h>
 #include <linux/sched.h>
+#include <linux/bio.h>
 
 #ifdef CONFIG_F2FS_CHECK_FS
 #define f2fs_bug_on(sbi, condition)	BUG_ON(condition)
@@ -1253,6 +1254,20 @@ retry:
 	return entry;
 }
 
+static inline struct bio *f2fs_bio_alloc(int npages)
+{
+	struct bio *bio;
+
+	/* No failure on bio allocation */
+retry:
+	bio = bio_alloc(GFP_NOIO, npages);
+	if (!bio) {
+		cond_resched();
+		goto retry;
+	}
+	return bio;
+}
+
 static inline void f2fs_radix_tree_insert(struct radix_tree_root *root,
 				unsigned long index, void *item)
 {
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 1b4265639f07..6273e2cde93e 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -330,10 +330,12 @@ repeat:
 		return 0;
 
 	if (!llist_empty(&fcc->issue_list)) {
-		struct bio *bio = bio_alloc(GFP_NOIO, 0);
+		struct bio *bio;
 		struct flush_cmd *cmd, *next;
 		int ret;
 
+		bio = f2fs_bio_alloc(0);
+
 		fcc->dispatch_list = llist_del_all(&fcc->issue_list);
 		fcc->dispatch_list = llist_reverse_order(fcc->dispatch_list);
 
@@ -365,8 +367,15 @@ int f2fs_issue_flush(struct f2fs_sb_info *sbi)
 	if (test_opt(sbi, NOBARRIER))
 		return 0;
 
-	if (!test_opt(sbi, FLUSH_MERGE))
-		return blkdev_issue_flush(sbi->sb->s_bdev, GFP_KERNEL, NULL);
+	if (!test_opt(sbi, FLUSH_MERGE)) {
+		struct bio *bio = f2fs_bio_alloc(0);
+		int ret;
+
+		bio->bi_bdev = sbi->sb->s_bdev;
+		ret = submit_bio_wait(WRITE_FLUSH, bio);
+		bio_put(bio);
+		return ret;
+	}
 
 	init_completion(&cmd.wait);
 
-- 
cgit v1.2.3


From 26d5859974bb817f7615be90199a8e82e3f0a0ed Mon Sep 17 00:00:00 2001
From: Jaegeuk Kim <jaegeuk@kernel.org>
Date: Fri, 14 Aug 2015 14:37:50 -0700
Subject: f2fs: avoid garbage collecting already moved node blocks

If node blocks were already moved, we don't need to move them again.

Reviewed-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/gc.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'fs/f2fs')

diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index 81de28d8326f..0a5d573e2574 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -396,14 +396,18 @@ static void gc_node_segment(struct f2fs_sb_info *sbi,
 {
 	bool initial = true;
 	struct f2fs_summary *entry;
+	block_t start_addr;
 	int off;
 
+	start_addr = START_BLOCK(sbi, segno);
+
 next_step:
 	entry = sum;
 
 	for (off = 0; off < sbi->blocks_per_seg; off++, entry++) {
 		nid_t nid = le32_to_cpu(entry->nid);
 		struct page *node_page;
+		struct node_info ni;
 
 		/* stop BG_GC if there is not enough free sections. */
 		if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0))
@@ -426,6 +430,12 @@ next_step:
 			continue;
 		}
 
+		get_node_info(sbi, nid, &ni);
+		if (ni.blk_addr != start_addr + off) {
+			f2fs_put_page(node_page, 1);
+			continue;
+		}
+
 		/* set page dirty and write it */
 		if (gc_type == FG_GC) {
 			f2fs_wait_on_page_writeback(node_page, NODE);
-- 
cgit v1.2.3


From 268344664603706b6f156548f9d7482665222f87 Mon Sep 17 00:00:00 2001
From: Jaegeuk Kim <jaegeuk@kernel.org>
Date: Fri, 14 Aug 2015 17:57:29 -0700
Subject: f2fs: reuse nids more aggressively

If we reuse as many nids as possible, we can mitigate producing obsolete
node pages in the page cache.

Reviewed-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/node.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'fs/f2fs')

diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index 6e10c2a08ec6..3cc32b8f8204 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -306,6 +306,10 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni,
 	if (nat_get_blkaddr(e) != NEW_ADDR && new_blkaddr == NULL_ADDR) {
 		unsigned char version = nat_get_version(e);
 		nat_set_version(e, inc_node_version(version));
+
+		/* in order to reuse the nid */
+		if (nm_i->next_scan_nid > ni->nid)
+			nm_i->next_scan_nid = ni->nid;
 	}
 
 	/* change address */
-- 
cgit v1.2.3


From 2286c0205d1478d4bece6e733cbaf15535fba09d Mon Sep 17 00:00:00 2001
From: Jaegeuk Kim <jaegeuk@kernel.org>
Date: Sat, 15 Aug 2015 21:51:05 -0700
Subject: f2fs: fix to cover lock_op for update_inode_page

Previously, update_inode_page was not called under f2fs_lock_op.
Instead, we should call it via f2fs_write_inode.

Reviewed-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/file.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs/f2fs')

diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 016ed3ba2ca4..7faafb5043e0 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -206,8 +206,8 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 	}
 
 	/* if the inode is dirty, let's recover all the time */
-	if (!datasync && is_inode_flag_set(fi, FI_DIRTY_INODE)) {
-		update_inode_page(inode);
+	if (!datasync) {
+		f2fs_write_inode(inode, NULL);
 		goto go_write;
 	}
 
-- 
cgit v1.2.3


From 5ee5293c3290a8e710d75977418f954e62c3dfdf Mon Sep 17 00:00:00 2001
From: Jaegeuk Kim <jaegeuk@kernel.org>
Date: Sat, 15 Aug 2015 22:06:08 -0700
Subject: f2fs: retry gc if one section is not successfully reclaimed

If FG_GC failed to reclaim one section, let's retry with another section
from the start, since we can get another good candidate.

Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/gc.c | 46 ++++++++++++++++++++--------------------------
 1 file changed, 20 insertions(+), 26 deletions(-)

(limited to 'fs/f2fs')

diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index 0a5d573e2574..782b8e72c094 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -391,7 +391,7 @@ static int check_valid_map(struct f2fs_sb_info *sbi,
  * On validity, copy that node with cold status, otherwise (invalid node)
  * ignore that.
  */
-static void gc_node_segment(struct f2fs_sb_info *sbi,
+static int gc_node_segment(struct f2fs_sb_info *sbi,
 		struct f2fs_summary *sum, unsigned int segno, int gc_type)
 {
 	bool initial = true;
@@ -411,7 +411,7 @@ next_step:
 
 		/* stop BG_GC if there is not enough free sections. */
 		if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0))
-			return;
+			return 0;
 
 		if (check_valid_map(sbi, segno, off) == 0)
 			continue;
@@ -461,13 +461,11 @@ next_step:
 		};
 		sync_node_pages(sbi, 0, &wbc);
 
-		/*
-		 * In the case of FG_GC, it'd be better to reclaim this victim
-		 * completely.
-		 */
-		if (get_valid_blocks(sbi, segno, 1) != 0)
-			goto next_step;
+		/* return 1 only if FG_GC succefully reclaimed one */
+		if (get_valid_blocks(sbi, segno, 1) == 0)
+			return 1;
 	}
+	return 0;
 }
 
 /*
@@ -649,7 +647,7 @@ out:
  * If the parent node is not valid or the data block address is different,
  * the victim data block is ignored.
  */
-static void gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
+static int gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
 		struct gc_inode_list *gc_list, unsigned int segno, int gc_type)
 {
 	struct super_block *sb = sbi->sb;
@@ -672,7 +670,7 @@ next_step:
 
 		/* stop BG_GC if there is not enough free sections. */
 		if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0))
-			return;
+			return 0;
 
 		if (check_valid_map(sbi, segno, off) == 0)
 			continue;
@@ -737,15 +735,11 @@ next_step:
 	if (gc_type == FG_GC) {
 		f2fs_submit_merged_bio(sbi, DATA, WRITE);
 
-		/*
-		 * In the case of FG_GC, it'd be better to reclaim this victim
-		 * completely.
-		 */
-		if (get_valid_blocks(sbi, segno, 1) != 0) {
-			phase = 2;
-			goto next_step;
-		}
+		/* return 1 only if FG_GC succefully reclaimed one */
+		if (get_valid_blocks(sbi, segno, 1) == 0)
+			return 1;
 	}
+	return 0;
 }
 
 static int __get_victim(struct f2fs_sb_info *sbi, unsigned int *victim,
@@ -761,12 +755,13 @@ static int __get_victim(struct f2fs_sb_info *sbi, unsigned int *victim,
 	return ret;
 }
 
-static void do_garbage_collect(struct f2fs_sb_info *sbi, unsigned int segno,
+static int do_garbage_collect(struct f2fs_sb_info *sbi, unsigned int segno,
 				struct gc_inode_list *gc_list, int gc_type)
 {
 	struct page *sum_page;
 	struct f2fs_summary_block *sum;
 	struct blk_plug plug;
+	int nfree = 0;
 
 	/* read segment summary of victim */
 	sum_page = get_sum_page(sbi, segno);
@@ -786,10 +781,11 @@ static void do_garbage_collect(struct f2fs_sb_info *sbi, unsigned int segno,
 
 	switch (GET_SUM_TYPE((&sum->footer))) {
 	case SUM_TYPE_NODE:
-		gc_node_segment(sbi, sum->entries, segno, gc_type);
+		nfree = gc_node_segment(sbi, sum->entries, segno, gc_type);
 		break;
 	case SUM_TYPE_DATA:
-		gc_data_segment(sbi, sum->entries, gc_list, segno, gc_type);
+		nfree = gc_data_segment(sbi, sum->entries, gc_list,
+							segno, gc_type);
 		break;
 	}
 	blk_finish_plug(&plug);
@@ -798,6 +794,7 @@ static void do_garbage_collect(struct f2fs_sb_info *sbi, unsigned int segno,
 	stat_inc_call_count(sbi->stat_info);
 
 	f2fs_put_page(sum_page, 0);
+	return nfree;
 }
 
 int f2fs_gc(struct f2fs_sb_info *sbi)
@@ -836,13 +833,10 @@ gc_more:
 								META_SSA);
 
 	for (i = 0; i < sbi->segs_per_sec; i++)
-		do_garbage_collect(sbi, segno + i, &gc_list, gc_type);
+		nfree += do_garbage_collect(sbi, segno + i, &gc_list, gc_type);
 
-	if (gc_type == FG_GC) {
+	if (gc_type == FG_GC)
 		sbi->cur_victim_sec = NULL_SEGNO;
-		nfree++;
-		WARN_ON(get_valid_blocks(sbi, segno, sbi->segs_per_sec));
-	}
 
 	if (has_not_enough_free_secs(sbi, nfree))
 		goto gc_more;
-- 
cgit v1.2.3


From a21c20f0c812925085204fced932ac95f2a76bf0 Mon Sep 17 00:00:00 2001
From: Jaegeuk Kim <jaegeuk@kernel.org>
Date: Sun, 16 Aug 2015 12:38:15 -0700
Subject: f2fs: go out for insert_inode_locked failure

We should not call unlock_new_inode when insert_inode_locked failed.

Reviewed-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/namei.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'fs/f2fs')

diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index 97e97c41b979..a680bf38e4f0 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -53,7 +53,7 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode)
 	if (err) {
 		err = -EINVAL;
 		nid_free = true;
-		goto out;
+		goto fail;
 	}
 
 	/* If the directory encrypted, then we should encrypt the inode. */
@@ -75,9 +75,6 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode)
 	mark_inode_dirty(inode);
 	return inode;
 
-out:
-	clear_nlink(inode);
-	unlock_new_inode(inode);
 fail:
 	trace_f2fs_new_inode(inode, err);
 	make_bad_inode(inode);
-- 
cgit v1.2.3


From 24928634f81b1592e83b37dcd89ed45c28f12feb Mon Sep 17 00:00:00 2001
From: Jaegeuk Kim <jaegeuk@kernel.org>
Date: Sun, 16 Aug 2015 13:04:50 -0700
Subject: f2fs: check the node block address of newly allocated nid

This patch adds a routine which checks the block address of newly allocated nid.
If an nid has already been allocated by another thread due to subtle data
races, it will result in filesystem corruption.
So, we need to check whether its block address was already allocated or not
prior to nid allocation, as the last chance.

Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/node.c | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'fs/f2fs')

diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index 3cc32b8f8204..6bef5a2788b4 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -1573,6 +1573,8 @@ retry:
 
 	/* We should not use stale free nids created by build_free_nids */
 	if (nm_i->fcnt && !on_build_free_nids(nm_i)) {
+		struct node_info ni;
+
 		f2fs_bug_on(sbi, list_empty(&nm_i->free_nid_list));
 		list_for_each_entry(i, &nm_i->free_nid_list, list)
 			if (i->state == NID_NEW)
@@ -1583,6 +1585,13 @@ retry:
 		i->state = NID_ALLOC;
 		nm_i->fcnt--;
 		spin_unlock(&nm_i->free_nid_list_lock);
+
+		/* check nid is allocated already */
+		get_node_info(sbi, *nid, &ni);
+		if (ni.blk_addr != NULL_ADDR) {
+			alloc_nid_done(sbi, *nid);
+			goto retry;
+		}
 		return true;
 	}
 	spin_unlock(&nm_i->free_nid_list_lock);
-- 
cgit v1.2.3


From 217940d4f0c4ec4f0852f7046fa419d0edf65c17 Mon Sep 17 00:00:00 2001
From: Junesung Lee <junesoung412@gmail.com>
Date: Tue, 18 Aug 2015 22:42:15 +0900
Subject: f2fs: fix typo

Fix typo.

Signed-off-by: Junesung Lee <junesoung412@gmail.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs/f2fs')

diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig
index c629762005bc..b0a9dc929f88 100644
--- a/fs/f2fs/Kconfig
+++ b/fs/f2fs/Kconfig
@@ -45,7 +45,7 @@ config F2FS_FS_POSIX_ACL
 	default y
 	help
 	  Posix Access Control Lists (ACLs) support permissions for users and
-	  gourps beyond the owner/group/world scheme.
+	  groups beyond the owner/group/world scheme.
 
 	  To learn more about Access Control Lists, visit the POSIX ACLs for
 	  Linux website <http://acl.bestbits.at/>.
-- 
cgit v1.2.3


From f8b703da2c23f9bfda7299bd14e4f7201c2be3c8 Mon Sep 17 00:00:00 2001
From: Fan Li <fanofcode.li@samsung.com>
Date: Tue, 18 Aug 2015 17:13:13 +0800
Subject: f2fs: fix to update cached_en of extent tree properly

In f2fs_lookup_extent_tree, et->cached_en was read and updated with only the
read lock held. This could cause __lookup_extent_tree to return an entirely
wrong extent_node if another thread updates et->cached_en just before
__lookup_extent_tree returns.

However, there are two things about this patch that need to be noted:
1. It does nothing to order concurrent reads and writes; the result would
still be random in such a case.
2. It is built on this assumption: mixing reads and writes on a single
pointer will not make the pointer partially wrong at any time. Please let me
know if I'm wrong, thx.

Signed-off-by: Fan li <fanofcode.li@samsung.com>
Reviewed-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/extent_cache.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'fs/f2fs')

diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c
index 32fae8ad5b7e..cea581353bc2 100644
--- a/fs/f2fs/extent_cache.c
+++ b/fs/f2fs/extent_cache.c
@@ -85,13 +85,13 @@ static struct extent_node *__lookup_extent_tree(struct extent_tree *et,
 							unsigned int fofs)
 {
 	struct rb_node *node = et->root.rb_node;
-	struct extent_node *en;
+	struct extent_node *en = et->cached_en;
 
-	if (et->cached_en) {
-		struct extent_info *cei = &et->cached_en->ei;
+	if (en) {
+		struct extent_info *cei = &en->ei;
 
 		if (cei->fofs <= fofs && cei->fofs + cei->len > fofs)
-			return et->cached_en;
+			return en;
 	}
 
 	while (node) {
-- 
cgit v1.2.3


From e2b4e2bc8865e03eecd49caa9713a2402a96bba9 Mon Sep 17 00:00:00 2001
From: Chao Yu <chao2.yu@samsung.com>
Date: Wed, 19 Aug 2015 19:11:19 +0800
Subject: f2fs: fix incorrect mapping for bmap

The test step is like below:
1. touch file
2. truncate -s $((1024*1024)) file
3. fallocate -o 0 -l $((1024*1024)) file
4. fibmap.f2fs file

The fibmap.f2fs result shown below is not correct:

file_pos   start_blk     end_blk        blks
       0    -937166132    -937166132           1
    4096    -937166132    -937166132           1
    8192    -937166132    -937166132           1
   12288    -937166132    -937166132           1
   16384    -937166132    -937166132           1
   20480    -937166132    -937166132           1
...
 1040384    -937166132    -937166132           1
 1044480    -937166132    -937166132           1

This is because f2fs_map_blocks will return with no error when meeting
a hole or preallocated block, and the caller __get_data_block will then map
the uninitialized variable value to bh->b_blocknr.

Unfortunately, generic_block_bmap will neither check the return value of
get_data() nor check the mapping info of the buffer_head, resulting in a
random block address being returned.

After fixing the issue, our result shows correctly:

file_pos   start_blk     end_blk        blks
       0           0           0         256

Signed-off-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/data.c | 48 ++++++++++++++++++++++++++++++++++++------------
 fs/f2fs/f2fs.h |  6 ++++++
 2 files changed, 42 insertions(+), 12 deletions(-)

(limited to 'fs/f2fs')

diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 726e58b76295..73713bbd4646 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -561,7 +561,7 @@ out:
  *     c. give the block addresses to blockdev
  */
 static int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map,
-			int create, bool fiemap)
+						int create, int flag)
 {
 	unsigned int maxblocks = map->m_len;
 	struct dnode_of_data dn;
@@ -595,8 +595,19 @@ static int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map,
 			err = 0;
 		goto unlock_out;
 	}
-	if (dn.data_blkaddr == NEW_ADDR && !fiemap)
-		goto put_out;
+	if (dn.data_blkaddr == NEW_ADDR) {
+		if (flag == F2FS_GET_BLOCK_BMAP) {
+			err = -ENOENT;
+			goto put_out;
+		} else if (flag == F2FS_GET_BLOCK_READ ||
+				flag == F2FS_GET_BLOCK_DIO) {
+			goto put_out;
+		}
+		/*
+		 * if it is in fiemap call path (flag = F2FS_GET_BLOCK_FIEMAP),
+		 * mark it as mapped and unwritten block.
+		 */
+	}
 
 	if (dn.data_blkaddr != NULL_ADDR) {
 		map->m_flags = F2FS_MAP_MAPPED;
@@ -611,6 +622,8 @@ static int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map,
 		map->m_flags = F2FS_MAP_NEW | F2FS_MAP_MAPPED;
 		map->m_pblk = dn.data_blkaddr;
 	} else {
+		if (flag == F2FS_GET_BLOCK_BMAP)
+			err = -ENOENT;
 		goto put_out;
 	}
 
@@ -633,7 +646,9 @@ get_next:
 				err = 0;
 			goto unlock_out;
 		}
-		if (dn.data_blkaddr == NEW_ADDR && !fiemap)
+
+		if (dn.data_blkaddr == NEW_ADDR &&
+				flag != F2FS_GET_BLOCK_FIEMAP)
 			goto put_out;
 
 		end_offset = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode));
@@ -675,7 +690,7 @@ out:
 }
 
 static int __get_data_block(struct inode *inode, sector_t iblock,
-			struct buffer_head *bh, int create, bool fiemap)
+			struct buffer_head *bh, int create, int flag)
 {
 	struct f2fs_map_blocks map;
 	int ret;
@@ -683,7 +698,7 @@ static int __get_data_block(struct inode *inode, sector_t iblock,
 	map.m_lblk = iblock;
 	map.m_len = bh->b_size >> inode->i_blkbits;
 
-	ret = f2fs_map_blocks(inode, &map, create, fiemap);
+	ret = f2fs_map_blocks(inode, &map, create, flag);
 	if (!ret) {
 		map_bh(bh, inode->i_sb, map.m_pblk);
 		bh->b_state = (bh->b_state & ~F2FS_MAP_FLAGS) | map.m_flags;
@@ -693,15 +708,23 @@ static int __get_data_block(struct inode *inode, sector_t iblock,
 }
 
 static int get_data_block(struct inode *inode, sector_t iblock,
+			struct buffer_head *bh_result, int create, int flag)
+{
+	return __get_data_block(inode, iblock, bh_result, create, flag);
+}
+
+static int get_data_block_dio(struct inode *inode, sector_t iblock,
 			struct buffer_head *bh_result, int create)
 {
-	return __get_data_block(inode, iblock, bh_result, create, false);
+	return __get_data_block(inode, iblock, bh_result, create,
+						F2FS_GET_BLOCK_DIO);
 }
 
-static int get_data_block_fiemap(struct inode *inode, sector_t iblock,
+static int get_data_block_bmap(struct inode *inode, sector_t iblock,
 			struct buffer_head *bh_result, int create)
 {
-	return __get_data_block(inode, iblock, bh_result, create, true);
+	return __get_data_block(inode, iblock, bh_result, create,
+						F2FS_GET_BLOCK_BMAP);
 }
 
 static inline sector_t logical_to_blk(struct inode *inode, loff_t offset)
@@ -745,7 +768,8 @@ next:
 	memset(&map_bh, 0, sizeof(struct buffer_head));
 	map_bh.b_size = len;
 
-	ret = get_data_block_fiemap(inode, start_blk, &map_bh, 0);
+	ret = get_data_block(inode, start_blk, &map_bh, 0,
+					F2FS_GET_BLOCK_FIEMAP);
 	if (ret)
 		goto out;
 
@@ -1530,7 +1554,7 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
 	if (iov_iter_rw(iter) == WRITE)
 		__allocate_data_blocks(inode, offset, count);
 
-	err = blockdev_direct_IO(iocb, inode, iter, offset, get_data_block);
+	err = blockdev_direct_IO(iocb, inode, iter, offset, get_data_block_dio);
 	if (err < 0 && iov_iter_rw(iter) == WRITE)
 		f2fs_write_failed(mapping, offset + count);
 
@@ -1618,7 +1642,7 @@ static sector_t f2fs_bmap(struct address_space *mapping, sector_t block)
 		if (err)
 			return err;
 	}
-	return generic_block_bmap(mapping, block, get_data_block);
+	return generic_block_bmap(mapping, block, get_data_block_bmap);
 }
 
 const struct address_space_operations f2fs_dblock_aops = {
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 00591f725744..51dfa8fcc505 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -375,6 +375,12 @@ struct f2fs_map_blocks {
 	unsigned int m_flags;
 };
 
+/* for flag in get_data_block */
+#define F2FS_GET_BLOCK_READ		0
+#define F2FS_GET_BLOCK_DIO		1
+#define F2FS_GET_BLOCK_FIEMAP		2
+#define F2FS_GET_BLOCK_BMAP		3
+
 /*
  * i_advise uses FADVISE_XXX_BIT. We can add additional hints later.
  */
-- 
cgit v1.2.3


From 91c481fff92c705dd382f1f53c01e6b6b88507d0 Mon Sep 17 00:00:00 2001
From: Chao Yu <chao2.yu@samsung.com>
Date: Wed, 19 Aug 2015 19:12:20 +0800
Subject: f2fs: add largest/cached stat in extent cache

This patch adds stats for the hit count of the largest/cached extent node,
to be shown in debugfs.

Signed-off-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/debug.c        |  9 +++++++--
 fs/f2fs/extent_cache.c | 14 +++++++++-----
 fs/f2fs/f2fs.h         |  8 +++++++-
 3 files changed, 23 insertions(+), 8 deletions(-)

(limited to 'fs/f2fs')

diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c
index bc215fd6c402..1a1a4c67a9bf 100644
--- a/fs/f2fs/debug.c
+++ b/fs/f2fs/debug.c
@@ -33,6 +33,8 @@ static void update_general_status(struct f2fs_sb_info *sbi)
 	int i;
 
 	/* validation check of the segment numbers */
+	si->hit_largest = atomic_read(&sbi->read_hit_largest);
+	si->hit_cached = atomic_read(&sbi->read_hit_cached);
 	si->hit_ext = atomic_read(&sbi->read_hit_ext);
 	si->total_ext = atomic_read(&sbi->total_hit_ext);
 	si->ext_tree = sbi->total_ext_tree;
@@ -279,8 +281,9 @@ static int stat_show(struct seq_file *s, void *v)
 				si->bg_data_blks);
 		seq_printf(s, "  - node blocks : %d (%d)\n", si->node_blks,
 				si->bg_node_blks);
-		seq_printf(s, "\nExtent Hit Ratio: %d / %d\n",
-			   si->hit_ext, si->total_ext);
+		seq_printf(s, "\nExtent Hit Ratio: L1-1:%d L1-2:%d L2:%d / %d\n",
+				si->hit_largest, si->hit_cached,
+				si->hit_ext, si->total_ext);
 		seq_printf(s, "\nExtent Tree Count: %d\n", si->ext_tree);
 		seq_printf(s, "\nExtent Node Count: %d\n", si->ext_node);
 		seq_puts(s, "\nBalancing F2FS Async:\n");
@@ -371,6 +374,8 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi)
 
 	atomic_set(&sbi->total_hit_ext, 0);
 	atomic_set(&sbi->read_hit_ext, 0);
+	atomic_set(&sbi->read_hit_largest, 0);
+	atomic_set(&sbi->read_hit_cached, 0);
 
 	atomic_set(&sbi->inline_xattr, 0);
 	atomic_set(&sbi->inline_inode, 0);
diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c
index cea581353bc2..5cf217faed1f 100644
--- a/fs/f2fs/extent_cache.c
+++ b/fs/f2fs/extent_cache.c
@@ -81,8 +81,8 @@ static struct extent_tree *__grab_extent_tree(struct inode *inode)
 	return et;
 }
 
-static struct extent_node *__lookup_extent_tree(struct extent_tree *et,
-							unsigned int fofs)
+static struct extent_node *__lookup_extent_tree(struct f2fs_sb_info *sbi,
+				struct extent_tree *et, unsigned int fofs)
 {
 	struct rb_node *node = et->root.rb_node;
 	struct extent_node *en = et->cached_en;
@@ -90,8 +90,10 @@ static struct extent_node *__lookup_extent_tree(struct extent_tree *et,
 	if (en) {
 		struct extent_info *cei = &en->ei;
 
-		if (cei->fofs <= fofs && cei->fofs + cei->len > fofs)
+		if (cei->fofs <= fofs && cei->fofs + cei->len > fofs) {
+			stat_inc_cached_node_hit(sbi);
 			return en;
+		}
 	}
 
 	while (node) {
@@ -280,10 +282,11 @@ static bool f2fs_lookup_extent_tree(struct inode *inode, pgoff_t pgofs,
 		*ei = et->largest;
 		ret = true;
 		stat_inc_read_hit(sbi);
+		stat_inc_largest_node_hit(sbi);
 		goto out;
 	}
 
-	en = __lookup_extent_tree(et, pgofs);
+	en = __lookup_extent_tree(sbi, et, pgofs);
 	if (en) {
 		*ei = en->ei;
 		spin_lock(&sbi->extent_lock);
@@ -313,7 +316,8 @@ out:
  * tree must stay unchanged between lookup and insertion.
  */
 static struct extent_node *__lookup_extent_tree_ret(struct extent_tree *et,
-				unsigned int fofs, struct extent_node **prev_ex,
+				unsigned int fofs,
+				struct extent_node **prev_ex,
 				struct extent_node **next_ex,
 				struct rb_node ***insert_p,
 				struct rb_node **insert_parent)
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 51dfa8fcc505..de20387ae225 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -789,6 +789,8 @@ struct f2fs_sb_info {
 	atomic_t inplace_count;		/* # of inplace update */
 	atomic_t total_hit_ext;			/* # of lookup extent cache */
 	atomic_t read_hit_ext;			/* # of hit extent cache */
+	atomic_t read_hit_largest;		/* # of hit largest extent node */
+	atomic_t read_hit_cached;		/* # of hit cached extent node */
 	atomic_t inline_xattr;			/* # of inline_xattr inodes */
 	atomic_t inline_inode;			/* # of inline_data inodes */
 	atomic_t inline_dir;			/* # of inline_dentry inodes */
@@ -1824,7 +1826,7 @@ struct f2fs_stat_info {
 	struct f2fs_sb_info *sbi;
 	int all_area_segs, sit_area_segs, nat_area_segs, ssa_area_segs;
 	int main_area_segs, main_area_sections, main_area_zones;
-	int hit_ext, total_ext, ext_tree, ext_node;
+	int hit_largest, hit_cached, hit_ext, total_ext, ext_tree, ext_node;
 	int ndirty_node, ndirty_dent, ndirty_dirs, ndirty_meta;
 	int nats, dirty_nats, sits, dirty_sits, fnids;
 	int total_count, utilization;
@@ -1862,6 +1864,8 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi)
 #define stat_dec_dirty_dir(sbi)		((sbi)->n_dirty_dirs--)
 #define stat_inc_total_hit(sbi)		(atomic_inc(&(sbi)->total_hit_ext))
 #define stat_inc_read_hit(sbi)		(atomic_inc(&(sbi)->read_hit_ext))
+#define stat_inc_largest_node_hit(sbi)	(atomic_inc(&(sbi)->read_hit_largest))
+#define stat_inc_cached_node_hit(sbi)	(atomic_inc(&(sbi)->read_hit_cached))
 #define stat_inc_inline_xattr(inode)					\
 	do {								\
 		if (f2fs_has_inline_xattr(inode))			\
@@ -1942,6 +1946,8 @@ void f2fs_destroy_root_stats(void);
 #define stat_dec_dirty_dir(sbi)
 #define stat_inc_total_hit(sb)
 #define stat_inc_read_hit(sb)
+#define stat_inc_largest_node_hit(sbi)
+#define stat_inc_cached_node_hit(sbi)
 #define stat_inc_inline_xattr(inode)
 #define stat_dec_inline_xattr(inode)
 #define stat_inc_inline_inode(inode)
-- 
cgit v1.2.3


From 029e13cc3221be4bc46909225142277fee52c37e Mon Sep 17 00:00:00 2001
From: Chao Yu <chao2.yu@samsung.com>
Date: Wed, 19 Aug 2015 19:13:25 +0800
Subject: f2fs: adjust showing of extent cache stat

This patch replaces the total hit stat with an rbtree hit stat,
and then adjusts how the extent cache stats are shown:

Hit Count:
L1-1: for largest node hit count;
L1-2: for last cached node hit count;
L2: for extent node hit count after looking up in the rbtree.

Hit Ratio:
ratio (hit count / total lookup count)

Inner Struct Count:
tree count, node count.

Before:
Extent Hit Ratio: 0 / 2

Extent Tree Count: 3

Extent Node Count: 2

Patched:
Extent Cache:
  - Hit Count: L1-1:4871 L1-2:2074 L2:208
  - Hit Ratio: 1% (7153 / 550751)
  - Inner Struct Count: tree: 26560, node: 11824

Signed-off-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/debug.c        | 18 ++++++++++++------
 fs/f2fs/extent_cache.c | 10 +++++-----
 fs/f2fs/f2fs.h         |  9 +++++----
 3 files changed, 22 insertions(+), 15 deletions(-)

(limited to 'fs/f2fs')

diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c
index 1a1a4c67a9bf..d013d8479753 100644
--- a/fs/f2fs/debug.c
+++ b/fs/f2fs/debug.c
@@ -35,7 +35,8 @@ static void update_general_status(struct f2fs_sb_info *sbi)
 	/* validation check of the segment numbers */
 	si->hit_largest = atomic_read(&sbi->read_hit_largest);
 	si->hit_cached = atomic_read(&sbi->read_hit_cached);
-	si->hit_ext = atomic_read(&sbi->read_hit_ext);
+	si->hit_rbtree = atomic_read(&sbi->read_hit_rbtree);
+	si->hit_total = si->hit_largest + si->hit_cached + si->hit_rbtree;
 	si->total_ext = atomic_read(&sbi->total_hit_ext);
 	si->ext_tree = sbi->total_ext_tree;
 	si->ext_node = atomic_read(&sbi->total_ext_node);
@@ -281,11 +282,16 @@ static int stat_show(struct seq_file *s, void *v)
 				si->bg_data_blks);
 		seq_printf(s, "  - node blocks : %d (%d)\n", si->node_blks,
 				si->bg_node_blks);
-		seq_printf(s, "\nExtent Hit Ratio: L1-1:%d L1-2:%d L2:%d / %d\n",
+		seq_puts(s, "\nExtent Cache:\n");
+		seq_printf(s, "  - Hit Count: L1-1:%d L1-2:%d L2:%d\n",
 				si->hit_largest, si->hit_cached,
-				si->hit_ext, si->total_ext);
-		seq_printf(s, "\nExtent Tree Count: %d\n", si->ext_tree);
-		seq_printf(s, "\nExtent Node Count: %d\n", si->ext_node);
+				si->hit_rbtree);
+		seq_printf(s, "  - Hit Ratio: %d%% (%d / %d)\n",
+				!si->total_ext ? 0 :
+				(si->hit_total * 100) / si->total_ext,
+				si->hit_total, si->total_ext);
+		seq_printf(s, "  - Inner Struct Count: tree: %d, node: %d\n",
+				si->ext_tree, si->ext_node);
 		seq_puts(s, "\nBalancing F2FS Async:\n");
 		seq_printf(s, "  - inmem: %4d, wb: %4d\n",
 			   si->inmem_pages, si->wb_pages);
@@ -373,7 +379,7 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi)
 	sbi->stat_info = si;
 
 	atomic_set(&sbi->total_hit_ext, 0);
-	atomic_set(&sbi->read_hit_ext, 0);
+	atomic_set(&sbi->read_hit_rbtree, 0);
 	atomic_set(&sbi->read_hit_largest, 0);
 	atomic_set(&sbi->read_hit_cached, 0);
 
diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c
index 5cf217faed1f..d11735aa3cac 100644
--- a/fs/f2fs/extent_cache.c
+++ b/fs/f2fs/extent_cache.c
@@ -99,12 +99,14 @@ static struct extent_node *__lookup_extent_tree(struct f2fs_sb_info *sbi,
 	while (node) {
 		en = rb_entry(node, struct extent_node, rb_node);
 
-		if (fofs < en->ei.fofs)
+		if (fofs < en->ei.fofs) {
 			node = node->rb_left;
-		else if (fofs >= en->ei.fofs + en->ei.len)
+		} else if (fofs >= en->ei.fofs + en->ei.len) {
 			node = node->rb_right;
-		else
+		} else {
+			stat_inc_rbtree_node_hit(sbi);
 			return en;
+		}
 	}
 	return NULL;
 }
@@ -281,7 +283,6 @@ static bool f2fs_lookup_extent_tree(struct inode *inode, pgoff_t pgofs,
 			et->largest.fofs + et->largest.len > pgofs) {
 		*ei = et->largest;
 		ret = true;
-		stat_inc_read_hit(sbi);
 		stat_inc_largest_node_hit(sbi);
 		goto out;
 	}
@@ -295,7 +296,6 @@ static bool f2fs_lookup_extent_tree(struct inode *inode, pgoff_t pgofs,
 		et->cached_en = en;
 		spin_unlock(&sbi->extent_lock);
 		ret = true;
-		stat_inc_read_hit(sbi);
 	}
 out:
 	stat_inc_total_hit(sbi);
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index de20387ae225..66410178aba1 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -788,7 +788,7 @@ struct f2fs_sb_info {
 	unsigned int block_count[2];		/* # of allocated blocks */
 	atomic_t inplace_count;		/* # of inplace update */
 	atomic_t total_hit_ext;			/* # of lookup extent cache */
-	atomic_t read_hit_ext;			/* # of hit extent cache */
+	atomic_t read_hit_rbtree;		/* # of hit rbtree extent node */
 	atomic_t read_hit_largest;		/* # of hit largest extent node */
 	atomic_t read_hit_cached;		/* # of hit cached extent node */
 	atomic_t inline_xattr;			/* # of inline_xattr inodes */
@@ -1826,7 +1826,8 @@ struct f2fs_stat_info {
 	struct f2fs_sb_info *sbi;
 	int all_area_segs, sit_area_segs, nat_area_segs, ssa_area_segs;
 	int main_area_segs, main_area_sections, main_area_zones;
-	int hit_largest, hit_cached, hit_ext, total_ext, ext_tree, ext_node;
+	int hit_largest, hit_cached, hit_rbtree, hit_total, total_ext;
+	int ext_tree, ext_node;
 	int ndirty_node, ndirty_dent, ndirty_dirs, ndirty_meta;
 	int nats, dirty_nats, sits, dirty_sits, fnids;
 	int total_count, utilization;
@@ -1863,7 +1864,7 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi)
 #define stat_inc_dirty_dir(sbi)		((sbi)->n_dirty_dirs++)
 #define stat_dec_dirty_dir(sbi)		((sbi)->n_dirty_dirs--)
 #define stat_inc_total_hit(sbi)		(atomic_inc(&(sbi)->total_hit_ext))
-#define stat_inc_read_hit(sbi)		(atomic_inc(&(sbi)->read_hit_ext))
+#define stat_inc_rbtree_node_hit(sbi)	(atomic_inc(&(sbi)->read_hit_rbtree))
 #define stat_inc_largest_node_hit(sbi)	(atomic_inc(&(sbi)->read_hit_largest))
 #define stat_inc_cached_node_hit(sbi)	(atomic_inc(&(sbi)->read_hit_cached))
 #define stat_inc_inline_xattr(inode)					\
@@ -1945,7 +1946,7 @@ void f2fs_destroy_root_stats(void);
 #define stat_inc_dirty_dir(sbi)
 #define stat_dec_dirty_dir(sbi)
 #define stat_inc_total_hit(sb)
-#define stat_inc_read_hit(sb)
+#define stat_inc_rbtree_node_hit(sb)
 #define stat_inc_largest_node_hit(sbi)
 #define stat_inc_cached_node_hit(sbi)
 #define stat_inc_inline_xattr(inode)
-- 
cgit v1.2.3


From a6f7834594a284316b38d0885b2ee1ab47899dbc Mon Sep 17 00:00:00 2001
From: Chao Yu <chao2.yu@samsung.com>
Date: Wed, 19 Aug 2015 19:14:15 +0800
Subject: f2fs: kill dead code in __insert_extent_tree

After commit 0f825ee6e873 ("f2fs: add new interfaces for extent tree"),
f2fs_init_extent_tree becomes the only caller of __insert_extent_tree, and
in f2fs_init_extent_tree, we will only insert extent node in an empty tree,
so __try_{back,front}_merge in __insert_extent_tree will never be called.

This patch removes this dead code and also renames __insert_extent_tree
to __init_extent_tree for readability.

Signed-off-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/extent_cache.c | 82 ++++----------------------------------------------
 1 file changed, 6 insertions(+), 76 deletions(-)

(limited to 'fs/f2fs')

diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c
index d11735aa3cac..5b6139f57841 100644
--- a/fs/f2fs/extent_cache.c
+++ b/fs/f2fs/extent_cache.c
@@ -111,87 +111,17 @@ static struct extent_node *__lookup_extent_tree(struct f2fs_sb_info *sbi,
 	return NULL;
 }
 
-static struct extent_node *__try_back_merge(struct f2fs_sb_info *sbi,
-				struct extent_tree *et, struct extent_node *en)
-{
-	struct extent_node *prev;
-	struct rb_node *node;
-
-	node = rb_prev(&en->rb_node);
-	if (!node)
-		return NULL;
-
-	prev = rb_entry(node, struct extent_node, rb_node);
-	if (__is_back_mergeable(&en->ei, &prev->ei)) {
-		en->ei.fofs = prev->ei.fofs;
-		en->ei.blk = prev->ei.blk;
-		en->ei.len += prev->ei.len;
-		__detach_extent_node(sbi, et, prev);
-		return prev;
-	}
-	return NULL;
-}
-
-static struct extent_node *__try_front_merge(struct f2fs_sb_info *sbi,
-				struct extent_tree *et, struct extent_node *en)
-{
-	struct extent_node *next;
-	struct rb_node *node;
-
-	node = rb_next(&en->rb_node);
-	if (!node)
-		return NULL;
-
-	next = rb_entry(node, struct extent_node, rb_node);
-	if (__is_front_mergeable(&en->ei, &next->ei)) {
-		en->ei.len += next->ei.len;
-		__detach_extent_node(sbi, et, next);
-		return next;
-	}
-	return NULL;
-}
-
-static struct extent_node *__insert_extent_tree(struct f2fs_sb_info *sbi,
-				struct extent_tree *et, struct extent_info *ei,
-				struct extent_node **den)
+static struct extent_node *__init_extent_tree(struct f2fs_sb_info *sbi,
+				struct extent_tree *et, struct extent_info *ei)
 {
 	struct rb_node **p = &et->root.rb_node;
-	struct rb_node *parent = NULL;
 	struct extent_node *en;
 
-	while (*p) {
-		parent = *p;
-		en = rb_entry(parent, struct extent_node, rb_node);
-
-		if (ei->fofs < en->ei.fofs) {
-			if (__is_front_mergeable(ei, &en->ei)) {
-				f2fs_bug_on(sbi, !den);
-				en->ei.fofs = ei->fofs;
-				en->ei.blk = ei->blk;
-				en->ei.len += ei->len;
-				*den = __try_back_merge(sbi, et, en);
-				goto update_out;
-			}
-			p = &(*p)->rb_left;
-		} else if (ei->fofs >= en->ei.fofs + en->ei.len) {
-			if (__is_back_mergeable(ei, &en->ei)) {
-				f2fs_bug_on(sbi, !den);
-				en->ei.len += ei->len;
-				*den = __try_front_merge(sbi, et, en);
-				goto update_out;
-			}
-			p = &(*p)->rb_right;
-		} else {
-			f2fs_bug_on(sbi, 1);
-		}
-	}
-
-	en = __attach_extent_node(sbi, et, ei, parent, p);
+	en = __attach_extent_node(sbi, et, ei, NULL, p);
 	if (!en)
 		return NULL;
-update_out:
-	if (en->ei.len > et->largest.len)
-		et->largest = en->ei;
+
+	et->largest = en->ei;
 	et->cached_en = en;
 	return en;
 }
@@ -255,7 +185,7 @@ void f2fs_init_extent_tree(struct inode *inode, struct f2fs_extent *i_ext)
 	if (et->count)
 		goto out;
 
-	en = __insert_extent_tree(sbi, et, &ei, NULL);
+	en = __init_extent_tree(sbi, et, &ei);
 	if (en) {
 		spin_lock(&sbi->extent_lock);
 		list_add_tail(&en->list, &sbi->extent_list);
-- 
cgit v1.2.3


From ef05e221995057a8588cad675992ca2cb47e9891 Mon Sep 17 00:00:00 2001
From: Chao Yu <chao2.yu@samsung.com>
Date: Wed, 19 Aug 2015 19:15:09 +0800
Subject: f2fs: split __insert_extent_tree_ret for readability

This patch splits __insert_extent_tree_ret into __try_merge_extent_node &
__insert_extent_tree for code readability.

Signed-off-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/extent_cache.c | 49 +++++++++++++++++++++++++++++--------------------
 1 file changed, 29 insertions(+), 20 deletions(-)

(limited to 'fs/f2fs')

diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c
index 5b6139f57841..ab26728736eb 100644
--- a/fs/f2fs/extent_cache.c
+++ b/fs/f2fs/extent_cache.c
@@ -294,29 +294,22 @@ static struct extent_node *__lookup_extent_tree_ret(struct extent_tree *et,
 	return NULL;
 }
 
-static struct extent_node *__insert_extent_tree_ret(struct f2fs_sb_info *sbi,
+static struct extent_node *__try_merge_extent_node(struct f2fs_sb_info *sbi,
 				struct extent_tree *et, struct extent_info *ei,
 				struct extent_node **den,
 				struct extent_node *prev_ex,
-				struct extent_node *next_ex,
-				struct rb_node **insert_p,
-				struct rb_node *insert_parent)
+				struct extent_node *next_ex)
 {
-	struct rb_node **p = &et->root.rb_node;
-	struct rb_node *parent = NULL;
 	struct extent_node *en = NULL;
-	int merged = 0;
 
 	if (prev_ex && __is_back_mergeable(ei, &prev_ex->ei)) {
-		f2fs_bug_on(sbi, !den);
-		merged = 1;
 		prev_ex->ei.len += ei->len;
 		ei = &prev_ex->ei;
 		en = prev_ex;
 	}
+
 	if (next_ex && __is_front_mergeable(ei, &next_ex->ei)) {
-		f2fs_bug_on(sbi, !den);
-		if (merged++) {
+		if (en) {
 			__detach_extent_node(sbi, et, prev_ex);
 			*den = prev_ex;
 		}
@@ -325,8 +318,23 @@ static struct extent_node *__insert_extent_tree_ret(struct f2fs_sb_info *sbi,
 		next_ex->ei.len += ei->len;
 		en = next_ex;
 	}
-	if (merged)
-		goto update_out;
+
+	if (en) {
+		if (en->ei.len > et->largest.len)
+			et->largest = en->ei;
+		et->cached_en = en;
+	}
+	return en;
+}
+
+static struct extent_node *__insert_extent_tree(struct f2fs_sb_info *sbi,
+				struct extent_tree *et, struct extent_info *ei,
+				struct rb_node **insert_p,
+				struct rb_node *insert_parent)
+{
+	struct rb_node **p = &et->root.rb_node;
+	struct rb_node *parent = NULL;
+	struct extent_node *en = NULL;
 
 	if (insert_p && insert_parent) {
 		parent = insert_parent;
@@ -349,7 +357,7 @@ do_insert:
 	en = __attach_extent_node(sbi, et, ei, parent, p);
 	if (!en)
 		return NULL;
-update_out:
+
 	if (en->ei.len > et->largest.len)
 		et->largest = en->ei;
 	et->cached_en = en;
@@ -401,8 +409,7 @@ static bool f2fs_update_extent_tree(struct inode *inode, pgoff_t fofs,
 		if (fofs - dei.fofs >= F2FS_MIN_EXTENT_LEN) {
 			set_extent_info(&ei, dei.fofs, dei.blk,
 						fofs - dei.fofs);
-			en1 = __insert_extent_tree_ret(sbi, et, &ei, NULL,
-						NULL, NULL, NULL, NULL);
+			en1 = __insert_extent_tree(sbi, et, &ei, NULL, NULL);
 		}
 
 		/* insert right part of split extent into cache */
@@ -410,8 +417,7 @@ static bool f2fs_update_extent_tree(struct inode *inode, pgoff_t fofs,
 		if (endofs - fofs >= F2FS_MIN_EXTENT_LEN) {
 			set_extent_info(&ei, fofs + 1,
 				fofs - dei.fofs + dei.blk + 1, endofs - fofs);
-			en2 = __insert_extent_tree_ret(sbi, et, &ei, NULL,
-						NULL, NULL, NULL, NULL);
+			en2 = __insert_extent_tree(sbi, et, &ei, NULL, NULL);
 		}
 	}
 
@@ -419,8 +425,11 @@ update_extent:
 	/* 3. update extent in extent cache */
 	if (blkaddr) {
 		set_extent_info(&ei, fofs, blkaddr, 1);
-		en3 = __insert_extent_tree_ret(sbi, et, &ei, &den,
-				prev_ex, next_ex, insert_p, insert_parent);
+		en3 = __try_merge_extent_node(sbi, et, &ei, &den,
+						prev_ex, next_ex);
+		if (!en3)
+			en3 = __insert_extent_tree(sbi, et, &ei,
+						insert_p, insert_parent);
 
 		/* give up extent_cache, if split and small updates happen */
 		if (dei.len >= 1 &&
-- 
cgit v1.2.3


From dac2ddefe62841efc0b6cdcb0bbf3e3594aa01bf Mon Sep 17 00:00:00 2001
From: Chao Yu <chao2.yu@samsung.com>
Date: Wed, 19 Aug 2015 19:16:09 +0800
Subject: f2fs: lookup neighbor extent nodes for merging later

In __lookup_extent_tree_ret we do not try to find neighbor nodes if
we find the target node; in this case, we lose the chance to merge
the new mapping with an existing extent node later.

So the extent cache of an inode becomes fragmented after overwriting an
existing file; we can see the number of extent nodes increase sharply in
the following test case:

dd if=/dev/zero of=/mnt/f2fs/4m bs=4K count=1024

Extent Cache:
  - Hit Count: L1-1:0 L1-2:0 L2:0
  - Hit Ratio: 0% (0 / 3072)
  - Inner Struct Count: tree: 1, node: 1

dd if=/dev/zero of=/mnt/f2fs/4m bs=4K count=1024 conv=notrunc

Extent Cache:
  - Hit Count: L1-1:2048 L1-2:0 L2:0
  - Hit Ratio: 33% (2048 / 6144)
  - Inner Struct Count: tree: 1, node: 961

This patch fixes this by looking up the neighbors of the target node so
that they can be merged later.

Signed-off-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/extent_cache.c | 34 ++++++++++++++++++++++++++++------
 1 file changed, 28 insertions(+), 6 deletions(-)

(limited to 'fs/f2fs')

diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c
index ab26728736eb..dcfeb43a5975 100644
--- a/fs/f2fs/extent_cache.c
+++ b/fs/f2fs/extent_cache.c
@@ -254,13 +254,21 @@ static struct extent_node *__lookup_extent_tree_ret(struct extent_tree *et,
 {
 	struct rb_node **pnode = &et->root.rb_node;
 	struct rb_node *parent = NULL, *tmp_node;
-	struct extent_node *en;
+	struct extent_node *en = et->cached_en;
 
-	if (et->cached_en) {
-		struct extent_info *cei = &et->cached_en->ei;
+	*insert_p = NULL;
+	*insert_parent = NULL;
+	*prev_ex = NULL;
+	*next_ex = NULL;
+
+	if (RB_EMPTY_ROOT(&et->root))
+		return NULL;
+
+	if (en) {
+		struct extent_info *cei = &en->ei;
 
 		if (cei->fofs <= fofs && cei->fofs + cei->len > fofs)
-			return et->cached_en;
+			goto lookup_neighbors;
 	}
 
 	while (*pnode) {
@@ -272,7 +280,7 @@ static struct extent_node *__lookup_extent_tree_ret(struct extent_tree *et,
 		else if (fofs >= en->ei.fofs + en->ei.len)
 			pnode = &(*pnode)->rb_right;
 		else
-			return en;
+			goto lookup_neighbors;
 	}
 
 	*insert_p = pnode;
@@ -290,8 +298,22 @@ static struct extent_node *__lookup_extent_tree_ret(struct extent_tree *et,
 		tmp_node = rb_prev(parent);
 	*prev_ex = tmp_node ?
 		rb_entry(tmp_node, struct extent_node, rb_node) : NULL;
-
 	return NULL;
+
+lookup_neighbors:
+	if (fofs == en->ei.fofs) {
+		/* lookup prev node for merging backward later */
+		tmp_node = rb_prev(&en->rb_node);
+		*prev_ex = tmp_node ?
+			rb_entry(tmp_node, struct extent_node, rb_node) : NULL;
+	}
+	if (fofs == en->ei.fofs + en->ei.len - 1) {
+		/* lookup next node for merging frontward later */
+		tmp_node = rb_next(&en->rb_node);
+		*next_ex = tmp_node ?
+			rb_entry(tmp_node, struct extent_node, rb_node) : NULL;
+	}
+	return en;
 }
 
 static struct extent_node *__try_merge_extent_node(struct f2fs_sb_info *sbi,
-- 
cgit v1.2.3


From 80c545055dc7c1f7f487176fe0aac17896a4b7af Mon Sep 17 00:00:00 2001
From: Jaegeuk Kim <jaegeuk@kernel.org>
Date: Thu, 20 Aug 2015 08:51:56 -0700
Subject: f2fs: use __GFP_NOFAIL to avoid infinite loop

__GFP_NOFAIL can avoid retrying the whole path of kmem_cache_alloc and
bio_alloc.
And, it also fixes the use cases of GFP_ATOMIC correctly.

Suggested-by: Chao Yu <chao2.yu@samsung.com>
Reviewed-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/checkpoint.c | 21 ++++++++-------------
 fs/f2fs/f2fs.h       | 16 +++++-----------
 fs/f2fs/node.c       |  4 ++--
 fs/f2fs/segment.c    |  2 +-
 4 files changed, 16 insertions(+), 27 deletions(-)

(limited to 'fs/f2fs')

diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index 890e4d4c39d7..c5a38e352a80 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -336,26 +336,18 @@ const struct address_space_operations f2fs_meta_aops = {
 static void __add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type)
 {
 	struct inode_management *im = &sbi->im[type];
-	struct ino_entry *e;
+	struct ino_entry *e, *tmp;
+
+	tmp = f2fs_kmem_cache_alloc(ino_entry_slab, GFP_NOFS);
 retry:
-	if (radix_tree_preload(GFP_NOFS)) {
-		cond_resched();
-		goto retry;
-	}
+	radix_tree_preload(GFP_NOFS | __GFP_NOFAIL);
 
 	spin_lock(&im->ino_lock);
-
 	e = radix_tree_lookup(&im->ino_root, ino);
 	if (!e) {
-		e = kmem_cache_alloc(ino_entry_slab, GFP_ATOMIC);
-		if (!e) {
-			spin_unlock(&im->ino_lock);
-			radix_tree_preload_end();
-			goto retry;
-		}
+		e = tmp;
 		if (radix_tree_insert(&im->ino_root, ino, e)) {
 			spin_unlock(&im->ino_lock);
-			kmem_cache_free(ino_entry_slab, e);
 			radix_tree_preload_end();
 			goto retry;
 		}
@@ -368,6 +360,9 @@ retry:
 	}
 	spin_unlock(&im->ino_lock);
 	radix_tree_preload_end();
+
+	if (e != tmp)
+		kmem_cache_free(ino_entry_slab, tmp);
 }
 
 static void __remove_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type)
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 66410178aba1..ece5e704dfd0 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -1252,13 +1252,10 @@ static inline void *f2fs_kmem_cache_alloc(struct kmem_cache *cachep,
 						gfp_t flags)
 {
 	void *entry;
-retry:
-	entry = kmem_cache_alloc(cachep, flags);
-	if (!entry) {
-		cond_resched();
-		goto retry;
-	}
 
+	entry = kmem_cache_alloc(cachep, flags);
+	if (!entry)
+		entry = kmem_cache_alloc(cachep, flags | __GFP_NOFAIL);
 	return entry;
 }
 
@@ -1267,12 +1264,9 @@ static inline struct bio *f2fs_bio_alloc(int npages)
 	struct bio *bio;
 
 	/* No failure on bio allocation */
-retry:
 	bio = bio_alloc(GFP_NOIO, npages);
-	if (!bio) {
-		cond_resched();
-		goto retry;
-	}
+	if (!bio)
+		bio = bio_alloc(GFP_NOIO | __GFP_NOFAIL, npages);
 	return bio;
 }
 
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index 6bef5a2788b4..777066d29fa8 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -159,7 +159,7 @@ static void __set_nat_cache_dirty(struct f2fs_nm_info *nm_i,
 
 	head = radix_tree_lookup(&nm_i->nat_set_root, set);
 	if (!head) {
-		head = f2fs_kmem_cache_alloc(nat_entry_set_slab, GFP_ATOMIC);
+		head = f2fs_kmem_cache_alloc(nat_entry_set_slab, GFP_NOFS);
 
 		INIT_LIST_HEAD(&head->entry_list);
 		INIT_LIST_HEAD(&head->set_list);
@@ -246,7 +246,7 @@ static struct nat_entry *grab_nat_entry(struct f2fs_nm_info *nm_i, nid_t nid)
 {
 	struct nat_entry *new;
 
-	new = f2fs_kmem_cache_alloc(nat_entry_slab, GFP_ATOMIC);
+	new = f2fs_kmem_cache_alloc(nat_entry_slab, GFP_NOFS);
 	f2fs_radix_tree_insert(&nm_i->nat_root, nid, new);
 	memset(new, 0, sizeof(struct nat_entry));
 	nat_set_nid(new, nid);
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 6273e2cde93e..78e6d0696847 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -1753,7 +1753,7 @@ static struct page *get_next_sit_page(struct f2fs_sb_info *sbi,
 static struct sit_entry_set *grab_sit_entry_set(void)
 {
 	struct sit_entry_set *ses =
-			f2fs_kmem_cache_alloc(sit_entry_set_slab, GFP_ATOMIC);
+			f2fs_kmem_cache_alloc(sit_entry_set_slab, GFP_NOFS);
 
 	ses->entry_cnt = 0;
 	INIT_LIST_HEAD(&ses->set_list);
-- 
cgit v1.2.3


From f7409d0fae7a02ea6c8195f75ad73866d5dea617 Mon Sep 17 00:00:00 2001
From: Jaegeuk Kim <jaegeuk@kernel.org>
Date: Fri, 21 Aug 2015 23:37:18 -0700
Subject: f2fs: fix wrong pointer access during try_to_free_nids

If we release the lock in list_for_each_entry_safe, we can lose the tmp
pointer by alloc_nid.

Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/node.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'fs/f2fs')

diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index 777066d29fa8..0867325e288f 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -1664,11 +1664,9 @@ int try_to_free_nids(struct f2fs_sb_info *sbi, int nr_shrink)
 		if (i->state == NID_ALLOC)
 			continue;
 		__del_from_free_nid_list(nm_i, i);
-		nm_i->fcnt--;
-		spin_unlock(&nm_i->free_nid_list_lock);
 		kmem_cache_free(free_nid_slab, i);
+		nm_i->fcnt--;
 		nr_shrink--;
-		spin_lock(&nm_i->free_nid_list_lock);
 	}
 	spin_unlock(&nm_i->free_nid_list_lock);
 	mutex_unlock(&nm_i->build_lock);
-- 
cgit v1.2.3


From 6a6788576dac56135bf98ad974a038b0afb1a499 Mon Sep 17 00:00:00 2001
From: Zhang Zhen <zhenzhang.zhang@huawei.com>
Date: Mon, 24 Aug 2015 10:41:32 +0800
Subject: f2fs: atomically set inode->i_flags

According to commit 5f16f3225b06 ("ext4: atomically set inode->i_flags in
ext4_set_inode_flags()").

Signed-off-by: Zhang Zhen <zhenzhang.zhang@huawei.com>
Reviewed-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/inode.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'fs/f2fs')

diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index 83354433d4d1..d1b03d01b7e3 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -12,7 +12,6 @@
 #include <linux/f2fs_fs.h>
 #include <linux/buffer_head.h>
 #include <linux/writeback.h>
-#include <linux/bitops.h>
 
 #include "f2fs.h"
 #include "node.h"
@@ -34,8 +33,8 @@ void f2fs_set_inode_flags(struct inode *inode)
 		new_fl |= S_NOATIME;
 	if (flags & FS_DIRSYNC_FL)
 		new_fl |= S_DIRSYNC;
-	set_mask_bits(&inode->i_flags,
-			S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC, new_fl);
+	inode_set_flags(inode, new_fl,
+			S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC);
 }
 
 static void __get_inode_rdev(struct inode *inode, struct f2fs_inode *ri)
-- 
cgit v1.2.3


From 4ec17d688d74b6b7cb10043c57ff4818cde2b0ca Mon Sep 17 00:00:00 2001
From: Chao Yu <chao2.yu@samsung.com>
Date: Mon, 24 Aug 2015 17:36:25 +0800
Subject: f2fs: avoid unneeded initializing when converting inline dentry

When converting an inline dentry, we zero out the target dentry page before
copying the inline dentry data into it. This adds overhead, since the
inline dentry size is not small.

So this patch removes the unneeded initialization of the space in the
target dentry page.

Signed-off-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/inline.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

(limited to 'fs/f2fs')

diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c
index 79d18d5c1fae..3d143be42895 100644
--- a/fs/f2fs/inline.c
+++ b/fs/f2fs/inline.c
@@ -384,13 +384,21 @@ static int f2fs_convert_inline_dir(struct inode *dir, struct page *ipage,
 		goto out;
 
 	f2fs_wait_on_page_writeback(page, DATA);
-	zero_user_segment(page, 0, PAGE_CACHE_SIZE);
+	zero_user_segment(page, MAX_INLINE_DATA, PAGE_CACHE_SIZE);
 
 	dentry_blk = kmap_atomic(page);
 
 	/* copy data from inline dentry block to new dentry block */
 	memcpy(dentry_blk->dentry_bitmap, inline_dentry->dentry_bitmap,
 					INLINE_DENTRY_BITMAP_SIZE);
+	memset(dentry_blk->dentry_bitmap + INLINE_DENTRY_BITMAP_SIZE, 0,
+			SIZE_OF_DENTRY_BITMAP - INLINE_DENTRY_BITMAP_SIZE);
+	/*
+	 * we do not need to zero out remainder part of dentry and filename
+	 * field, since we have used bitmap for marking the usage status of
+	 * them, besides, we can also ignore copying/zeroing reserved space
+	 * of dentry block, because them haven't been used so far.
+	 */
 	memcpy(dentry_blk->dentry, inline_dentry->dentry,
 			sizeof(struct f2fs_dir_entry) * NR_INLINE_DENTRY);
 	memcpy(dentry_blk->filename, inline_dentry->filename,
-- 
cgit v1.2.3


From b01548919c33767bc457390fa3c41aedc273bfff Mon Sep 17 00:00:00 2001
From: Chao Yu <chao2.yu@samsung.com>
Date: Mon, 24 Aug 2015 17:39:42 +0800
Subject: f2fs: handle f2fs_truncate error correctly

This patch makes f2fs_truncate return its error number, so that the error
can be handled correctly in callers.

Signed-off-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/f2fs.h |  2 +-
 fs/f2fs/file.c | 26 +++++++++++++++++---------
 2 files changed, 18 insertions(+), 10 deletions(-)

(limited to 'fs/f2fs')

diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index ece5e704dfd0..806439f1c886 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -1595,7 +1595,7 @@ static inline bool f2fs_may_extent_tree(struct inode *inode)
 int f2fs_sync_file(struct file *, loff_t, loff_t, int);
 void truncate_data_blocks(struct dnode_of_data *);
 int truncate_blocks(struct inode *, u64, bool);
-void f2fs_truncate(struct inode *, bool);
+int f2fs_truncate(struct inode *, bool);
 int f2fs_getattr(struct vfsmount *, struct dentry *, struct kstat *);
 int f2fs_setattr(struct dentry *, struct iattr *);
 int truncate_hole(struct inode *, pgoff_t, pgoff_t);
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 7faafb5043e0..86a5c76eb106 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -579,24 +579,30 @@ out:
 	return err;
 }
 
-void f2fs_truncate(struct inode *inode, bool lock)
+int f2fs_truncate(struct inode *inode, bool lock)
 {
+	int err;
+
 	if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
 				S_ISLNK(inode->i_mode)))
-		return;
+		return 0;
 
 	trace_f2fs_truncate(inode);
 
 	/* we should check inline_data size */
 	if (f2fs_has_inline_data(inode) && !f2fs_may_inline_data(inode)) {
-		if (f2fs_convert_inline_inode(inode))
-			return;
+		err = f2fs_convert_inline_inode(inode);
+		if (err)
+			return err;
 	}
 
-	if (!truncate_blocks(inode, i_size_read(inode), lock)) {
-		inode->i_mtime = inode->i_ctime = CURRENT_TIME;
-		mark_inode_dirty(inode);
-	}
+	err = truncate_blocks(inode, i_size_read(inode), lock);
+	if (err)
+		return err;
+
+	inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+	mark_inode_dirty(inode);
+	return 0;
 }
 
 int f2fs_getattr(struct vfsmount *mnt,
@@ -656,7 +662,9 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr)
 
 		if (attr->ia_size <= i_size_read(inode)) {
 			truncate_setsize(inode, attr->ia_size);
-			f2fs_truncate(inode, true);
+			err = f2fs_truncate(inode, true);
+			if (err)
+				return err;
 			f2fs_balance_fs(F2FS_I_SB(inode));
 		} else {
 			/*
-- 
cgit v1.2.3


From 13ec7297e5331f2754d7629a068c619c41f20e56 Mon Sep 17 00:00:00 2001
From: Chao Yu <chao2.yu@samsung.com>
Date: Mon, 24 Aug 2015 17:40:45 +0800
Subject: f2fs: fix to release inode correctly

In the following call stack, if we unfortunately lose every chance to
truncate the inode page in remove_inode_page, we eventually add the
previously allocated nid to the free nid cache; this nid has NID_NEW
status and NEW_ADDR in its blkaddr pointer:

 - f2fs_create
  - f2fs_add_link
   - __f2fs_add_link
    - init_inode_metadata
     - new_inode_page
      - new_node_page
       - set_node_addr(, NEW_ADDR)
     - f2fs_init_acl   failed
     - remove_inode_page  failed
  - handle_failed_inode
   - remove_inode_page  failed
   - iput
    - f2fs_evict_inode
     - remove_inode_page  failed
     - alloc_nid_failed   cache a nid with valid blkaddr: NEW_ADDR

This may not only leak the resources of the previous inode, but may also
cause incorrect use of the previous blkaddr, which is stored in the node
entry of this nid, when the nid is reused by others.

This patch adds the inode to the orphan list if we fail to truncate it, so
that we get a second chance to release it in the orphan recovery flow.

Signed-off-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/f2fs.h  |  2 +-
 fs/f2fs/inode.c | 53 ++++++++++++++++++++++++++++++++++++++++++++++-------
 fs/f2fs/node.c  | 14 +++++++++-----
 3 files changed, 56 insertions(+), 13 deletions(-)

(limited to 'fs/f2fs')

diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 806439f1c886..69827ee8a0ee 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -1687,7 +1687,7 @@ int get_dnode_of_data(struct dnode_of_data *, pgoff_t, int);
 int truncate_inode_blocks(struct inode *, pgoff_t);
 int truncate_xattr_node(struct inode *, struct page *);
 int wait_on_node_pages_writeback(struct f2fs_sb_info *, nid_t);
-void remove_inode_page(struct inode *);
+int remove_inode_page(struct inode *);
 struct page *new_inode_page(struct inode *);
 struct page *new_node_page(struct dnode_of_data *, unsigned int, struct page *);
 void ra_node_page(struct f2fs_sb_info *, nid_t);
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index d1b03d01b7e3..35aae65b3e5d 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -317,6 +317,7 @@ void f2fs_evict_inode(struct inode *inode)
 	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
 	struct f2fs_inode_info *fi = F2FS_I(inode);
 	nid_t xnid = fi->i_xattr_nid;
+	int err = 0;
 
 	/* some remained atomic pages should discarded */
 	if (f2fs_is_atomic_file(inode))
@@ -342,11 +343,13 @@ void f2fs_evict_inode(struct inode *inode)
 	i_size_write(inode, 0);
 
 	if (F2FS_HAS_BLOCKS(inode))
-		f2fs_truncate(inode, true);
+		err = f2fs_truncate(inode, true);
 
-	f2fs_lock_op(sbi);
-	remove_inode_page(inode);
-	f2fs_unlock_op(sbi);
+	if (!err) {
+		f2fs_lock_op(sbi);
+		err = remove_inode_page(inode);
+		f2fs_unlock_op(sbi);
+	}
 
 	sb_end_intwrite(inode->i_sb);
 no_delete:
@@ -362,9 +365,26 @@ no_delete:
 	if (is_inode_flag_set(fi, FI_UPDATE_WRITE))
 		add_dirty_inode(sbi, inode->i_ino, UPDATE_INO);
 	if (is_inode_flag_set(fi, FI_FREE_NID)) {
-		alloc_nid_failed(sbi, inode->i_ino);
+		if (err && err != -ENOENT)
+			alloc_nid_done(sbi, inode->i_ino);
+		else
+			alloc_nid_failed(sbi, inode->i_ino);
 		clear_inode_flag(fi, FI_FREE_NID);
 	}
+
+	if (err && err != -ENOENT) {
+		if (!exist_written_data(sbi, inode->i_ino, ORPHAN_INO)) {
+			/*
+			 * get here because we failed to release resource
+			 * of inode previously, reminder our user to run fsck
+			 * for fixing.
+			 */
+			set_sbi_flag(sbi, SBI_NEED_FSCK);
+			f2fs_msg(sbi->sb, KERN_WARNING,
+				"inode (ino:%lu) resource leak, run fsck "
+				"to fix this issue!", inode->i_ino);
+		}
+	}
 out_clear:
 #ifdef CONFIG_F2FS_FS_ENCRYPTION
 	if (fi->i_crypt_info)
@@ -377,6 +397,7 @@ out_clear:
 void handle_failed_inode(struct inode *inode)
 {
 	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+	int err = 0;
 
 	clear_nlink(inode);
 	make_bad_inode(inode);
@@ -384,9 +405,27 @@ void handle_failed_inode(struct inode *inode)
 
 	i_size_write(inode, 0);
 	if (F2FS_HAS_BLOCKS(inode))
-		f2fs_truncate(inode, false);
+		err = f2fs_truncate(inode, false);
+
+	if (!err)
+		err = remove_inode_page(inode);
 
-	remove_inode_page(inode);
+	/*
+	 * if we skip truncate_node in remove_inode_page bacause we failed
+	 * before, it's better to find another way to release resource of
+	 * this inode (e.g. valid block count, node block or nid). Here we
+	 * choose to add this inode to orphan list, so that we can call iput
+	 * for releasing in orphan recovery flow.
+	 *
+	 * Note: we should add inode to orphan list before f2fs_unlock_op()
+	 * so we can prevent losing this orphan when encoutering checkpoint
+	 * and following suddenly power-off.
+	 */
+	if (err && err != -ENOENT) {
+		err = acquire_orphan_inode(sbi);
+		if (!err)
+			add_orphan_inode(sbi, inode->i_ino);
+	}
 
 	set_inode_flag(F2FS_I(inode), FI_FREE_NID);
 	f2fs_unlock_op(sbi);
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index 0867325e288f..27d1a74dd6f3 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -902,17 +902,20 @@ int truncate_xattr_node(struct inode *inode, struct page *page)
  * Caller should grab and release a rwsem by calling f2fs_lock_op() and
  * f2fs_unlock_op().
  */
-void remove_inode_page(struct inode *inode)
+int remove_inode_page(struct inode *inode)
 {
 	struct dnode_of_data dn;
+	int err;
 
 	set_new_dnode(&dn, inode, NULL, NULL, inode->i_ino);
-	if (get_dnode_of_data(&dn, 0, LOOKUP_NODE))
-		return;
+	err = get_dnode_of_data(&dn, 0, LOOKUP_NODE);
+	if (err)
+		return err;
 
-	if (truncate_xattr_node(inode, dn.inode_page)) {
+	err = truncate_xattr_node(inode, dn.inode_page);
+	if (err) {
 		f2fs_put_dnode(&dn);
-		return;
+		return err;
 	}
 
 	/* remove potential inline_data blocks */
@@ -926,6 +929,7 @@ void remove_inode_page(struct inode *inode)
 
 	/* will put inode & node pages */
 	truncate_node(&dn);
+	return 0;
 }
 
 struct page *new_inode_page(struct inode *inode)
-- 
cgit v1.2.3


From 19b2c30d3cce928010138cae4b9e57c388aa065c Mon Sep 17 00:00:00 2001
From: Chao Yu <chao2.yu@samsung.com>
Date: Wed, 26 Aug 2015 20:34:48 +0800
Subject: f2fs: update extent tree in batches

This patch introduces a new helper, f2fs_update_extent_tree_range, which
updates the extent mapping over a specified range.

The main idea is:
1) punch out all mapping info in the extent node(s) within the specified range;
2) try to merge the new extent mapping with an adjacent node, or failing that,
   insert the mapping into the extent tree as a new node.

In order to see the benefit, I added a function for reading the time stamp
counter, as below:

uint64_t rdtsc(void)
{
	uint32_t lo, hi;
	__asm__ __volatile__ ("rdtsc" : "=a" (lo), "=d" (hi));
	return (uint64_t)hi << 32 | lo;
}

My test environment is: ubuntu, intel i7-3770, 16G memory, 256g micron ssd.

truncation path:	update extent cache from truncate_data_blocks_range
non-truncation path:	update extent cache from other paths
total:			all update paths

a) Removing 128MB file which has one extent node mapping whole range of
file:
1. dd if=/dev/zero of=/mnt/f2fs/128M bs=1M count=128
2. sync
3. rm /mnt/f2fs/128M

Before:
		total		count		average
truncation:	7651022		32768		233.49

Patched:
		total		count		average
truncation:	3321		33		100.64

b) fsstress:
fsstress -d /mnt/f2fs -l 5 -n 100 -p 20
Test times:		5 times.

Before:
		total		count		average
truncation:	5812480.6	20911.6		277.95
non-truncation:	7783845.6	13440.8		579.12
total:		13596326.2	34352.4		395.79

Patched:
		total		count		average
truncation:	1281283.0	3041.6		421.25
non-truncation:	7355844.4	13662.8		538.38
total:		8637127.4	16704.4		517.06

1) For the updates in the truncation path:
 - we can see that updating in batches clearly reduces both the total tsc
   and the update count;
 - besides, a single batched update punches multiple extent nodes in a
   loop and so executes more operations, which is why the average tsc
   increases noticeably.
2) For the updates in the non-truncation path:
 - there is a small improvement, because when we only need to update the
   head or tail of an extent node, the new interface updates the info in
   the extent node directly, rather than removing the original extent node
   and then inserting the updated one into the cache as a new node.

Signed-off-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/extent_cache.c | 217 +++++++++++++++++++++++++++++++++++--------------
 fs/f2fs/f2fs.h         |   2 +
 fs/f2fs/file.c         |  12 ++-
 3 files changed, 170 insertions(+), 61 deletions(-)

(limited to 'fs/f2fs')

diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c
index dcfeb43a5975..e6b245718ef0 100644
--- a/fs/f2fs/extent_cache.c
+++ b/fs/f2fs/extent_cache.c
@@ -386,23 +386,21 @@ do_insert:
 	return en;
 }
 
-/* return true, if on-disk extent should be updated */
-static bool f2fs_update_extent_tree(struct inode *inode, pgoff_t fofs,
-							block_t blkaddr)
+unsigned int f2fs_update_extent_tree_range(struct inode *inode,
+				pgoff_t fofs, block_t blkaddr, unsigned int len)
 {
 	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
 	struct extent_tree *et = F2FS_I(inode)->extent_tree;
 	struct extent_node *en = NULL, *en1 = NULL, *en2 = NULL, *en3 = NULL;
-	struct extent_node *den = NULL, *prev_ex = NULL, *next_ex = NULL;
+	struct extent_node *prev_en = NULL, *next_en = NULL;
 	struct extent_info ei, dei, prev;
 	struct rb_node **insert_p = NULL, *insert_parent = NULL;
-	unsigned int endofs;
+	unsigned int end = fofs + len;
+	unsigned int pos = (unsigned int)fofs;
 
 	if (!et)
 		return false;
 
-	trace_f2fs_update_extent_tree(inode, fofs, blkaddr);
-
 	write_lock(&et->lock);
 
 	if (is_inode_flag_set(F2FS_I(inode), FI_NO_EXTENT)) {
@@ -416,39 +414,143 @@ static bool f2fs_update_extent_tree(struct inode *inode, pgoff_t fofs,
 	/* we do not guarantee that the largest extent is cached all the time */
 	f2fs_drop_largest_extent(inode, fofs);
 
-	/* 1. lookup and remove existing extent info in cache */
-	en = __lookup_extent_tree_ret(et, fofs, &prev_ex, &next_ex,
+	/* 1. lookup first extent node in range [fofs, fofs + len - 1] */
+	en = __lookup_extent_tree_ret(et, fofs, &prev_en, &next_en,
 					&insert_p, &insert_parent);
-	if (!en)
-		goto update_extent;
-
-	dei = en->ei;
-	__detach_extent_node(sbi, et, en);
-
-	/* 2. if extent can be split, try to split it */
-	if (dei.len > F2FS_MIN_EXTENT_LEN) {
-		/*  insert left part of split extent into cache */
-		if (fofs - dei.fofs >= F2FS_MIN_EXTENT_LEN) {
-			set_extent_info(&ei, dei.fofs, dei.blk,
-						fofs - dei.fofs);
-			en1 = __insert_extent_tree(sbi, et, &ei, NULL, NULL);
+	if (!en) {
+		if (next_en) {
+			en = next_en;
+			f2fs_bug_on(sbi, en->ei.fofs <= pos);
+			pos = en->ei.fofs;
+		} else {
+			/*
+			 * skip searching in the tree since there is no
+			 * larger extent node in the cache.
+			 */
+			goto update_extent;
+		}
+	}
+
+	/* 2. invlidate all extent nodes in range [fofs, fofs + len - 1] */
+	while (en) {
+		struct rb_node *node;
+
+		if (pos >= end)
+			break;
+
+		dei = en->ei;
+		en1 = en2 = NULL;
+
+		node = rb_next(&en->rb_node);
+
+		/*
+		 * 2.1 there are four cases when we invalidate blkaddr in extent
+		 * node, |V: valid address, X: will be invalidated|
+		 */
+		/* case#1, invalidate right part of extent node |VVVVVXXXXX| */
+		if (pos > dei.fofs && end >= dei.fofs + dei.len) {
+			en->ei.len = pos - dei.fofs;
+
+			if (en->ei.len < F2FS_MIN_EXTENT_LEN) {
+				__detach_extent_node(sbi, et, en);
+				insert_p = NULL;
+				insert_parent = NULL;
+				goto update;
+			}
+
+			if (__is_extent_same(&dei, &et->largest))
+				et->largest = en->ei;
+			goto next;
+		}
+
+		/* case#2, invalidate left part of extent node |XXXXXVVVVV| */
+		if (pos <= dei.fofs && end < dei.fofs + dei.len) {
+			en->ei.fofs = end;
+			en->ei.blk += end - dei.fofs;
+			en->ei.len -= end - dei.fofs;
+
+			if (en->ei.len < F2FS_MIN_EXTENT_LEN) {
+				__detach_extent_node(sbi, et, en);
+				insert_p = NULL;
+				insert_parent = NULL;
+				goto update;
+			}
+
+			if (__is_extent_same(&dei, &et->largest))
+				et->largest = en->ei;
+			goto next;
 		}
 
-		/* insert right part of split extent into cache */
-		endofs = dei.fofs + dei.len - 1;
-		if (endofs - fofs >= F2FS_MIN_EXTENT_LEN) {
-			set_extent_info(&ei, fofs + 1,
-				fofs - dei.fofs + dei.blk + 1, endofs - fofs);
-			en2 = __insert_extent_tree(sbi, et, &ei, NULL, NULL);
+		__detach_extent_node(sbi, et, en);
+
+		/*
+		 * if we remove node in rb-tree, our parent node pointer may
+		 * point the wrong place, discard them.
+		 */
+		insert_p = NULL;
+		insert_parent = NULL;
+
+		/* case#3, invalidate entire extent node |XXXXXXXXXX| */
+		if (pos <= dei.fofs && end >= dei.fofs + dei.len) {
+			if (__is_extent_same(&dei, &et->largest))
+				et->largest.len = 0;
+			goto update;
+		}
+
+		/*
+		 * case#4, invalidate data in the middle of extent node
+		 * |VVVXXXXVVV|
+		 */
+		if (dei.len > F2FS_MIN_EXTENT_LEN) {
+			unsigned int endofs;
+
+			/*  insert left part of split extent into cache */
+			if (pos - dei.fofs >= F2FS_MIN_EXTENT_LEN) {
+				set_extent_info(&ei, dei.fofs, dei.blk,
+							pos - dei.fofs);
+				en1 = __insert_extent_tree(sbi, et, &ei,
+								NULL, NULL);
+			}
+
+			/* insert right part of split extent into cache */
+			endofs = dei.fofs + dei.len;
+			if (endofs - end >= F2FS_MIN_EXTENT_LEN) {
+				set_extent_info(&ei, end,
+						end - dei.fofs + dei.blk,
+						endofs - end);
+				en2 = __insert_extent_tree(sbi, et, &ei,
+								NULL, NULL);
+			}
 		}
+update:
+		/* 2.2 update in global extent list */
+		spin_lock(&sbi->extent_lock);
+		if (en && !list_empty(&en->list))
+			list_del(&en->list);
+		if (en1)
+			list_add_tail(&en1->list, &sbi->extent_list);
+		if (en2)
+			list_add_tail(&en2->list, &sbi->extent_list);
+		spin_unlock(&sbi->extent_lock);
+
+		/* 2.3 release extent node */
+		if (en)
+			kmem_cache_free(extent_node_slab, en);
+next:
+		en = node ? rb_entry(node, struct extent_node, rb_node) : NULL;
+		next_en = en;
+		if (en)
+			pos = en->ei.fofs;
 	}
 
 update_extent:
 	/* 3. update extent in extent cache */
 	if (blkaddr) {
-		set_extent_info(&ei, fofs, blkaddr, 1);
+		struct extent_node *den = NULL;
+
+		set_extent_info(&ei, fofs, blkaddr, len);
 		en3 = __try_merge_extent_node(sbi, et, &ei, &den,
-						prev_ex, next_ex);
+							prev_en, next_en);
 		if (!en3)
 			en3 = __insert_extent_tree(sbi, et, &ei,
 						insert_p, insert_parent);
@@ -460,36 +562,21 @@ update_extent:
 			et->largest.len = 0;
 			set_inode_flag(F2FS_I(inode), FI_NO_EXTENT);
 		}
-	}
 
-	/* 4. update in global extent list */
-	spin_lock(&sbi->extent_lock);
-	if (en && !list_empty(&en->list))
-		list_del(&en->list);
-	/*
-	 * en1 and en2 split from en, they will become more and more smaller
-	 * fragments after splitting several times. So if the length is smaller
-	 * than F2FS_MIN_EXTENT_LEN, we will not add them into extent tree.
-	 */
-	if (en1)
-		list_add_tail(&en1->list, &sbi->extent_list);
-	if (en2)
-		list_add_tail(&en2->list, &sbi->extent_list);
-	if (en3) {
-		if (list_empty(&en3->list))
-			list_add_tail(&en3->list, &sbi->extent_list);
-		else
-			list_move_tail(&en3->list, &sbi->extent_list);
-	}
-	if (den && !list_empty(&den->list))
-		list_del(&den->list);
-	spin_unlock(&sbi->extent_lock);
+		spin_lock(&sbi->extent_lock);
+		if (en3) {
+			if (list_empty(&en3->list))
+				list_add_tail(&en3->list, &sbi->extent_list);
+			else
+				list_move_tail(&en3->list, &sbi->extent_list);
+		}
+		if (den && !list_empty(&den->list))
+			list_del(&den->list);
+		spin_unlock(&sbi->extent_lock);
 
-	/* 5. release extent node */
-	if (en)
-		kmem_cache_free(extent_node_slab, en);
-	if (den)
-		kmem_cache_free(extent_node_slab, den);
+		if (den)
+			kmem_cache_free(extent_node_slab, den);
+	}
 
 	if (is_inode_flag_set(F2FS_I(inode), FI_NO_EXTENT))
 		__free_extent_tree(sbi, et, true);
@@ -645,10 +732,22 @@ void f2fs_update_extent_cache(struct dnode_of_data *dn)
 
 	f2fs_bug_on(F2FS_I_SB(dn->inode), dn->data_blkaddr == NEW_ADDR);
 
+
 	fofs = start_bidx_of_node(ofs_of_node(dn->node_page), fi) +
 							dn->ofs_in_node;
 
-	if (f2fs_update_extent_tree(dn->inode, fofs, dn->data_blkaddr))
+	if (f2fs_update_extent_tree_range(dn->inode, fofs, dn->data_blkaddr, 1))
+		sync_inode_page(dn);
+}
+
+void f2fs_update_extent_cache_range(struct dnode_of_data *dn,
+				pgoff_t fofs, block_t blkaddr, unsigned int len)
+
+{
+	if (!f2fs_may_extent_tree(dn->inode))
+		return;
+
+	if (f2fs_update_extent_tree_range(dn->inode, fofs, blkaddr, len))
 		sync_inode_page(dn);
 }
 
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 69827ee8a0ee..f1a90ffd7cad 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -2017,6 +2017,8 @@ unsigned int f2fs_destroy_extent_node(struct inode *);
 void f2fs_destroy_extent_tree(struct inode *);
 bool f2fs_lookup_extent_cache(struct inode *, pgoff_t, struct extent_info *);
 void f2fs_update_extent_cache(struct dnode_of_data *);
+void f2fs_update_extent_cache_range(struct dnode_of_data *dn,
+						pgoff_t, block_t, unsigned int);
 void init_extent_cache_info(struct f2fs_sb_info *);
 int __init create_extent_cache(void);
 void destroy_extent_cache(void);
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 86a5c76eb106..8120f8685141 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -445,9 +445,9 @@ static int f2fs_file_open(struct inode *inode, struct file *filp)
 
 int truncate_data_blocks_range(struct dnode_of_data *dn, int count)
 {
-	int nr_free = 0, ofs = dn->ofs_in_node;
 	struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
 	struct f2fs_node *raw_node;
+	int nr_free = 0, ofs = dn->ofs_in_node, len = count;
 	__le32 *addr;
 
 	raw_node = F2FS_NODE(dn->node_page);
@@ -460,14 +460,22 @@ int truncate_data_blocks_range(struct dnode_of_data *dn, int count)
 
 		dn->data_blkaddr = NULL_ADDR;
 		set_data_blkaddr(dn);
-		f2fs_update_extent_cache(dn);
 		invalidate_blocks(sbi, blkaddr);
 		if (dn->ofs_in_node == 0 && IS_INODE(dn->node_page))
 			clear_inode_flag(F2FS_I(dn->inode),
 						FI_FIRST_BLOCK_WRITTEN);
 		nr_free++;
 	}
+
 	if (nr_free) {
+		pgoff_t fofs;
+		/*
+		 * once we invalidate valid blkaddr in range [ofs, ofs + count],
+		 * we will invalidate all blkaddr in the whole range.
+		 */
+		fofs = start_bidx_of_node(ofs_of_node(dn->node_page),
+						F2FS_I(dn->inode)) + ofs;
+		f2fs_update_extent_cache_range(dn, fofs, 0, len);
 		dec_valid_block_count(sbi, dn->inode, nr_free);
 		set_page_dirty(dn->node_page);
 		sync_inode_page(dn);
-- 
cgit v1.2.3


From 54d71856428961124be26301b7997f2ad23be520 Mon Sep 17 00:00:00 2001
From: Chao Yu <chao2.yu@samsung.com>
Date: Fri, 28 Aug 2015 18:18:57 +0800
Subject: f2fs: avoid accessing NULL pointer in f2fs_drop_largest_extent

If the extent cache is disabled, we will encounter an oops when triggering
direct IO, as below:

BUG: unable to handle kernel NULL pointer dereference at 0000000c
IP: [<f0b9c61e>] f2fs_drop_largest_extent+0xe/0x30 [f2fs]
*pdpt = 000000002bb9a001 *pde = 0000000000000000
Oops: 0000 [#1] SMP
Modules linked in: f2fs(O) fuse bnep rfcomm bluetooth nfsd dm_crypt nfs_acl auth_rpcgss oid_registry nfs binfmt_misc fscache lockd
sunrpc grace snd_intel8x0 snd_ac97_codec ac97_bus snd_pcm snd_seq_midi snd_rawmidi snd_seq_midi_event snd_seq snd_timer
snd_seq_device snd soundcore joydev psmouse hid_generic i2c_piix4 serio_raw ppdev mac_hid parport_pc lp parport ext4 jbd2 mbcache
usbhid hid e1000
CPU: 3 PID: 3608 Comm: dd Tainted: G           O    4.2.0-rc4 #12
Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006
task: ef161600 ti: ebd5e000 task.ti: ebd5e000
EIP: 0060:[<f0b9c61e>] EFLAGS: 00010202 CPU: 3
EIP is at f2fs_drop_largest_extent+0xe/0x30 [f2fs]
EAX: 00000000 EBX: ddebc000 ECX: 00000000 EDX: 00000000
ESI: ebd5fdf8 EDI: 00000000 EBP: ebd5fd58 ESP: ebd5fd58
 DS: 007b ES: 007b FS: 00d8 GS: 0033 SS: 0068
CR0: 80050033 CR2: 0000000c CR3: 2c24ee40 CR4: 000006f0
Stack:
 ebd5fda4 f0b8c005 00000000 00000001 00000000 f0b8c430 c816cd68 ddebc000
 ddebc088 00001000 00000555 00000555 ffffffff c160bb00 00055501 00000000
 00000000 00000100 00000000 ebd5fe20 f0b8c430 00000046 ef161600 00001000
Call Trace:
 [<f0b8c005>] __allocate_data_block+0x1a5/0x260 [f2fs]
 [<f0b8c430>] ? f2fs_direct_IO+0x370/0x440 [f2fs]
 [<c160bb00>] ? down_read+0x30/0x50
 [<f0b8c430>] f2fs_direct_IO+0x370/0x440 [f2fs]
 [<c113e115>] generic_file_direct_write+0xa5/0x260
 [<c10b53f8>] ? current_fs_time+0x18/0x50
 [<c113e38b>] __generic_file_write_iter+0xbb/0x210
 [<c113e50f>] ? generic_file_write_iter+0x2f/0x320
 [<c113e63c>] generic_file_write_iter+0x15c/0x320
 [<f0b77f29>] f2fs_file_write_iter+0x39/0x80 [f2fs]
 [<c11984d9>] __vfs_write+0xa9/0xe0
 [<c1199227>] vfs_write+0x97/0x180
 [<c119955b>] SyS_write+0x5b/0xd0
 [<c160dcd0>] sysenter_do_call+0x12/0x12
Code: 10 8b 50 1c 89 53 14 eb ca 8d 74 26 00 85 f6 74 86 eb a6 0f 0b 90 8d b4 26 00 00 00 00 55 89 e5 3e 8d 74 26 00 8b 80 d4 02 00
00 <8b> 48 0c 39 d1 77 0e 03 48 14 39 ca 73 07 c7 40 14 00 00 00 00
EIP: [<f0b9c61e>] f2fs_drop_largest_extent+0xe/0x30 [f2fs] SS:ESP 0068:ebd5fd58
CR2: 000000000000000c
---[ end trace a38c07026a1afffd ]---

This is because when the extent cache is disabled, the extent_tree pointer
in struct f2fs_inode_info is NULL, but f2fs_drop_largest_extent dereferences
this pointer without checking the state of the extent cache, so the oops
occurs. Let's fix it by checking the state of the extent cache before
accessing the pointer.

Signed-off-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/extent_cache.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

(limited to 'fs/f2fs')

diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c
index e6b245718ef0..997ac86f2a1d 100644
--- a/fs/f2fs/extent_cache.c
+++ b/fs/f2fs/extent_cache.c
@@ -155,7 +155,7 @@ static unsigned int __free_extent_tree(struct f2fs_sb_info *sbi,
 	return count - et->count;
 }
 
-void f2fs_drop_largest_extent(struct inode *inode, pgoff_t fofs)
+static void __drop_largest_extent(struct inode *inode, pgoff_t fofs)
 {
 	struct extent_info *largest = &F2FS_I(inode)->extent_tree->largest;
 
@@ -163,6 +163,14 @@ void f2fs_drop_largest_extent(struct inode *inode, pgoff_t fofs)
 		largest->len = 0;
 }
 
+void f2fs_drop_largest_extent(struct inode *inode, pgoff_t fofs)
+{
+	if (!f2fs_may_extent_tree(inode))
+		return;
+
+	__drop_largest_extent(inode, fofs);
+}
+
 void f2fs_init_extent_tree(struct inode *inode, struct f2fs_extent *i_ext)
 {
 	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
@@ -412,7 +420,7 @@ unsigned int f2fs_update_extent_tree_range(struct inode *inode,
 	dei.len = 0;
 
 	/* we do not guarantee that the largest extent is cached all the time */
-	f2fs_drop_largest_extent(inode, fofs);
+	__drop_largest_extent(inode, fofs);
 
 	/* 1. lookup first extent node in range [fofs, fofs + len - 1] */
 	en = __lookup_extent_tree_ret(et, fofs, &prev_en, &next_en,
-- 
cgit v1.2.3


From 01a5ad827a36e36f45e1fdb96903ea115f759865 Mon Sep 17 00:00:00 2001
From: Yunlei He <heyunlei@huawei.com>
Date: Mon, 31 Aug 2015 17:15:10 +0800
Subject: f2fs: upset segment_info repair

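The segment_info output is misaligned when the segment number has five or
more digits, because no space is left after the segment number, like this: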

276000|161 0|0   4|70  3|0   3|0   0|0   0|91  4|0   4|232 4|39
276104|0   4|0   4|1   4|0   4|0   4|280 4|0   4|42  4|262 4|38
276204|179 4|89  4|39  4|24  4|0   4|96  4|3   4|428 4|0   4|118
276304|112 4|97  4|0   4|0   4|0   4|68  4|0   4|0   4|86  4|138
276404|0   4|0   0|166 5|39  4|101 0|111

Signed-off-by: Yunlei He <heyunlei@huawei.com>
Reviewed-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/super.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs/f2fs')

diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index cfe3f9579934..f79478115d37 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -693,7 +693,7 @@ static int segment_info_seq_show(struct seq_file *seq, void *offset)
 		struct seg_entry *se = get_seg_entry(sbi, i);
 
 		if ((i % 10) == 0)
-			seq_printf(seq, "%-5d", i);
+			seq_printf(seq, "%-10d", i);
 		seq_printf(seq, "%d|%-3u", se->type,
 					get_valid_blocks(sbi, i, 1));
 		if ((i % 10) == 9 || i == (total_segs - 1))
-- 
cgit v1.2.3