From 92859a5efdfa71f712ec1d213f43061965d3e9b4 Mon Sep 17 00:00:00 2001 From: Markus Elfring Date: Fri, 26 Jun 2015 17:28:55 +0200 Subject: f2fs crypto: delete an unnecessary check before the function call "key_put" The key_put() function tests whether its argument is NULL and, if so, returns immediately. Thus the test around the call is not needed. This issue was detected by using the Coccinelle software. Signed-off-by: Markus Elfring Signed-off-by: Jaegeuk Kim --- fs/f2fs/crypto_key.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/crypto_key.c b/fs/f2fs/crypto_key.c index 95b8f936f00b..9f77de2ef317 100644 --- a/fs/f2fs/crypto_key.c +++ b/fs/f2fs/crypto_key.c @@ -92,8 +92,7 @@ static void f2fs_free_crypt_info(struct f2fs_crypt_info *ci) if (!ci) return; - if (ci->ci_keyring_key) - key_put(ci->ci_keyring_key); + key_put(ci->ci_keyring_key); crypto_free_ablkcipher(ci->ci_ctfm); kmem_cache_free(f2fs_crypt_info_cachep, ci); } -- cgit v1.2.3 From 5ac9f36fcacd532b218db1e0fd0f9e8a18321f22 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 29 Jun 2015 18:14:10 +0800 Subject: f2fs: fix to record dirty page count for symlink Dirty pages can exist in the mapping of a newly created symlink, but previously we did not maintain a dirty page count for symlinks as we did for regular files and directories, so the count we looked up would be wrong. This patch adds the missing dirty page counting for symlinks to fix this issue.
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 3 ++- fs/f2fs/f2fs.h | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index b70bbe1a6a8c..de7a0d6a371a 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -704,7 +704,8 @@ void update_dirty_page(struct inode *inode, struct page *page) struct inode_entry *new; int ret = 0; - if (!S_ISDIR(inode->i_mode) && !S_ISREG(inode->i_mode)) + if (!S_ISDIR(inode->i_mode) && !S_ISREG(inode->i_mode) && + !S_ISLNK(inode->i_mode)) return; if (!S_ISDIR(inode->i_mode)) { diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index a8327ed73898..516220454a4e 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1039,7 +1039,8 @@ static inline void dec_page_count(struct f2fs_sb_info *sbi, int count_type) static inline void inode_dec_dirty_pages(struct inode *inode) { - if (!S_ISDIR(inode->i_mode) && !S_ISREG(inode->i_mode)) + if (!S_ISDIR(inode->i_mode) && !S_ISREG(inode->i_mode) && + !S_ISLNK(inode->i_mode)) return; atomic_dec(&F2FS_I(inode)->dirty_pages); -- cgit v1.2.3 From eca616f8c1d6c581f3785f0ee3e2a3887e084273 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 15 Jun 2015 14:52:29 -0700 Subject: f2fs: avoid freed stat information The write_checkpoint can update stat information, so we should destroy the stat structure after it. 
Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index a06b0b46fe69..da277100dc90 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -498,7 +498,6 @@ static void f2fs_put_super(struct super_block *sb) } kobject_del(&sbi->s_kobj); - f2fs_destroy_stats(sbi); stop_gc_thread(sbi); /* @@ -514,6 +513,9 @@ static void f2fs_put_super(struct super_block *sb) write_checkpoint(sbi, &cpc); } + /* write_checkpoint can update stat informaion */ + f2fs_destroy_stats(sbi); + /* * normally superblock is clean, so we need to release this. * In addition, EIO will skip do checkpoint, we need this as well. -- cgit v1.2.3 From c9b63bd01dd8da096d079c490771ad8a049fd480 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 23 Jun 2015 10:36:08 -0700 Subject: f2fs: avoid to use failed inode immediately Before iput is called, the inode number used by a bad inode can be reassigned to other new inode, resulting in any abnormal behaviors on the new inode. This should not happen for the new inode. 
Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 1 + fs/f2fs/inode.c | 19 ++++++++++++------- fs/f2fs/namei.c | 4 ++-- 3 files changed, 15 insertions(+), 9 deletions(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 516220454a4e..3aaa4b99050a 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1343,6 +1343,7 @@ enum { FI_INC_LINK, /* need to increment i_nlink */ FI_ACL_MODE, /* indicate acl mode */ FI_NO_ALLOC, /* should not allocate any blocks */ + FI_FREE_NID, /* free allocated nide */ FI_UPDATE_DIR, /* should update inode block for consistency */ FI_DELAY_IPUT, /* used for the recovery */ FI_NO_EXTENT, /* not to use the extent cache */ diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 2550868dc651..757fed253697 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -314,7 +314,8 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc) void f2fs_evict_inode(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - nid_t xnid = F2FS_I(inode)->i_xattr_nid; + struct f2fs_inode_info *fi = F2FS_I(inode); + nid_t xnid = fi->i_xattr_nid; /* some remained atomic pages should discarded */ if (f2fs_is_atomic_file(inode)) @@ -334,7 +335,7 @@ void f2fs_evict_inode(struct inode *inode) goto no_delete; sb_start_intwrite(inode->i_sb); - set_inode_flag(F2FS_I(inode), FI_NO_ALLOC); + set_inode_flag(fi, FI_NO_ALLOC); i_size_write(inode, 0); if (F2FS_HAS_BLOCKS(inode)) @@ -357,14 +358,18 @@ no_delete: invalidate_mapping_pages(NODE_MAPPING(sbi), inode->i_ino, inode->i_ino); if (xnid) invalidate_mapping_pages(NODE_MAPPING(sbi), xnid, xnid); - if (is_inode_flag_set(F2FS_I(inode), FI_APPEND_WRITE)) + if (is_inode_flag_set(fi, FI_APPEND_WRITE)) add_dirty_inode(sbi, inode->i_ino, APPEND_INO); - if (is_inode_flag_set(F2FS_I(inode), FI_UPDATE_WRITE)) + if (is_inode_flag_set(fi, FI_UPDATE_WRITE)) add_dirty_inode(sbi, inode->i_ino, UPDATE_INO); + if (is_inode_flag_set(fi, FI_FREE_NID)) { + alloc_nid_failed(sbi, 
inode->i_ino); + clear_inode_flag(fi, FI_FREE_NID); + } out_clear: #ifdef CONFIG_F2FS_FS_ENCRYPTION - if (F2FS_I(inode)->i_crypt_info) - f2fs_free_encryption_info(inode, F2FS_I(inode)->i_crypt_info); + if (fi->i_crypt_info) + f2fs_free_encryption_info(inode, fi->i_crypt_info); #endif clear_inode(inode); } @@ -384,9 +389,9 @@ void handle_failed_inode(struct inode *inode) remove_inode_page(inode); + set_inode_flag(F2FS_I(inode), FI_FREE_NID); clear_inode_flag(F2FS_I(inode), FI_INLINE_DATA); clear_inode_flag(F2FS_I(inode), FI_INLINE_DENTRY); - alloc_nid_failed(sbi, inode->i_ino); f2fs_unlock_op(sbi); /* iput will drop the inode object */ diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index fdbae21ee8fb..08656fca8f83 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -78,9 +78,9 @@ out: fail: trace_f2fs_new_inode(inode, err); make_bad_inode(inode); - iput(inode); if (nid_free) - alloc_nid_failed(sbi, ino); + set_inode_flag(F2FS_I(inode), FI_FREE_NID); + iput(inode); return ERR_PTR(err); } -- cgit v1.2.3 From 97a7b2c274d5dbe51170e099c16d49cfd1b467af Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 17 Jun 2015 13:59:05 -0700 Subject: f2fs: convert inline_data for various fallocate For newly added fallocate types, it should convert inline_data before handling block swapping. 
Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'fs/f2fs') diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index b0f38c3b37f4..fe8398f1d627 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -885,6 +885,14 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) if (offset & (F2FS_BLKSIZE - 1) || len & (F2FS_BLKSIZE - 1)) return -EINVAL; + f2fs_balance_fs(F2FS_I_SB(inode)); + + if (f2fs_has_inline_data(inode)) { + ret = f2fs_convert_inline_inode(inode); + if (ret) + return ret; + } + pg_start = offset >> PAGE_CACHE_SHIFT; pg_end = (offset + len) >> PAGE_CACHE_SHIFT; @@ -1033,6 +1041,12 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) f2fs_balance_fs(sbi); + if (f2fs_has_inline_data(inode)) { + ret = f2fs_convert_inline_inode(inode); + if (ret) + return ret; + } + ret = truncate_blocks(inode, i_size_read(inode), true); if (ret) return ret; -- cgit v1.2.3 From 7a2cb67867b9a7f28a7c4d0fadd2f337a6d46ff7 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 18 Jun 2015 14:17:04 -0700 Subject: f2fs: fix wrong block address calculation for a split extent This patch fixes wrong calculation on block address field when an extent is split. 
Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index f71e19a9dd3c..d1d86d53d1dc 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -674,7 +674,7 @@ static void f2fs_update_extent_tree(struct inode *inode, pgoff_t fofs, endofs = dei.fofs + dei.len - 1; if (endofs - fofs >= F2FS_MIN_EXTENT_LEN) { set_extent_info(&ei, fofs + 1, - fofs - dei.fofs + dei.blk, endofs - fofs); + fofs - dei.fofs + dei.blk + 1, endofs - fofs); en2 = __insert_extent_tree(sbi, et, &ei, NULL); } } -- cgit v1.2.3 From cbe91923a97c96d6a931f4b5b7e32083218a0251 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 16 Jun 2015 15:17:01 -0700 Subject: f2fs: update on-disk extents even under extent_cache Previously, f2fs_update_extent_cache() updated the in-memory extent_cache all the time, and only preserved its up-to-date extent into the on-disk one during f2fs_evict_inode. But, in the following scenario: 1. mount 2. open & write an extent X 3. f2fs_evict_inode; on-disk extent is X 4. open & update the extent X with Y 5. sync; trigger checkpoint 6. power-cut. After power-on, f2fs should serve extent Y, but we have an on-disk extent X. This causes a failure on xfstests/311.
Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index d1d86d53d1dc..176e4ad4e5ed 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -899,9 +899,9 @@ void f2fs_update_extent_cache(struct dnode_of_data *dn) fofs = start_bidx_of_node(ofs_of_node(dn->node_page), fi) + dn->ofs_in_node; + /* we should call update_extent_info() to update on-disk extent */ if (test_opt(F2FS_I_SB(dn->inode), EXTENT_CACHE)) - return f2fs_update_extent_tree(dn->inode, fofs, - dn->data_blkaddr); + f2fs_update_extent_tree(dn->inode, fofs, dn->data_blkaddr); if (update_extent_info(dn->inode, fofs, dn->data_blkaddr)) sync_inode_page(dn); -- cgit v1.2.3 From 244f4fc1c530c4e486f0e4f0909c0514e4539ba2 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 22 Jun 2015 18:22:38 -0700 Subject: f2fs: set cached_en after checking finally This patch relocates cached_en not only to be covered by spin_lock, but also to set once after checking out completely. 
Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 176e4ad4e5ed..982a1a58efd7 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -453,14 +453,12 @@ static struct extent_node *__lookup_extent_tree(struct extent_tree *et, while (node) { en = rb_entry(node, struct extent_node, rb_node); - if (fofs < en->ei.fofs) { + if (fofs < en->ei.fofs) node = node->rb_left; - } else if (fofs >= en->ei.fofs + en->ei.len) { + else if (fofs >= en->ei.fofs + en->ei.len) node = node->rb_right; - } else { - et->cached_en = en; + else return en; - } } return NULL; } @@ -625,6 +623,7 @@ static bool f2fs_lookup_extent_tree(struct inode *inode, pgoff_t pgofs, spin_lock(&sbi->extent_lock); if (!list_empty(&en->list)) list_move_tail(&en->list, &sbi->extent_list); + et->cached_en = en; spin_unlock(&sbi->extent_lock); stat_inc_read_hit(sbi->sb); } -- cgit v1.2.3 From 2658e50de61429f57d9496bfe371f232e2d039a1 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 19 Jun 2015 12:01:21 -0700 Subject: f2fs: introduce a shrinker for mounted fs This patch introduces a shrinker targeting to reduce memory footprint consumed by a number of in-memory f2fs data structures. 
In addition, it newly adds: - sbi->umount_mutex to avoid data races on shrinker and put_super - sbi->shrinker_run_no to not revisit objects Note that the basic implementation was copied from fs/ubifs/shrinker.c Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/Makefile | 1 + fs/f2fs/f2fs.h | 13 +++++++ fs/f2fs/shrinker.c | 104 +++++++++++++++++++++++++++++++++++++++++++++++++++++ fs/f2fs/super.c | 31 +++++++++++++++- 4 files changed, 148 insertions(+), 1 deletion(-) create mode 100644 fs/f2fs/shrinker.c (limited to 'fs/f2fs') diff --git a/fs/f2fs/Makefile b/fs/f2fs/Makefile index 396be1a39e55..005251b8d459 100644 --- a/fs/f2fs/Makefile +++ b/fs/f2fs/Makefile @@ -2,6 +2,7 @@ obj-$(CONFIG_F2FS_FS) += f2fs.o f2fs-y := dir.o file.o inode.o namei.o hash.o super.o inline.o f2fs-y += checkpoint.o gc.o data.o node.o segment.o recovery.o +f2fs-y += shrinker.o f2fs-$(CONFIG_F2FS_STAT_FS) += debug.o f2fs-$(CONFIG_F2FS_FS_XATTR) += xattr.o f2fs-$(CONFIG_F2FS_FS_POSIX_ACL) += acl.o diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 3aaa4b99050a..e82af8c7ee8c 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -791,6 +791,11 @@ struct f2fs_sb_info { /* For sysfs suppport */ struct kobject s_kobj; struct completion s_kobj_unregister; + + /* For shrinker support */ + struct list_head s_list; + struct mutex umount_mutex; + unsigned int shrinker_run_no; }; /* @@ -1951,6 +1956,14 @@ bool f2fs_empty_inline_dir(struct inode *); int f2fs_read_inline_dir(struct file *, struct dir_context *, struct f2fs_str *); +/* + * shrinker.c + */ +unsigned long f2fs_shrink_count(struct shrinker *, struct shrink_control *); +unsigned long f2fs_shrink_scan(struct shrinker *, struct shrink_control *); +void f2fs_join_shrinker(struct f2fs_sb_info *); +void f2fs_leave_shrinker(struct f2fs_sb_info *); + /* * crypto support */ diff --git a/fs/f2fs/shrinker.c b/fs/f2fs/shrinker.c new file mode 100644 index 000000000000..16e9b43635c2 --- /dev/null +++ b/fs/f2fs/shrinker.c @@ -0,0 +1,104 @@ +/* + *
f2fs shrinker support + * the basic infra was copied from fs/ubifs/shrinker.c + * + * Copyright (c) 2015 Motorola Mobility + * Copyright (c) 2015 Jaegeuk Kim + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#include +#include + +#include "f2fs.h" + +static LIST_HEAD(f2fs_list); +static DEFINE_SPINLOCK(f2fs_list_lock); +static unsigned int shrinker_run_no; + +unsigned long f2fs_shrink_count(struct shrinker *shrink, + struct shrink_control *sc) +{ + struct f2fs_sb_info *sbi; + struct list_head *p; + unsigned long count = 0; + + spin_lock(&f2fs_list_lock); + p = f2fs_list.next; + while (p != &f2fs_list) { + sbi = list_entry(p, struct f2fs_sb_info, s_list); + + /* stop f2fs_put_super */ + if (!mutex_trylock(&sbi->umount_mutex)) { + p = p->next; + continue; + } + spin_unlock(&f2fs_list_lock); + + /* TODO: count # of objects */ + + spin_lock(&f2fs_list_lock); + p = p->next; + mutex_unlock(&sbi->umount_mutex); + } + spin_unlock(&f2fs_list_lock); + return count; +} + +unsigned long f2fs_shrink_scan(struct shrinker *shrink, + struct shrink_control *sc) +{ + unsigned long nr = sc->nr_to_scan; + struct f2fs_sb_info *sbi; + struct list_head *p; + unsigned int run_no; + unsigned long freed = 0; + + spin_lock(&f2fs_list_lock); + do { + run_no = ++shrinker_run_no; + } while (run_no == 0); + p = f2fs_list.next; + while (p != &f2fs_list) { + sbi = list_entry(p, struct f2fs_sb_info, s_list); + + if (sbi->shrinker_run_no == run_no) + break; + + /* stop f2fs_put_super */ + if (!mutex_trylock(&sbi->umount_mutex)) { + p = p->next; + continue; + } + spin_unlock(&f2fs_list_lock); + + sbi->shrinker_run_no = run_no; + + /* TODO: shrink caches */ + + spin_lock(&f2fs_list_lock); + p = p->next; + list_move_tail(&sbi->s_list, &f2fs_list); + mutex_unlock(&sbi->umount_mutex); + if (freed >= nr) + break; + } + spin_unlock(&f2fs_list_lock); + return 
freed; +} + +void f2fs_join_shrinker(struct f2fs_sb_info *sbi) +{ + spin_lock(&f2fs_list_lock); + list_add_tail(&sbi->s_list, &f2fs_list); + spin_unlock(&f2fs_list_lock); +} + +void f2fs_leave_shrinker(struct f2fs_sb_info *sbi) +{ + spin_lock(&f2fs_list_lock); + list_del(&sbi->s_list); + spin_unlock(&f2fs_list_lock); +} diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index da277100dc90..bc7684b6d57a 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -39,6 +39,13 @@ static struct proc_dir_entry *f2fs_proc_root; static struct kmem_cache *f2fs_inode_cachep; static struct kset *f2fs_kset; +/* f2fs-wide shrinker description */ +static struct shrinker f2fs_shrinker_info = { + .scan_objects = f2fs_shrink_scan, + .count_objects = f2fs_shrink_count, + .seeks = DEFAULT_SEEKS, +}; + enum { Opt_gc_background, Opt_disable_roll_forward, @@ -500,6 +507,9 @@ static void f2fs_put_super(struct super_block *sb) stop_gc_thread(sbi); + /* prevent remaining shrinker jobs */ + mutex_lock(&sbi->umount_mutex); + /* * We don't need to do checkpoint when superblock is clean. 
* But, the previous checkpoint was not done by umount, it needs to do @@ -523,6 +533,9 @@ static void f2fs_put_super(struct super_block *sb) release_dirty_inode(sbi); release_discard_addrs(sbi); + f2fs_leave_shrinker(sbi); + mutex_unlock(&sbi->umount_mutex); + iput(sbi->node_inode); iput(sbi->meta_inode); @@ -972,6 +985,9 @@ static void init_sb_info(struct f2fs_sb_info *sbi) sbi->dir_level = DEF_DIR_LEVEL; clear_sbi_flag(sbi, SBI_NEED_FSCK); + + INIT_LIST_HEAD(&sbi->s_list); + mutex_init(&sbi->umount_mutex); } /* @@ -1214,6 +1230,8 @@ try_onemore: goto free_nm; } + f2fs_join_shrinker(sbi); + /* if there are nt orphan nodes free them */ recover_orphan_inodes(sbi); @@ -1310,7 +1328,10 @@ free_root_inode: dput(sb->s_root); sb->s_root = NULL; free_node_inode: + mutex_lock(&sbi->umount_mutex); + f2fs_leave_shrinker(sbi); iput(sbi->node_inode); + mutex_unlock(&sbi->umount_mutex); free_nm: destroy_node_manager(sbi); free_sm: @@ -1406,13 +1427,20 @@ static int __init init_f2fs_fs(void) err = f2fs_init_crypto(); if (err) goto free_kset; - err = register_filesystem(&f2fs_fs_type); + + err = register_shrinker(&f2fs_shrinker_info); if (err) goto free_crypto; + + err = register_filesystem(&f2fs_fs_type); + if (err) + goto free_shrinker; f2fs_create_root_stats(); f2fs_proc_root = proc_mkdir("fs/f2fs", NULL); return 0; +free_shrinker: + unregister_shrinker(&f2fs_shrinker_info); free_crypto: f2fs_exit_crypto(); free_kset: @@ -1435,6 +1463,7 @@ static void __exit exit_f2fs_fs(void) { remove_proc_entry("fs/f2fs", NULL); f2fs_destroy_root_stats(); + unregister_shrinker(&f2fs_shrinker_info); unregister_filesystem(&f2fs_fs_type); f2fs_exit_crypto(); destroy_extent_cache(); -- cgit v1.2.3 From 1b38dc8e74a366b92986755c304591e330f3c3e0 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 19 Jun 2015 15:36:07 -0700 Subject: f2fs: shrink nat_cache entries This patch registers shrinking nat_cache entries. 
Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 6 +++--- fs/f2fs/segment.c | 8 ++++++-- fs/f2fs/shrinker.c | 11 +++++++++-- 3 files changed, 18 insertions(+), 7 deletions(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 7dd63b794bfb..a05eb35a372c 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -328,11 +328,11 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni, int try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink) { struct f2fs_nm_info *nm_i = NM_I(sbi); + int nr = nr_shrink; - if (available_free_memory(sbi, NAT_ENTRIES)) + if (!down_write_trylock(&nm_i->nat_tree_lock)) return 0; - down_write(&nm_i->nat_tree_lock); while (nr_shrink && !list_empty(&nm_i->nat_entries)) { struct nat_entry *ne; ne = list_first_entry(&nm_i->nat_entries, @@ -341,7 +341,7 @@ int try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink) nr_shrink--; } up_write(&nm_i->nat_tree_lock); - return nr_shrink; + return nr - nr_shrink; } /* diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 61b97f9cb9f6..d5ee99258cbc 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -306,8 +306,12 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi) /* try to shrink extent cache when there is no enough memory */ f2fs_shrink_extent_tree(sbi, EXTENT_CACHE_SHRINK_NUMBER); - /* check the # of cached NAT entries and prefree segments */ - if (try_to_free_nats(sbi, NAT_ENTRY_PER_BLOCK) || + /* check the # of cached NAT entries */ + if (!available_free_memory(sbi, NAT_ENTRIES)) + try_to_free_nats(sbi, NAT_ENTRY_PER_BLOCK); + + /* checkpoint is the only way to shrink partial cached entries */ + if (!available_free_memory(sbi, NAT_ENTRIES) || excess_prefree_segs(sbi) || !available_free_memory(sbi, INO_ENTRIES)) f2fs_sync_fs(sbi->sb, true); diff --git a/fs/f2fs/shrinker.c b/fs/f2fs/shrinker.c index 16e9b43635c2..c4bd6ee5936c 100644 --- a/fs/f2fs/shrinker.c +++ b/fs/f2fs/shrinker.c @@ -18,6 +18,11 @@ static LIST_HEAD(f2fs_list); 
static DEFINE_SPINLOCK(f2fs_list_lock); static unsigned int shrinker_run_no; +static unsigned long __count_nat_entries(struct f2fs_sb_info *sbi) +{ + return NM_I(sbi)->nat_cnt - NM_I(sbi)->dirty_nat_cnt; +} + unsigned long f2fs_shrink_count(struct shrinker *shrink, struct shrink_control *sc) { @@ -37,7 +42,8 @@ unsigned long f2fs_shrink_count(struct shrinker *shrink, } spin_unlock(&f2fs_list_lock); - /* TODO: count # of objects */ + /* shrink clean nat cache entries */ + count += __count_nat_entries(sbi); spin_lock(&f2fs_list_lock); p = p->next; @@ -76,7 +82,8 @@ unsigned long f2fs_shrink_scan(struct shrinker *shrink, sbi->shrinker_run_no = run_no; - /* TODO: shrink caches */ + /* shrink clean nat cache entries */ + freed += try_to_free_nats(sbi, nr); spin_lock(&f2fs_list_lock); p = p->next; -- cgit v1.2.3 From 554df79e523d14dab475eb6650cb96617256ceea Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 19 Jun 2015 13:41:23 -0700 Subject: f2fs: shrink extent_cache entries This patch registers shrinking extent_caches. 
Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 19 +++++++++++-------- fs/f2fs/f2fs.h | 2 +- fs/f2fs/segment.c | 3 ++- fs/f2fs/shrinker.c | 14 +++++++++++++- 4 files changed, 27 insertions(+), 11 deletions(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 982a1a58efd7..55b2a79b3526 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -767,7 +767,7 @@ out: update_inode_page(inode); } -void f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink) +unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink) { struct extent_tree *treevec[EXT_TREE_VEC_SIZE]; struct extent_node *en, *tmp; @@ -778,10 +778,7 @@ void f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink) unsigned int node_cnt = 0, tree_cnt = 0; if (!test_opt(sbi, EXTENT_CACHE)) - return; - - if (available_free_memory(sbi, EXTENT_CACHE)) - return; + return 0; spin_lock(&sbi->extent_lock); list_for_each_entry_safe(en, tmp, &sbi->extent_list, list) { @@ -791,7 +788,9 @@ void f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink) } spin_unlock(&sbi->extent_lock); - down_read(&sbi->extent_tree_lock); + if (!down_read_trylock(&sbi->extent_tree_lock)) + goto out; + while ((found = radix_tree_gang_lookup(&sbi->extent_tree_root, (void **)treevec, ino, EXT_TREE_VEC_SIZE))) { unsigned i; @@ -809,7 +808,9 @@ void f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink) } up_read(&sbi->extent_tree_lock); - down_write(&sbi->extent_tree_lock); + if (!down_write_trylock(&sbi->extent_tree_lock)) + goto out; + radix_tree_for_each_slot(slot, &sbi->extent_tree_root, &iter, F2FS_ROOT_INO(sbi)) { struct extent_tree *et = (struct extent_tree *)*slot; @@ -822,8 +823,10 @@ void f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink) } } up_write(&sbi->extent_tree_lock); - +out: trace_f2fs_shrink_extent_tree(sbi, node_cnt, tree_cnt); + + return node_cnt + tree_cnt; } void f2fs_destroy_extent_tree(struct inode *inode) diff 
--git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index e82af8c7ee8c..eeef3eb45f8e 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1754,7 +1754,7 @@ void f2fs_submit_page_mbio(struct f2fs_io_info *); void set_data_blkaddr(struct dnode_of_data *); int reserve_new_block(struct dnode_of_data *); int f2fs_reserve_block(struct dnode_of_data *, pgoff_t); -void f2fs_shrink_extent_tree(struct f2fs_sb_info *, int); +unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *, int); void f2fs_destroy_extent_tree(struct inode *); void f2fs_init_extent_cache(struct inode *, struct f2fs_extent *); void f2fs_update_extent_cache(struct dnode_of_data *); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index d5ee99258cbc..f7bfc3b7d934 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -304,7 +304,8 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi) void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi) { /* try to shrink extent cache when there is no enough memory */ - f2fs_shrink_extent_tree(sbi, EXTENT_CACHE_SHRINK_NUMBER); + if (!available_free_memory(sbi, EXTENT_CACHE)) + f2fs_shrink_extent_tree(sbi, EXTENT_CACHE_SHRINK_NUMBER); /* check the # of cached NAT entries */ if (!available_free_memory(sbi, NAT_ENTRIES)) diff --git a/fs/f2fs/shrinker.c b/fs/f2fs/shrinker.c index c4bd6ee5936c..1f0a131be3d2 100644 --- a/fs/f2fs/shrinker.c +++ b/fs/f2fs/shrinker.c @@ -23,6 +23,11 @@ static unsigned long __count_nat_entries(struct f2fs_sb_info *sbi) return NM_I(sbi)->nat_cnt - NM_I(sbi)->dirty_nat_cnt; } +static unsigned long __count_extent_cache(struct f2fs_sb_info *sbi) +{ + return sbi->total_ext_tree + atomic_read(&sbi->total_ext_node); +} + unsigned long f2fs_shrink_count(struct shrinker *shrink, struct shrink_control *sc) { @@ -42,6 +47,9 @@ unsigned long f2fs_shrink_count(struct shrinker *shrink, } spin_unlock(&f2fs_list_lock); + /* count extent cache entries */ + count += __count_extent_cache(sbi); + /* shrink clean nat cache entries */ count += __count_nat_entries(sbi); @@ -82,8 
+90,12 @@ unsigned long f2fs_shrink_scan(struct shrinker *shrink, sbi->shrinker_run_no = run_no; + /* shrink extent cache entries */ + freed += f2fs_shrink_extent_tree(sbi, nr >> 1); + /* shrink clean nat cache entries */ - freed += try_to_free_nats(sbi, nr); + if (freed < nr) + freed += try_to_free_nats(sbi, nr - freed); spin_lock(&f2fs_list_lock); p = p->next; -- cgit v1.2.3 From 7daaea256de42da112805703e3c77f08973156b3 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 25 Jun 2015 17:43:04 -0700 Subject: f2fs: add noextent_cache mount option This patch adds noextent_cache mount option. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/filesystems/f2fs.txt | 4 +++- fs/f2fs/super.c | 7 +++++++ 2 files changed, 10 insertions(+), 1 deletion(-) (limited to 'fs/f2fs') diff --git a/Documentation/filesystems/f2fs.txt b/Documentation/filesystems/f2fs.txt index e9e750e59efc..e2d5105b7214 100644 --- a/Documentation/filesystems/f2fs.txt +++ b/Documentation/filesystems/f2fs.txt @@ -143,7 +143,9 @@ fastboot This option is used when a system wants to reduce mount extent_cache Enable an extent cache based on rb-tree, it can cache as many as extent which map between contiguous logical address and physical address per inode, resulting in - increasing the cache hit ratio. + increasing the cache hit ratio. Set by default. +noextent_cache Diable an extent cache based on rb-tree explicitly, see + the above extent_cache mount option. noinline_data Disable the inline data feature, inline data feature is enabled by default. 
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index bc7684b6d57a..92520228ce71 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -65,6 +65,7 @@ enum { Opt_nobarrier, Opt_fastboot, Opt_extent_cache, + Opt_noextent_cache, Opt_noinline_data, Opt_err, }; @@ -88,6 +89,7 @@ static match_table_t f2fs_tokens = { {Opt_nobarrier, "nobarrier"}, {Opt_fastboot, "fastboot"}, {Opt_extent_cache, "extent_cache"}, + {Opt_noextent_cache, "noextent_cache"}, {Opt_noinline_data, "noinline_data"}, {Opt_err, NULL}, }; @@ -389,6 +391,9 @@ static int parse_options(struct super_block *sb, char *options) case Opt_extent_cache: set_opt(sbi, EXTENT_CACHE); break; + case Opt_noextent_cache: + clear_opt(sbi, EXTENT_CACHE); + break; case Opt_noinline_data: clear_opt(sbi, INLINE_DATA); break; @@ -662,6 +667,8 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) seq_puts(seq, ",fastboot"); if (test_opt(sbi, EXTENT_CACHE)) seq_puts(seq, ",extent_cache"); + else + seq_puts(seq, ",noextent_cache"); seq_printf(seq, ",active_logs=%u", sbi->active_logs); return 0; -- cgit v1.2.3 From 3e72f721390dc14e7b33fda812843c0725810106 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 19 Jun 2015 17:53:26 -0700 Subject: f2fs: use extent_cache by default We don't need to handle the duplicate extent information. 
The integrated rule is: - update on-disk extent with largest one tracked by in-memory extent_cache - destroy extent_tree for the truncation case - drop per-inode extent_cache by shrinker Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 357 ++++++++++++++++------------------------------------- fs/f2fs/f2fs.h | 20 ++- fs/f2fs/inode.c | 18 ++- fs/f2fs/namei.c | 2 + fs/f2fs/shrinker.c | 2 + fs/f2fs/super.c | 8 +- 6 files changed, 142 insertions(+), 265 deletions(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 55b2a79b3526..be0945cd9808 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -266,103 +266,6 @@ int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index) return err; } -static bool lookup_extent_info(struct inode *inode, pgoff_t pgofs, - struct extent_info *ei) -{ - struct f2fs_inode_info *fi = F2FS_I(inode); - pgoff_t start_fofs, end_fofs; - block_t start_blkaddr; - - read_lock(&fi->ext_lock); - if (fi->ext.len == 0) { - read_unlock(&fi->ext_lock); - return false; - } - - stat_inc_total_hit(inode->i_sb); - - start_fofs = fi->ext.fofs; - end_fofs = fi->ext.fofs + fi->ext.len - 1; - start_blkaddr = fi->ext.blk; - - if (pgofs >= start_fofs && pgofs <= end_fofs) { - *ei = fi->ext; - stat_inc_read_hit(inode->i_sb); - read_unlock(&fi->ext_lock); - return true; - } - read_unlock(&fi->ext_lock); - return false; -} - -static bool update_extent_info(struct inode *inode, pgoff_t fofs, - block_t blkaddr) -{ - struct f2fs_inode_info *fi = F2FS_I(inode); - pgoff_t start_fofs, end_fofs; - block_t start_blkaddr, end_blkaddr; - int need_update = true; - - write_lock(&fi->ext_lock); - - start_fofs = fi->ext.fofs; - end_fofs = fi->ext.fofs + fi->ext.len - 1; - start_blkaddr = fi->ext.blk; - end_blkaddr = fi->ext.blk + fi->ext.len - 1; - - /* Drop and initialize the matched extent */ - if (fi->ext.len == 1 && fofs == start_fofs) - fi->ext.len = 0; - - /* Initial extent */ - if (fi->ext.len == 0) { - if (blkaddr != NULL_ADDR) { 
- fi->ext.fofs = fofs; - fi->ext.blk = blkaddr; - fi->ext.len = 1; - } - goto end_update; - } - - /* Front merge */ - if (fofs == start_fofs - 1 && blkaddr == start_blkaddr - 1) { - fi->ext.fofs--; - fi->ext.blk--; - fi->ext.len++; - goto end_update; - } - - /* Back merge */ - if (fofs == end_fofs + 1 && blkaddr == end_blkaddr + 1) { - fi->ext.len++; - goto end_update; - } - - /* Split the existing extent */ - if (fi->ext.len > 1 && - fofs >= start_fofs && fofs <= end_fofs) { - if ((end_fofs - fofs) < (fi->ext.len >> 1)) { - fi->ext.len = fofs - start_fofs; - } else { - fi->ext.fofs = fofs + 1; - fi->ext.blk = start_blkaddr + fofs - start_fofs + 1; - fi->ext.len -= fofs - start_fofs + 1; - } - } else { - need_update = false; - } - - /* Finally, if the extent is very fragmented, let's drop the cache. */ - if (fi->ext.len < F2FS_MIN_EXTENT_LEN) { - fi->ext.len = 0; - set_inode_flag(fi, FI_NO_EXTENT); - need_update = true; - } -end_update: - write_unlock(&fi->ext_lock); - return need_update; -} - static struct extent_node *__attach_extent_node(struct f2fs_sb_info *sbi, struct extent_tree *et, struct extent_info *ei, struct rb_node *parent, struct rb_node **p) @@ -394,23 +297,6 @@ static void __detach_extent_node(struct f2fs_sb_info *sbi, et->cached_en = NULL; } -static struct extent_tree *__find_extent_tree(struct f2fs_sb_info *sbi, - nid_t ino) -{ - struct extent_tree *et; - - down_read(&sbi->extent_tree_lock); - et = radix_tree_lookup(&sbi->extent_tree_root, ino); - if (!et) { - up_read(&sbi->extent_tree_lock); - return NULL; - } - atomic_inc(&et->refcount); - up_read(&sbi->extent_tree_lock); - - return et; -} - static struct extent_tree *__grab_extent_tree(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); @@ -434,6 +320,9 @@ static struct extent_tree *__grab_extent_tree(struct inode *inode) atomic_inc(&et->refcount); up_write(&sbi->extent_tree_lock); + /* never died untill evict_inode */ + F2FS_I(inode)->extent_tree = et; + return et; } @@ -522,7 
+411,7 @@ static struct extent_node *__insert_extent_tree(struct f2fs_sb_info *sbi, en->ei.blk = ei->blk; en->ei.len += ei->len; *den = __try_back_merge(sbi, et, en); - return en; + goto update_out; } p = &(*p)->rb_left; } else if (ei->fofs >= en->ei.fofs + en->ei.len) { @@ -530,7 +419,7 @@ static struct extent_node *__insert_extent_tree(struct f2fs_sb_info *sbi, f2fs_bug_on(sbi, !den); en->ei.len += ei->len; *den = __try_front_merge(sbi, et, en); - return en; + goto update_out; } p = &(*p)->rb_right; } else { @@ -538,7 +427,14 @@ static struct extent_node *__insert_extent_tree(struct f2fs_sb_info *sbi, } } - return __attach_extent_node(sbi, et, ei, parent, p); + en = __attach_extent_node(sbi, et, ei, parent, p); + if (!en) + return NULL; +update_out: + if (en->ei.len > et->largest.len) + et->largest = en->ei; + et->cached_en = en; + return en; } static unsigned int __free_extent_tree(struct f2fs_sb_info *sbi, @@ -570,51 +466,56 @@ static unsigned int __free_extent_tree(struct f2fs_sb_info *sbi, return count - et->count; } -static void f2fs_init_extent_tree(struct inode *inode, - struct f2fs_extent *i_ext) +static void __drop_largest_extent(struct inode *inode, pgoff_t fofs) +{ + struct extent_info *largest = &F2FS_I(inode)->extent_tree->largest; + + if (largest->fofs <= fofs && largest->fofs + largest->len > fofs) + largest->len = 0; +} + +void f2fs_init_extent_tree(struct inode *inode, struct f2fs_extent *i_ext) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct extent_tree *et; struct extent_node *en; struct extent_info ei; - if (le32_to_cpu(i_ext->len) < F2FS_MIN_EXTENT_LEN) + if (!f2fs_may_extent_tree(inode)) return; et = __grab_extent_tree(inode); - write_lock(&et->lock); - if (et->count) - goto out; + if (!i_ext || le32_to_cpu(i_ext->len) < F2FS_MIN_EXTENT_LEN) + return; set_extent_info(&ei, le32_to_cpu(i_ext->fofs), le32_to_cpu(i_ext->blk), le32_to_cpu(i_ext->len)); + write_lock(&et->lock); + if (et->count) + goto out; + en = __insert_extent_tree(sbi, 
et, &ei, NULL); if (en) { - et->cached_en = en; - spin_lock(&sbi->extent_lock); list_add_tail(&en->list, &sbi->extent_list); spin_unlock(&sbi->extent_lock); } out: write_unlock(&et->lock); - atomic_dec(&et->refcount); } static bool f2fs_lookup_extent_tree(struct inode *inode, pgoff_t pgofs, struct extent_info *ei) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct extent_tree *et; + struct extent_tree *et = F2FS_I(inode)->extent_tree; struct extent_node *en; - trace_f2fs_lookup_extent_tree_start(inode, pgofs); + f2fs_bug_on(sbi, !et); - et = __find_extent_tree(sbi, inode->i_ino); - if (!et) - return false; + trace_f2fs_lookup_extent_tree_start(inode, pgofs); read_lock(&et->lock); en = __lookup_extent_tree(et, pgofs); @@ -631,27 +532,38 @@ static bool f2fs_lookup_extent_tree(struct inode *inode, pgoff_t pgofs, read_unlock(&et->lock); trace_f2fs_lookup_extent_tree_end(inode, pgofs, en); - - atomic_dec(&et->refcount); return en ? true : false; } -static void f2fs_update_extent_tree(struct inode *inode, pgoff_t fofs, +/* return true, if on-disk extent should be updated */ +static bool f2fs_update_extent_tree(struct inode *inode, pgoff_t fofs, block_t blkaddr) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct extent_tree *et; + struct extent_tree *et = F2FS_I(inode)->extent_tree; struct extent_node *en = NULL, *en1 = NULL, *en2 = NULL, *en3 = NULL; struct extent_node *den = NULL; - struct extent_info ei, dei; + struct extent_info ei, dei, prev; unsigned int endofs; - trace_f2fs_update_extent_tree(inode, fofs, blkaddr); + if (!et) + return false; - et = __grab_extent_tree(inode); + trace_f2fs_update_extent_tree(inode, fofs, blkaddr); write_lock(&et->lock); + if (is_inode_flag_set(F2FS_I(inode), FI_NO_EXTENT)) { + write_unlock(&et->lock); + return false; + } + + prev = et->largest; + dei.len = 0; + + /* we do not guarantee that the largest extent is cached all the time */ + __drop_largest_extent(inode, fofs); + /* 1. 
lookup and remove existing extent info in cache */ en = __lookup_extent_tree(et, fofs); if (!en) @@ -683,6 +595,14 @@ update_extent: if (blkaddr) { set_extent_info(&ei, fofs, blkaddr, 1); en3 = __insert_extent_tree(sbi, et, &ei, &den); + + /* give up extent_cache, if split and small updates happen */ + if (dei.len >= 1 && + prev.len < F2FS_MIN_EXTENT_LEN && + et->largest.len < F2FS_MIN_EXTENT_LEN) { + et->largest.len = 0; + set_inode_flag(F2FS_I(inode), FI_NO_EXTENT); + } } /* 4. update in global extent list */ @@ -714,57 +634,12 @@ update_extent: if (den) kmem_cache_free(extent_node_slab, den); - write_unlock(&et->lock); - atomic_dec(&et->refcount); -} - -void f2fs_preserve_extent_tree(struct inode *inode) -{ - struct extent_tree *et; - struct extent_info *ext = &F2FS_I(inode)->ext; - bool sync = false; - - if (!test_opt(F2FS_I_SB(inode), EXTENT_CACHE)) - return; - - et = __find_extent_tree(F2FS_I_SB(inode), inode->i_ino); - if (!et) { - if (ext->len) { - ext->len = 0; - update_inode_page(inode); - } - return; - } - - read_lock(&et->lock); - if (et->count) { - struct extent_node *en; - - if (et->cached_en) { - en = et->cached_en; - } else { - struct rb_node *node = rb_first(&et->root); - - if (!node) - node = rb_last(&et->root); - en = rb_entry(node, struct extent_node, rb_node); - } - - if (__is_extent_same(ext, &en->ei)) - goto out; + if (is_inode_flag_set(F2FS_I(inode), FI_NO_EXTENT)) + __free_extent_tree(sbi, et, true); - *ext = en->ei; - sync = true; - } else if (ext->len) { - ext->len = 0; - sync = true; - } -out: - read_unlock(&et->lock); - atomic_dec(&et->refcount); + write_unlock(&et->lock); - if (sync) - update_inode_page(inode); + return !__is_extent_same(&prev, &et->largest); } unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink) @@ -772,8 +647,7 @@ unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink) struct extent_tree *treevec[EXT_TREE_VEC_SIZE]; struct extent_node *en, *tmp; unsigned long ino = 
F2FS_ROOT_INO(sbi); - struct radix_tree_iter iter; - void **slot; + struct radix_tree_root *root = &sbi->extent_tree_root; unsigned int found; unsigned int node_cnt = 0, tree_cnt = 0; @@ -788,10 +662,10 @@ unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink) } spin_unlock(&sbi->extent_lock); - if (!down_read_trylock(&sbi->extent_tree_lock)) + if (!down_write_trylock(&sbi->extent_tree_lock)) goto out; - while ((found = radix_tree_gang_lookup(&sbi->extent_tree_root, + while ((found = radix_tree_gang_lookup(root, (void **)treevec, ino, EXT_TREE_VEC_SIZE))) { unsigned i; @@ -799,27 +673,15 @@ unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink) for (i = 0; i < found; i++) { struct extent_tree *et = treevec[i]; - atomic_inc(&et->refcount); write_lock(&et->lock); node_cnt += __free_extent_tree(sbi, et, false); write_unlock(&et->lock); - atomic_dec(&et->refcount); - } - } - up_read(&sbi->extent_tree_lock); - - if (!down_write_trylock(&sbi->extent_tree_lock)) - goto out; - - radix_tree_for_each_slot(slot, &sbi->extent_tree_root, &iter, - F2FS_ROOT_INO(sbi)) { - struct extent_tree *et = (struct extent_tree *)*slot; - - if (!atomic_read(&et->refcount) && !et->count) { - radix_tree_delete(&sbi->extent_tree_root, et->ino); - kmem_cache_free(extent_tree_slab, et); - sbi->total_ext_tree--; - tree_cnt++; + if (!atomic_read(&et->refcount) && !et->count) { + radix_tree_delete(root, et->ino); + kmem_cache_free(extent_tree_slab, et); + sbi->total_ext_tree--; + tree_cnt++; + } } } up_write(&sbi->extent_tree_lock); @@ -829,63 +691,61 @@ out: return node_cnt + tree_cnt; } -void f2fs_destroy_extent_tree(struct inode *inode) +unsigned int f2fs_destroy_extent_node(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct extent_tree *et; + struct extent_tree *et = F2FS_I(inode)->extent_tree; unsigned int node_cnt = 0; - if (!test_opt(sbi, EXTENT_CACHE)) - return; - - et = __find_extent_tree(sbi, inode->i_ino); if (!et) - 
goto out; + return 0; - /* free all extent info belong to this extent tree */ write_lock(&et->lock); node_cnt = __free_extent_tree(sbi, et, true); write_unlock(&et->lock); - atomic_dec(&et->refcount); + return node_cnt; +} - /* try to find and delete extent tree entry in radix tree */ - down_write(&sbi->extent_tree_lock); - et = radix_tree_lookup(&sbi->extent_tree_root, inode->i_ino); - if (!et) { - up_write(&sbi->extent_tree_lock); - goto out; +void f2fs_destroy_extent_tree(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct extent_tree *et = F2FS_I(inode)->extent_tree; + unsigned int node_cnt = 0; + + if (!et) + return; + + if (inode->i_nlink && !is_bad_inode(inode) && et->count) { + atomic_dec(&et->refcount); + return; } + + /* free all extent info belong to this extent tree */ + node_cnt = f2fs_destroy_extent_node(inode); + + /* delete extent tree entry in radix tree */ + down_write(&sbi->extent_tree_lock); + atomic_dec(&et->refcount); f2fs_bug_on(sbi, atomic_read(&et->refcount) || et->count); radix_tree_delete(&sbi->extent_tree_root, inode->i_ino); kmem_cache_free(extent_tree_slab, et); sbi->total_ext_tree--; up_write(&sbi->extent_tree_lock); -out: - trace_f2fs_destroy_extent_tree(inode, node_cnt); - return; -} -void f2fs_init_extent_cache(struct inode *inode, struct f2fs_extent *i_ext) -{ - if (test_opt(F2FS_I_SB(inode), EXTENT_CACHE)) - f2fs_init_extent_tree(inode, i_ext); + F2FS_I(inode)->extent_tree = NULL; - write_lock(&F2FS_I(inode)->ext_lock); - get_extent_info(&F2FS_I(inode)->ext, *i_ext); - write_unlock(&F2FS_I(inode)->ext_lock); + trace_f2fs_destroy_extent_tree(inode, node_cnt); + return; } static bool f2fs_lookup_extent_cache(struct inode *inode, pgoff_t pgofs, struct extent_info *ei) { - if (is_inode_flag_set(F2FS_I(inode), FI_NO_EXTENT)) + if (!f2fs_may_extent_tree(inode)) return false; - if (test_opt(F2FS_I_SB(inode), EXTENT_CACHE)) - return f2fs_lookup_extent_tree(inode, pgofs, ei); - - return 
lookup_extent_info(inode, pgofs, ei); + return f2fs_lookup_extent_tree(inode, pgofs, ei); } void f2fs_update_extent_cache(struct dnode_of_data *dn) @@ -893,19 +753,15 @@ void f2fs_update_extent_cache(struct dnode_of_data *dn) struct f2fs_inode_info *fi = F2FS_I(dn->inode); pgoff_t fofs; - f2fs_bug_on(F2FS_I_SB(dn->inode), dn->data_blkaddr == NEW_ADDR); - - if (is_inode_flag_set(fi, FI_NO_EXTENT)) + if (!f2fs_may_extent_tree(dn->inode)) return; + f2fs_bug_on(F2FS_I_SB(dn->inode), dn->data_blkaddr == NEW_ADDR); + fofs = start_bidx_of_node(ofs_of_node(dn->node_page), fi) + dn->ofs_in_node; - /* we should call update_extent_info() to update on-disk extent */ - if (test_opt(F2FS_I_SB(dn->inode), EXTENT_CACHE)) - f2fs_update_extent_tree(dn->inode, fofs, dn->data_blkaddr); - - if (update_extent_info(dn->inode, fofs, dn->data_blkaddr)) + if (f2fs_update_extent_tree(dn->inode, fofs, dn->data_blkaddr)) sync_inode_page(dn); } @@ -1109,8 +965,6 @@ alloc: allocate_data_block(sbi, NULL, dn->data_blkaddr, &dn->data_blkaddr, &sum, seg); - - /* direct IO doesn't use extent cache to maximize the performance */ set_data_blkaddr(dn); /* update i_size */ @@ -1119,6 +973,9 @@ alloc: if (i_size_read(dn->inode) < ((fofs + 1) << PAGE_CACHE_SHIFT)) i_size_write(dn->inode, ((fofs + 1) << PAGE_CACHE_SHIFT)); + /* direct IO doesn't use extent cache to maximize the performance */ + __drop_largest_extent(dn->inode, fofs); + return 0; } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index eeef3eb45f8e..1e6f54d8b464 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -349,6 +349,7 @@ struct extent_tree { nid_t ino; /* inode number */ struct rb_root root; /* root of extent info rb-tree */ struct extent_node *cached_en; /* recently accessed extent node */ + struct extent_info largest; /* largested extent info */ rwlock_t lock; /* protect extent info rb-tree */ atomic_t refcount; /* reference count of rb-tree */ unsigned int count; /* # of extent node in rb-tree*/ @@ -420,14 +421,14 @@ struct 
f2fs_inode_info { unsigned int clevel; /* maximum level of given file name */ nid_t i_xattr_nid; /* node id that contains xattrs */ unsigned long long xattr_ver; /* cp version of xattr modification */ - struct extent_info ext; /* in-memory extent cache entry */ - rwlock_t ext_lock; /* rwlock for single extent cache */ struct inode_entry *dirty_dir; /* the pointer of dirty dir */ struct radix_tree_root inmem_root; /* radix tree for inmem pages */ struct list_head inmem_pages; /* inmemory pages managed by f2fs */ struct mutex inmem_lock; /* lock for inmemory pages */ + struct extent_tree *extent_tree; /* cached extent_tree entry */ + #ifdef CONFIG_F2FS_FS_ENCRYPTION /* Encryption params */ struct f2fs_crypt_info *i_crypt_info; @@ -1548,6 +1549,17 @@ static inline bool is_dot_dotdot(const struct qstr *str) return false; } +static inline bool f2fs_may_extent_tree(struct inode *inode) +{ + mode_t mode = inode->i_mode; + + if (!test_opt(F2FS_I_SB(inode), EXTENT_CACHE) || + is_inode_flag_set(F2FS_I(inode), FI_NO_EXTENT)) + return false; + + return S_ISREG(mode); +} + #define get_inode_mode(i) \ ((is_inode_flag_set(F2FS_I(i), FI_ACL_MODE)) ? 
\ (F2FS_I(i)->i_acl_mode) : ((i)->i_mode)) @@ -1755,10 +1767,10 @@ void set_data_blkaddr(struct dnode_of_data *); int reserve_new_block(struct dnode_of_data *); int f2fs_reserve_block(struct dnode_of_data *, pgoff_t); unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *, int); +void f2fs_init_extent_tree(struct inode *, struct f2fs_extent *); +unsigned int f2fs_destroy_extent_node(struct inode *); void f2fs_destroy_extent_tree(struct inode *); -void f2fs_init_extent_cache(struct inode *, struct f2fs_extent *); void f2fs_update_extent_cache(struct dnode_of_data *); -void f2fs_preserve_extent_tree(struct inode *); struct page *get_read_data_page(struct inode *, pgoff_t, int); struct page *find_data_page(struct inode *, pgoff_t); struct page *get_lock_data_page(struct inode *, pgoff_t); diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 757fed253697..978a7261a791 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -139,7 +139,7 @@ static int do_read_inode(struct inode *inode) fi->i_pino = le32_to_cpu(ri->i_pino); fi->i_dir_level = ri->i_dir_level; - f2fs_init_extent_cache(inode, &ri->i_ext); + f2fs_init_extent_tree(inode, &ri->i_ext); get_inline_info(fi, ri); @@ -237,10 +237,11 @@ void update_inode(struct inode *inode, struct page *node_page) ri->i_size = cpu_to_le64(i_size_read(inode)); ri->i_blocks = cpu_to_le64(inode->i_blocks); - read_lock(&F2FS_I(inode)->ext_lock); - set_raw_extent(&F2FS_I(inode)->ext, &ri->i_ext); - read_unlock(&F2FS_I(inode)->ext_lock); - + if (F2FS_I(inode)->extent_tree) + set_raw_extent(&F2FS_I(inode)->extent_tree->largest, + &ri->i_ext); + else + memset(&ri->i_ext, 0, sizeof(ri->i_ext)); set_raw_inline(F2FS_I(inode), ri); ri->i_atime = cpu_to_le64(inode->i_atime.tv_sec); @@ -331,6 +332,8 @@ void f2fs_evict_inode(struct inode *inode) f2fs_bug_on(sbi, get_dirty_pages(inode)); remove_dirty_dir_inode(inode); + f2fs_destroy_extent_tree(inode); + if (inode->i_nlink || is_bad_inode(inode)) goto no_delete; @@ -350,11 +353,6 @@ no_delete: 
stat_dec_inline_dir(inode); stat_dec_inline_inode(inode); - /* update extent info in inode */ - if (inode->i_nlink) - f2fs_preserve_extent_tree(inode); - f2fs_destroy_extent_tree(inode); - invalidate_mapping_pages(NODE_MAPPING(sbi), inode->i_ino, inode->i_ino); if (xnid) invalidate_mapping_pages(NODE_MAPPING(sbi), xnid, xnid); diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 08656fca8f83..df315dcdd35d 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -65,6 +65,8 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) if (f2fs_may_inline_dentry(inode)) set_inode_flag(F2FS_I(inode), FI_INLINE_DENTRY); + f2fs_init_extent_tree(inode, NULL); + stat_inc_inline_inode(inode); stat_inc_inline_dir(inode); diff --git a/fs/f2fs/shrinker.c b/fs/f2fs/shrinker.c index 1f0a131be3d2..9aa4235cd304 100644 --- a/fs/f2fs/shrinker.c +++ b/fs/f2fs/shrinker.c @@ -117,6 +117,8 @@ void f2fs_join_shrinker(struct f2fs_sb_info *sbi) void f2fs_leave_shrinker(struct f2fs_sb_info *sbi) { + f2fs_shrink_extent_tree(sbi, __count_extent_cache(sbi)); + spin_lock(&f2fs_list_lock); list_del(&sbi->s_list); spin_unlock(&f2fs_list_lock); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 92520228ce71..0083b8559c9b 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -422,7 +422,6 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) atomic_set(&fi->dirty_pages, 0); fi->i_current_depth = 1; fi->i_advise = 0; - rwlock_init(&fi->ext_lock); init_rwsem(&fi->i_sem); INIT_RADIX_TREE(&fi->inmem_root, GFP_NOFS); INIT_LIST_HEAD(&fi->inmem_pages); @@ -453,12 +452,17 @@ static int f2fs_drop_inode(struct inode *inode) */ if (!inode_unhashed(inode) && inode->i_state & I_SYNC) { if (!inode->i_nlink && !is_bad_inode(inode)) { + /* to avoid evict_inode call simultaneously */ + atomic_inc(&inode->i_count); spin_unlock(&inode->i_lock); /* some remained atomic pages should discarded */ if (f2fs_is_atomic_file(inode)) commit_inmem_pages(inode, true); + /* should remain fi->extent_tree for 
writepage */ + f2fs_destroy_extent_node(inode); + sb_start_intwrite(inode->i_sb); i_size_write(inode, 0); @@ -473,6 +477,7 @@ static int f2fs_drop_inode(struct inode *inode) F2FS_I(inode)->i_crypt_info); #endif spin_lock(&inode->i_lock); + atomic_dec(&inode->i_count); } return 0; } @@ -721,6 +726,7 @@ static void default_options(struct f2fs_sb_info *sbi) set_opt(sbi, BG_GC); set_opt(sbi, INLINE_DATA); + set_opt(sbi, EXTENT_CACHE); #ifdef CONFIG_F2FS_FS_XATTR set_opt(sbi, XATTR_USER); -- cgit v1.2.3 From 84bc926c076963d5b992640f5c8d242754801fd6 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 29 Jun 2015 16:01:14 -0700 Subject: f2fs: check the largest extent at look-up time Because of the extent shrinker or other -ENOMEM scenarios, it cannot guarantee that the largest extent would be cached in the tree all the time. Instead of relying on extent_tree, we can simply check the cached one in extent tree accordingly. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 16 ++++++++++++++-- include/trace/events/f2fs.h | 12 ++++++------ 2 files changed, 20 insertions(+), 8 deletions(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index be0945cd9808..cdc1c2b781b8 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -512,12 +512,22 @@ static bool f2fs_lookup_extent_tree(struct inode *inode, pgoff_t pgofs, struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct extent_tree *et = F2FS_I(inode)->extent_tree; struct extent_node *en; + bool ret = false; f2fs_bug_on(sbi, !et); trace_f2fs_lookup_extent_tree_start(inode, pgofs); read_lock(&et->lock); + + if (et->largest.fofs <= pgofs && + et->largest.fofs + et->largest.len > pgofs) { + *ei = et->largest; + ret = true; + stat_inc_read_hit(sbi->sb); + goto out; + } + en = __lookup_extent_tree(et, pgofs); if (en) { *ei = en->ei; @@ -526,13 +536,15 @@ static bool f2fs_lookup_extent_tree(struct inode *inode, pgoff_t pgofs, list_move_tail(&en->list, &sbi->extent_list); et->cached_en = en; 
spin_unlock(&sbi->extent_lock); + ret = true; stat_inc_read_hit(sbi->sb); } +out: stat_inc_total_hit(sbi->sb); read_unlock(&et->lock); - trace_f2fs_lookup_extent_tree_end(inode, pgofs, en); - return en ? true : false; + trace_f2fs_lookup_extent_tree_end(inode, pgofs, ei); + return ret; } /* return true, if on-disk extent should be updated */ diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index 04856a2d8c82..a01946514b5a 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -1099,11 +1099,11 @@ TRACE_EVENT(f2fs_lookup_extent_tree_start, TRACE_EVENT_CONDITION(f2fs_lookup_extent_tree_end, TP_PROTO(struct inode *inode, unsigned int pgofs, - struct extent_node *en), + struct extent_info *ei), - TP_ARGS(inode, pgofs, en), + TP_ARGS(inode, pgofs, ei), - TP_CONDITION(en), + TP_CONDITION(ei), TP_STRUCT__entry( __field(dev_t, dev) @@ -1118,9 +1118,9 @@ TRACE_EVENT_CONDITION(f2fs_lookup_extent_tree_end, __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->pgofs = pgofs; - __entry->fofs = en->ei.fofs; - __entry->blk = en->ei.blk; - __entry->len = en->ei.len; + __entry->fofs = ei->fofs; + __entry->blk = ei->blk; + __entry->len = ei->len; ), TP_printk("dev = (%d,%d), ino = %lu, pgofs = %u, " -- cgit v1.2.3 From c1079892f4e8ecfd1bbc525cbfc1bd46b470888e Mon Sep 17 00:00:00 2001 From: Nicholas Krause Date: Tue, 30 Jun 2015 21:37:21 -0400 Subject: f2fs: make the function check_dnode have a return type of bool and change it's name to is_alive This makes the function check_dnode have a return type of bool due to this particular function only ever returning either one or zero as its return value and changes the name of the function to is_alive in order to better explain this function's intended work of checking if a dnode is still in use by the filesystem. 
Signed-off-by: Nicholas Krause [Jaegeuk Kim: change the return value check for the renamed function] Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 22fb5ef37966..2701e05af991 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -487,7 +487,7 @@ block_t start_bidx_of_node(unsigned int node_ofs, struct f2fs_inode_info *fi) return bidx * ADDRS_PER_BLOCK + ADDRS_PER_INODE(fi); } -static int check_dnode(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, +static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, struct node_info *dni, block_t blkaddr, unsigned int *nofs) { struct page *node_page; @@ -500,13 +500,13 @@ static int check_dnode(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, node_page = get_node_page(sbi, nid); if (IS_ERR(node_page)) - return 0; + return false; get_node_info(sbi, nid, dni); if (sum->version != dni->version) { f2fs_put_page(node_page, 1); - return 0; + return false; } *nofs = ofs_of_node(node_page); @@ -514,8 +514,8 @@ static int check_dnode(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, f2fs_put_page(node_page, 1); if (source_blkaddr != blkaddr) - return 0; - return 1; + return false; + return true; } static void move_encrypted_block(struct inode *inode, block_t bidx) @@ -670,7 +670,7 @@ next_step: } /* Get an inode by ino with checking validity */ - if (check_dnode(sbi, entry, &dni, start_addr + off, &nofs) == 0) + if (!is_alive(sbi, entry, &dni, start_addr + off, &nofs)) continue; if (phase == 1) { -- cgit v1.2.3 From 741a7bea79eae6361c8d7499f1f6a900b65c120e Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 6 Jul 2015 20:30:40 +0800 Subject: f2fs: restrict multimedia filename When testing with fs_mark, some blocks were written out as cold data which were mixed with warm data, resulting in splitting more bios. 
This is because fs_mark will create file with random filename as below: 559551ee~~~~~~~~15Z29OCC05JCKQP60JQ42MKV 559551ee~~~~~~~~NZAZ6X8OA8LHIIP6XD0L58RM 559551ef~~~~~~~~B15YDSWAK789HPSDZKYTW6WM 559551f1~~~~~~~~2DAE5DPS79785BUNTFWBEMP3 559551f1~~~~~~~~1MYDY0BKSQCJPI32Q8C514RM 559551f1~~~~~~~~YQOTMAOMN5CVRFOUNI026MP4 559551f3~~~~~~~~1WF42LPRTQJNPPGR3EINKMPE 559551f3~~~~~~~~8Y2NRK7CEPPAA02LY936PJPG They are regarded as cold file since their filename are ended with multimedia files' extension, but this should be wrong as we only match the extension of filename, not the whole one. In this patch, we try to fix the format of multimedia filename to: "filename + '.' + extension", then we set cold file only its filename matches the format. So after this change, it will reduce the probability we set the wrong cold file, also it helps a little for fs_mark's performance on f2fs. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/namei.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index df315dcdd35d..1856d5ecd809 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -91,7 +91,14 @@ static int is_multimedia_file(const unsigned char *s, const char *sub) size_t slen = strlen(s); size_t sublen = strlen(sub); - if (sublen > slen) + /* + * filename format of multimedia file should be defined as: + * "filename + '.' + extension". + */ + if (slen < sublen + 2) + return 0; + + if (s[slen - sublen - 1] != '.') return 0; return !strncasecmp(s + slen - sublen, sub, sublen); -- cgit v1.2.3 From bb96a8d51e523c162b436c4545eb1a4e9f9f530e Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 6 Jul 2015 20:31:49 +0800 Subject: f2fs: enhance multithread performance In ->writepages, we use writepages mutex lock to serialize all block address allocation and page submitting pairs from different inodes. This method makes our delayed dirty pages of one inode being written continously as many as possible. 
But there is one problem: we did not submit the current cached bio inside the region protected by the writepages mutex lock, so there is a small chance that we submit another thread's bio, as shown below, resulting in more bio splits. thread 1 thread 2 ->writepages lock(writepages) ->write_cache_pages unlock(writepages) lock(writepages) ->write_cache_pages ->f2fs_submit_merged_bio ->writepage unlock(writepages) fs_mark-6535 [002] .... 2242.270230: f2fs_submit_write_bio: dev = (1,0), WRITE_SYNC, DATA, sector = 5766152, size = 524288 fs_mark-6536 [000] .... 2242.270361: f2fs_submit_write_bio: dev = (1,0), WRITE_SYNC, DATA, sector = 5767176, size = 4096 fs_mark-6536 [000] .... 2242.270370: f2fs_submit_write_bio: dev = (1,0), WRITE_SYNC, NODE, sector = 8138112, size = 4096 fs_mark-6535 [002] .... 2242.270776: f2fs_submit_write_bio: dev = (1,0), WRITE_SYNC, DATA, sector = 5767184, size = 516096 This may noticeably increase the time spent in the block layer, and may cause larger IO latency. This patch moves the submitting operation into the region protected by the writepages mutex lock to avoid bio splits when concurrent writeback is intensive.
my test environment: virtual machine, intel cpu i5 2500, 8GB size memory, 4GB size ramdisk time fs_mark -t 16 -L 1 -s 524288 -S 1 -d /mnt/f2fs/ before: real 0m4.244s user 0m0.088s sys 0m12.336s after: real 0m3.822s user 0m0.072s sys 0m10.760s Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index cdc1c2b781b8..3e4402f661d7 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1672,11 +1672,10 @@ static int f2fs_write_data_pages(struct address_space *mapping, locked = true; } ret = write_cache_pages(mapping, wbc, __f2fs_writepage, mapping); + f2fs_submit_merged_bio(sbi, DATA, WRITE); if (locked) mutex_unlock(&sbi->writepages); - f2fs_submit_merged_bio(sbi, DATA, WRITE); - remove_dirty_dir_inode(inode); wbc->nr_to_write = max((long)0, wbc->nr_to_write - diff); -- cgit v1.2.3 From 7023a1ad17f4e21acb74167ab647cd123d9eb801 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 29 Jun 2015 16:34:39 -0700 Subject: f2fs: shrink unreferenced extent_caches first If an extent_tree entry has a zero reference count, we can drop it from the cache in higher priority rather than currently referencing entries. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 51 +++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 41 insertions(+), 10 deletions(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 3e4402f661d7..c9d0f8b06d15 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -662,21 +662,54 @@ unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink) struct radix_tree_root *root = &sbi->extent_tree_root; unsigned int found; unsigned int node_cnt = 0, tree_cnt = 0; + int remained; if (!test_opt(sbi, EXTENT_CACHE)) return 0; + if (!down_write_trylock(&sbi->extent_tree_lock)) + goto out; + + /* 1. 
remove unreferenced extent tree */ + while ((found = radix_tree_gang_lookup(root, + (void **)treevec, ino, EXT_TREE_VEC_SIZE))) { + unsigned i; + + ino = treevec[found - 1]->ino + 1; + for (i = 0; i < found; i++) { + struct extent_tree *et = treevec[i]; + + if (!atomic_read(&et->refcount)) { + write_lock(&et->lock); + node_cnt += __free_extent_tree(sbi, et, true); + write_unlock(&et->lock); + + radix_tree_delete(root, et->ino); + kmem_cache_free(extent_tree_slab, et); + sbi->total_ext_tree--; + tree_cnt++; + + if (node_cnt + tree_cnt >= nr_shrink) + goto unlock_out; + } + } + } + up_write(&sbi->extent_tree_lock); + + /* 2. remove LRU extent entries */ + if (!down_write_trylock(&sbi->extent_tree_lock)) + goto out; + + remained = nr_shrink - (node_cnt + tree_cnt); + spin_lock(&sbi->extent_lock); list_for_each_entry_safe(en, tmp, &sbi->extent_list, list) { - if (!nr_shrink--) + if (!remained--) break; list_del_init(&en->list); } spin_unlock(&sbi->extent_lock); - if (!down_write_trylock(&sbi->extent_tree_lock)) - goto out; - while ((found = radix_tree_gang_lookup(root, (void **)treevec, ino, EXT_TREE_VEC_SIZE))) { unsigned i; @@ -688,14 +721,12 @@ unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink) write_lock(&et->lock); node_cnt += __free_extent_tree(sbi, et, false); write_unlock(&et->lock); - if (!atomic_read(&et->refcount) && !et->count) { - radix_tree_delete(root, et->ino); - kmem_cache_free(extent_tree_slab, et); - sbi->total_ext_tree--; - tree_cnt++; - } + + if (node_cnt + tree_cnt >= nr_shrink) + break; } } +unlock_out: up_write(&sbi->extent_tree_lock); out: trace_f2fs_shrink_extent_tree(sbi, node_cnt, tree_cnt); -- cgit v1.2.3 From 90d4388ac2cec0c83cad7315d3cd0065553430e1 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 8 Jul 2015 18:24:38 +0800 Subject: f2fs: fix to update page flag This patch fixes to update page flag (e.g. Uptodate/cold flag) in ->write_begin. 
Otherwise, page will be non-uptodate when we try to write entire page, and cold data flag in page will not be clean when gced page is being rewritten. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index c9d0f8b06d15..de55c088948f 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1790,8 +1790,10 @@ put_next: f2fs_put_dnode(&dn); f2fs_unlock_op(sbi); - if ((len == PAGE_CACHE_SIZE) || PageUptodate(page)) - return 0; + if (len == PAGE_CACHE_SIZE) + goto out_update; + if (PageUptodate(page)) + goto out_clear; f2fs_wait_on_page_writeback(page, DATA); @@ -1801,7 +1803,7 @@ put_next: /* Reading beyond i_size is simple: memset to zero */ zero_user_segments(page, 0, start, end, PAGE_CACHE_SIZE); - goto out; + goto out_update; } if (dn.data_blkaddr == NEW_ADDR) { @@ -1839,8 +1841,9 @@ put_next: } } } -out: +out_update: SetPageUptodate(page); +out_clear: clear_cold_data(page); return 0; -- cgit v1.2.3 From 3c7df87dad065a4656b13115593c59c8a324a108 Mon Sep 17 00:00:00 2001 From: Fan Li Date: Wed, 8 Jul 2015 16:02:54 +0800 Subject: f2fs: don't try to split extents shorter than F2FS_MIN_EXTENT_LEN Since only parts of extents longer than F2FS_MIN_EXTENT_LEN will be kept in extent cache after split, extents already shorter than F2FS_MIN_EXTENT_LEN don't need to try split at all. Signed-off-by: Fan Li Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index de55c088948f..ce0d5ec8e770 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -585,7 +585,7 @@ static bool f2fs_update_extent_tree(struct inode *inode, pgoff_t fofs, __detach_extent_node(sbi, et, en); /* 2. 
if extent can be split more, split and insert the left part */ - if (dei.len > 1) { + if (dei.len > F2FS_MIN_EXTENT_LEN) { /* insert left part of split extent into cache */ if (fofs - dei.fofs >= F2FS_MIN_EXTENT_LEN) { set_extent_info(&ei, dei.fofs, dei.blk, -- cgit v1.2.3 From a28ef1f5aebe1068fc5fd65c4699c1c3b1e9094b Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 8 Jul 2015 17:59:36 +0800 Subject: f2fs: maintain extent cache in separated file This patch moves extent cache related code from data.c into extent_cache.c since extent cache is independent feature, and its codes are not relate to others in data.c, it's better for us to maintain them in separated place. There is no functionality change, but several small coding style fixes including: * rename __drop_largest_extent to f2fs_drop_largest_extent for exporting; * rename misspelled word 'untill' to 'until'; * remove unneeded 'return' in the end of f2fs_destroy_extent_tree(). Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/Makefile | 2 +- fs/f2fs/data.c | 578 +---------------------------------------------- fs/f2fs/extent_cache.c | 594 +++++++++++++++++++++++++++++++++++++++++++++++++ fs/f2fs/f2fs.h | 22 +- 4 files changed, 610 insertions(+), 586 deletions(-) create mode 100644 fs/f2fs/extent_cache.c (limited to 'fs/f2fs') diff --git a/fs/f2fs/Makefile b/fs/f2fs/Makefile index 005251b8d459..08e101ed914c 100644 --- a/fs/f2fs/Makefile +++ b/fs/f2fs/Makefile @@ -2,7 +2,7 @@ obj-$(CONFIG_F2FS_FS) += f2fs.o f2fs-y := dir.o file.o inode.o namei.o hash.o super.o inline.o f2fs-y += checkpoint.o gc.o data.o node.o segment.o recovery.o -f2fs-y += shrinker.o +f2fs-y += shrinker.o extent_cache.o f2fs-$(CONFIG_F2FS_STAT_FS) += debug.o f2fs-$(CONFIG_F2FS_FS_XATTR) += xattr.o f2fs-$(CONFIG_F2FS_FS_POSIX_ACL) += acl.o diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index ce0d5ec8e770..ef30b59756c6 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -26,9 +26,6 @@ #include "trace.h" #include -static struct 
kmem_cache *extent_tree_slab; -static struct kmem_cache *extent_node_slab; - static void f2fs_read_end_io(struct bio *bio, int err) { struct bio_vec *bvec; @@ -266,548 +263,6 @@ int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index) return err; } -static struct extent_node *__attach_extent_node(struct f2fs_sb_info *sbi, - struct extent_tree *et, struct extent_info *ei, - struct rb_node *parent, struct rb_node **p) -{ - struct extent_node *en; - - en = kmem_cache_alloc(extent_node_slab, GFP_ATOMIC); - if (!en) - return NULL; - - en->ei = *ei; - INIT_LIST_HEAD(&en->list); - - rb_link_node(&en->rb_node, parent, p); - rb_insert_color(&en->rb_node, &et->root); - et->count++; - atomic_inc(&sbi->total_ext_node); - return en; -} - -static void __detach_extent_node(struct f2fs_sb_info *sbi, - struct extent_tree *et, struct extent_node *en) -{ - rb_erase(&en->rb_node, &et->root); - et->count--; - atomic_dec(&sbi->total_ext_node); - - if (et->cached_en == en) - et->cached_en = NULL; -} - -static struct extent_tree *__grab_extent_tree(struct inode *inode) -{ - struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct extent_tree *et; - nid_t ino = inode->i_ino; - - down_write(&sbi->extent_tree_lock); - et = radix_tree_lookup(&sbi->extent_tree_root, ino); - if (!et) { - et = f2fs_kmem_cache_alloc(extent_tree_slab, GFP_NOFS); - f2fs_radix_tree_insert(&sbi->extent_tree_root, ino, et); - memset(et, 0, sizeof(struct extent_tree)); - et->ino = ino; - et->root = RB_ROOT; - et->cached_en = NULL; - rwlock_init(&et->lock); - atomic_set(&et->refcount, 0); - et->count = 0; - sbi->total_ext_tree++; - } - atomic_inc(&et->refcount); - up_write(&sbi->extent_tree_lock); - - /* never died untill evict_inode */ - F2FS_I(inode)->extent_tree = et; - - return et; -} - -static struct extent_node *__lookup_extent_tree(struct extent_tree *et, - unsigned int fofs) -{ - struct rb_node *node = et->root.rb_node; - struct extent_node *en; - - if (et->cached_en) { - struct extent_info *cei = 
&et->cached_en->ei; - - if (cei->fofs <= fofs && cei->fofs + cei->len > fofs) - return et->cached_en; - } - - while (node) { - en = rb_entry(node, struct extent_node, rb_node); - - if (fofs < en->ei.fofs) - node = node->rb_left; - else if (fofs >= en->ei.fofs + en->ei.len) - node = node->rb_right; - else - return en; - } - return NULL; -} - -static struct extent_node *__try_back_merge(struct f2fs_sb_info *sbi, - struct extent_tree *et, struct extent_node *en) -{ - struct extent_node *prev; - struct rb_node *node; - - node = rb_prev(&en->rb_node); - if (!node) - return NULL; - - prev = rb_entry(node, struct extent_node, rb_node); - if (__is_back_mergeable(&en->ei, &prev->ei)) { - en->ei.fofs = prev->ei.fofs; - en->ei.blk = prev->ei.blk; - en->ei.len += prev->ei.len; - __detach_extent_node(sbi, et, prev); - return prev; - } - return NULL; -} - -static struct extent_node *__try_front_merge(struct f2fs_sb_info *sbi, - struct extent_tree *et, struct extent_node *en) -{ - struct extent_node *next; - struct rb_node *node; - - node = rb_next(&en->rb_node); - if (!node) - return NULL; - - next = rb_entry(node, struct extent_node, rb_node); - if (__is_front_mergeable(&en->ei, &next->ei)) { - en->ei.len += next->ei.len; - __detach_extent_node(sbi, et, next); - return next; - } - return NULL; -} - -static struct extent_node *__insert_extent_tree(struct f2fs_sb_info *sbi, - struct extent_tree *et, struct extent_info *ei, - struct extent_node **den) -{ - struct rb_node **p = &et->root.rb_node; - struct rb_node *parent = NULL; - struct extent_node *en; - - while (*p) { - parent = *p; - en = rb_entry(parent, struct extent_node, rb_node); - - if (ei->fofs < en->ei.fofs) { - if (__is_front_mergeable(ei, &en->ei)) { - f2fs_bug_on(sbi, !den); - en->ei.fofs = ei->fofs; - en->ei.blk = ei->blk; - en->ei.len += ei->len; - *den = __try_back_merge(sbi, et, en); - goto update_out; - } - p = &(*p)->rb_left; - } else if (ei->fofs >= en->ei.fofs + en->ei.len) { - if (__is_back_mergeable(ei, 
&en->ei)) { - f2fs_bug_on(sbi, !den); - en->ei.len += ei->len; - *den = __try_front_merge(sbi, et, en); - goto update_out; - } - p = &(*p)->rb_right; - } else { - f2fs_bug_on(sbi, 1); - } - } - - en = __attach_extent_node(sbi, et, ei, parent, p); - if (!en) - return NULL; -update_out: - if (en->ei.len > et->largest.len) - et->largest = en->ei; - et->cached_en = en; - return en; -} - -static unsigned int __free_extent_tree(struct f2fs_sb_info *sbi, - struct extent_tree *et, bool free_all) -{ - struct rb_node *node, *next; - struct extent_node *en; - unsigned int count = et->count; - - node = rb_first(&et->root); - while (node) { - next = rb_next(node); - en = rb_entry(node, struct extent_node, rb_node); - - if (free_all) { - spin_lock(&sbi->extent_lock); - if (!list_empty(&en->list)) - list_del_init(&en->list); - spin_unlock(&sbi->extent_lock); - } - - if (free_all || list_empty(&en->list)) { - __detach_extent_node(sbi, et, en); - kmem_cache_free(extent_node_slab, en); - } - node = next; - } - - return count - et->count; -} - -static void __drop_largest_extent(struct inode *inode, pgoff_t fofs) -{ - struct extent_info *largest = &F2FS_I(inode)->extent_tree->largest; - - if (largest->fofs <= fofs && largest->fofs + largest->len > fofs) - largest->len = 0; -} - -void f2fs_init_extent_tree(struct inode *inode, struct f2fs_extent *i_ext) -{ - struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct extent_tree *et; - struct extent_node *en; - struct extent_info ei; - - if (!f2fs_may_extent_tree(inode)) - return; - - et = __grab_extent_tree(inode); - - if (!i_ext || le32_to_cpu(i_ext->len) < F2FS_MIN_EXTENT_LEN) - return; - - set_extent_info(&ei, le32_to_cpu(i_ext->fofs), - le32_to_cpu(i_ext->blk), le32_to_cpu(i_ext->len)); - - write_lock(&et->lock); - if (et->count) - goto out; - - en = __insert_extent_tree(sbi, et, &ei, NULL); - if (en) { - spin_lock(&sbi->extent_lock); - list_add_tail(&en->list, &sbi->extent_list); - spin_unlock(&sbi->extent_lock); - } -out: - 
write_unlock(&et->lock); -} - -static bool f2fs_lookup_extent_tree(struct inode *inode, pgoff_t pgofs, - struct extent_info *ei) -{ - struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct extent_tree *et = F2FS_I(inode)->extent_tree; - struct extent_node *en; - bool ret = false; - - f2fs_bug_on(sbi, !et); - - trace_f2fs_lookup_extent_tree_start(inode, pgofs); - - read_lock(&et->lock); - - if (et->largest.fofs <= pgofs && - et->largest.fofs + et->largest.len > pgofs) { - *ei = et->largest; - ret = true; - stat_inc_read_hit(sbi->sb); - goto out; - } - - en = __lookup_extent_tree(et, pgofs); - if (en) { - *ei = en->ei; - spin_lock(&sbi->extent_lock); - if (!list_empty(&en->list)) - list_move_tail(&en->list, &sbi->extent_list); - et->cached_en = en; - spin_unlock(&sbi->extent_lock); - ret = true; - stat_inc_read_hit(sbi->sb); - } -out: - stat_inc_total_hit(sbi->sb); - read_unlock(&et->lock); - - trace_f2fs_lookup_extent_tree_end(inode, pgofs, ei); - return ret; -} - -/* return true, if on-disk extent should be updated */ -static bool f2fs_update_extent_tree(struct inode *inode, pgoff_t fofs, - block_t blkaddr) -{ - struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct extent_tree *et = F2FS_I(inode)->extent_tree; - struct extent_node *en = NULL, *en1 = NULL, *en2 = NULL, *en3 = NULL; - struct extent_node *den = NULL; - struct extent_info ei, dei, prev; - unsigned int endofs; - - if (!et) - return false; - - trace_f2fs_update_extent_tree(inode, fofs, blkaddr); - - write_lock(&et->lock); - - if (is_inode_flag_set(F2FS_I(inode), FI_NO_EXTENT)) { - write_unlock(&et->lock); - return false; - } - - prev = et->largest; - dei.len = 0; - - /* we do not guarantee that the largest extent is cached all the time */ - __drop_largest_extent(inode, fofs); - - /* 1. lookup and remove existing extent info in cache */ - en = __lookup_extent_tree(et, fofs); - if (!en) - goto update_extent; - - dei = en->ei; - __detach_extent_node(sbi, et, en); - - /* 2. 
if extent can be split more, split and insert the left part */ - if (dei.len > F2FS_MIN_EXTENT_LEN) { - /* insert left part of split extent into cache */ - if (fofs - dei.fofs >= F2FS_MIN_EXTENT_LEN) { - set_extent_info(&ei, dei.fofs, dei.blk, - fofs - dei.fofs); - en1 = __insert_extent_tree(sbi, et, &ei, NULL); - } - - /* insert right part of split extent into cache */ - endofs = dei.fofs + dei.len - 1; - if (endofs - fofs >= F2FS_MIN_EXTENT_LEN) { - set_extent_info(&ei, fofs + 1, - fofs - dei.fofs + dei.blk + 1, endofs - fofs); - en2 = __insert_extent_tree(sbi, et, &ei, NULL); - } - } - -update_extent: - /* 3. update extent in extent cache */ - if (blkaddr) { - set_extent_info(&ei, fofs, blkaddr, 1); - en3 = __insert_extent_tree(sbi, et, &ei, &den); - - /* give up extent_cache, if split and small updates happen */ - if (dei.len >= 1 && - prev.len < F2FS_MIN_EXTENT_LEN && - et->largest.len < F2FS_MIN_EXTENT_LEN) { - et->largest.len = 0; - set_inode_flag(F2FS_I(inode), FI_NO_EXTENT); - } - } - - /* 4. update in global extent list */ - spin_lock(&sbi->extent_lock); - if (en && !list_empty(&en->list)) - list_del(&en->list); - /* - * en1 and en2 split from en, they will become more and more smaller - * fragments after splitting several times. So if the length is smaller - * than F2FS_MIN_EXTENT_LEN, we will not add them into extent tree. - */ - if (en1) - list_add_tail(&en1->list, &sbi->extent_list); - if (en2) - list_add_tail(&en2->list, &sbi->extent_list); - if (en3) { - if (list_empty(&en3->list)) - list_add_tail(&en3->list, &sbi->extent_list); - else - list_move_tail(&en3->list, &sbi->extent_list); - } - if (den && !list_empty(&den->list)) - list_del(&den->list); - spin_unlock(&sbi->extent_lock); - - /* 5. 
release extent node */ - if (en) - kmem_cache_free(extent_node_slab, en); - if (den) - kmem_cache_free(extent_node_slab, den); - - if (is_inode_flag_set(F2FS_I(inode), FI_NO_EXTENT)) - __free_extent_tree(sbi, et, true); - - write_unlock(&et->lock); - - return !__is_extent_same(&prev, &et->largest); -} - -unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink) -{ - struct extent_tree *treevec[EXT_TREE_VEC_SIZE]; - struct extent_node *en, *tmp; - unsigned long ino = F2FS_ROOT_INO(sbi); - struct radix_tree_root *root = &sbi->extent_tree_root; - unsigned int found; - unsigned int node_cnt = 0, tree_cnt = 0; - int remained; - - if (!test_opt(sbi, EXTENT_CACHE)) - return 0; - - if (!down_write_trylock(&sbi->extent_tree_lock)) - goto out; - - /* 1. remove unreferenced extent tree */ - while ((found = radix_tree_gang_lookup(root, - (void **)treevec, ino, EXT_TREE_VEC_SIZE))) { - unsigned i; - - ino = treevec[found - 1]->ino + 1; - for (i = 0; i < found; i++) { - struct extent_tree *et = treevec[i]; - - if (!atomic_read(&et->refcount)) { - write_lock(&et->lock); - node_cnt += __free_extent_tree(sbi, et, true); - write_unlock(&et->lock); - - radix_tree_delete(root, et->ino); - kmem_cache_free(extent_tree_slab, et); - sbi->total_ext_tree--; - tree_cnt++; - - if (node_cnt + tree_cnt >= nr_shrink) - goto unlock_out; - } - } - } - up_write(&sbi->extent_tree_lock); - - /* 2. 
remove LRU extent entries */ - if (!down_write_trylock(&sbi->extent_tree_lock)) - goto out; - - remained = nr_shrink - (node_cnt + tree_cnt); - - spin_lock(&sbi->extent_lock); - list_for_each_entry_safe(en, tmp, &sbi->extent_list, list) { - if (!remained--) - break; - list_del_init(&en->list); - } - spin_unlock(&sbi->extent_lock); - - while ((found = radix_tree_gang_lookup(root, - (void **)treevec, ino, EXT_TREE_VEC_SIZE))) { - unsigned i; - - ino = treevec[found - 1]->ino + 1; - for (i = 0; i < found; i++) { - struct extent_tree *et = treevec[i]; - - write_lock(&et->lock); - node_cnt += __free_extent_tree(sbi, et, false); - write_unlock(&et->lock); - - if (node_cnt + tree_cnt >= nr_shrink) - break; - } - } -unlock_out: - up_write(&sbi->extent_tree_lock); -out: - trace_f2fs_shrink_extent_tree(sbi, node_cnt, tree_cnt); - - return node_cnt + tree_cnt; -} - -unsigned int f2fs_destroy_extent_node(struct inode *inode) -{ - struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct extent_tree *et = F2FS_I(inode)->extent_tree; - unsigned int node_cnt = 0; - - if (!et) - return 0; - - write_lock(&et->lock); - node_cnt = __free_extent_tree(sbi, et, true); - write_unlock(&et->lock); - - return node_cnt; -} - -void f2fs_destroy_extent_tree(struct inode *inode) -{ - struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct extent_tree *et = F2FS_I(inode)->extent_tree; - unsigned int node_cnt = 0; - - if (!et) - return; - - if (inode->i_nlink && !is_bad_inode(inode) && et->count) { - atomic_dec(&et->refcount); - return; - } - - /* free all extent info belong to this extent tree */ - node_cnt = f2fs_destroy_extent_node(inode); - - /* delete extent tree entry in radix tree */ - down_write(&sbi->extent_tree_lock); - atomic_dec(&et->refcount); - f2fs_bug_on(sbi, atomic_read(&et->refcount) || et->count); - radix_tree_delete(&sbi->extent_tree_root, inode->i_ino); - kmem_cache_free(extent_tree_slab, et); - sbi->total_ext_tree--; - up_write(&sbi->extent_tree_lock); - - 
F2FS_I(inode)->extent_tree = NULL; - - trace_f2fs_destroy_extent_tree(inode, node_cnt); - return; -} - -static bool f2fs_lookup_extent_cache(struct inode *inode, pgoff_t pgofs, - struct extent_info *ei) -{ - if (!f2fs_may_extent_tree(inode)) - return false; - - return f2fs_lookup_extent_tree(inode, pgofs, ei); -} - -void f2fs_update_extent_cache(struct dnode_of_data *dn) -{ - struct f2fs_inode_info *fi = F2FS_I(dn->inode); - pgoff_t fofs; - - if (!f2fs_may_extent_tree(dn->inode)) - return; - - f2fs_bug_on(F2FS_I_SB(dn->inode), dn->data_blkaddr == NEW_ADDR); - - fofs = start_bidx_of_node(ofs_of_node(dn->node_page), fi) + - dn->ofs_in_node; - - if (f2fs_update_extent_tree(dn->inode, fofs, dn->data_blkaddr)) - sync_inode_page(dn); -} - struct page *get_read_data_page(struct inode *inode, pgoff_t index, int rw) { struct address_space *mapping = inode->i_mapping; @@ -1017,7 +472,7 @@ alloc: i_size_write(dn->inode, ((fofs + 1) << PAGE_CACHE_SHIFT)); /* direct IO doesn't use extent cache to maximize the performance */ - __drop_largest_extent(dn->inode, fofs); + f2fs_drop_largest_extent(dn->inode, fofs); return 0; } @@ -1997,37 +1452,6 @@ static sector_t f2fs_bmap(struct address_space *mapping, sector_t block) return generic_block_bmap(mapping, block, get_data_block); } -void init_extent_cache_info(struct f2fs_sb_info *sbi) -{ - INIT_RADIX_TREE(&sbi->extent_tree_root, GFP_NOIO); - init_rwsem(&sbi->extent_tree_lock); - INIT_LIST_HEAD(&sbi->extent_list); - spin_lock_init(&sbi->extent_lock); - sbi->total_ext_tree = 0; - atomic_set(&sbi->total_ext_node, 0); -} - -int __init create_extent_cache(void) -{ - extent_tree_slab = f2fs_kmem_cache_create("f2fs_extent_tree", - sizeof(struct extent_tree)); - if (!extent_tree_slab) - return -ENOMEM; - extent_node_slab = f2fs_kmem_cache_create("f2fs_extent_node", - sizeof(struct extent_node)); - if (!extent_node_slab) { - kmem_cache_destroy(extent_tree_slab); - return -ENOMEM; - } - return 0; -} - -void destroy_extent_cache(void) -{ - 
kmem_cache_destroy(extent_node_slab); - kmem_cache_destroy(extent_tree_slab); -} - const struct address_space_operations f2fs_dblock_aops = { .readpage = f2fs_read_data_page, .readpages = f2fs_read_data_pages, diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c new file mode 100644 index 000000000000..5f78fc1e818a --- /dev/null +++ b/fs/f2fs/extent_cache.c @@ -0,0 +1,594 @@ +/* + * f2fs extent cache support + * + * Copyright (c) 2015 Motorola Mobility + * Copyright (c) 2015 Samsung Electronics + * Authors: Jaegeuk Kim + * Chao Yu + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include + +#include "f2fs.h" +#include "node.h" +#include + +static struct kmem_cache *extent_tree_slab; +static struct kmem_cache *extent_node_slab; + +static struct extent_node *__attach_extent_node(struct f2fs_sb_info *sbi, + struct extent_tree *et, struct extent_info *ei, + struct rb_node *parent, struct rb_node **p) +{ + struct extent_node *en; + + en = kmem_cache_alloc(extent_node_slab, GFP_ATOMIC); + if (!en) + return NULL; + + en->ei = *ei; + INIT_LIST_HEAD(&en->list); + + rb_link_node(&en->rb_node, parent, p); + rb_insert_color(&en->rb_node, &et->root); + et->count++; + atomic_inc(&sbi->total_ext_node); + return en; +} + +static void __detach_extent_node(struct f2fs_sb_info *sbi, + struct extent_tree *et, struct extent_node *en) +{ + rb_erase(&en->rb_node, &et->root); + et->count--; + atomic_dec(&sbi->total_ext_node); + + if (et->cached_en == en) + et->cached_en = NULL; +} + +static struct extent_tree *__grab_extent_tree(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct extent_tree *et; + nid_t ino = inode->i_ino; + + down_write(&sbi->extent_tree_lock); + et = radix_tree_lookup(&sbi->extent_tree_root, ino); + if (!et) { + et = f2fs_kmem_cache_alloc(extent_tree_slab, GFP_NOFS); + 
f2fs_radix_tree_insert(&sbi->extent_tree_root, ino, et); + memset(et, 0, sizeof(struct extent_tree)); + et->ino = ino; + et->root = RB_ROOT; + et->cached_en = NULL; + rwlock_init(&et->lock); + atomic_set(&et->refcount, 0); + et->count = 0; + sbi->total_ext_tree++; + } + atomic_inc(&et->refcount); + up_write(&sbi->extent_tree_lock); + + /* never died until evict_inode */ + F2FS_I(inode)->extent_tree = et; + + return et; +} + +static struct extent_node *__lookup_extent_tree(struct extent_tree *et, + unsigned int fofs) +{ + struct rb_node *node = et->root.rb_node; + struct extent_node *en; + + if (et->cached_en) { + struct extent_info *cei = &et->cached_en->ei; + + if (cei->fofs <= fofs && cei->fofs + cei->len > fofs) + return et->cached_en; + } + + while (node) { + en = rb_entry(node, struct extent_node, rb_node); + + if (fofs < en->ei.fofs) + node = node->rb_left; + else if (fofs >= en->ei.fofs + en->ei.len) + node = node->rb_right; + else + return en; + } + return NULL; +} + +static struct extent_node *__try_back_merge(struct f2fs_sb_info *sbi, + struct extent_tree *et, struct extent_node *en) +{ + struct extent_node *prev; + struct rb_node *node; + + node = rb_prev(&en->rb_node); + if (!node) + return NULL; + + prev = rb_entry(node, struct extent_node, rb_node); + if (__is_back_mergeable(&en->ei, &prev->ei)) { + en->ei.fofs = prev->ei.fofs; + en->ei.blk = prev->ei.blk; + en->ei.len += prev->ei.len; + __detach_extent_node(sbi, et, prev); + return prev; + } + return NULL; +} + +static struct extent_node *__try_front_merge(struct f2fs_sb_info *sbi, + struct extent_tree *et, struct extent_node *en) +{ + struct extent_node *next; + struct rb_node *node; + + node = rb_next(&en->rb_node); + if (!node) + return NULL; + + next = rb_entry(node, struct extent_node, rb_node); + if (__is_front_mergeable(&en->ei, &next->ei)) { + en->ei.len += next->ei.len; + __detach_extent_node(sbi, et, next); + return next; + } + return NULL; +} + +static struct extent_node 
*__insert_extent_tree(struct f2fs_sb_info *sbi, + struct extent_tree *et, struct extent_info *ei, + struct extent_node **den) +{ + struct rb_node **p = &et->root.rb_node; + struct rb_node *parent = NULL; + struct extent_node *en; + + while (*p) { + parent = *p; + en = rb_entry(parent, struct extent_node, rb_node); + + if (ei->fofs < en->ei.fofs) { + if (__is_front_mergeable(ei, &en->ei)) { + f2fs_bug_on(sbi, !den); + en->ei.fofs = ei->fofs; + en->ei.blk = ei->blk; + en->ei.len += ei->len; + *den = __try_back_merge(sbi, et, en); + goto update_out; + } + p = &(*p)->rb_left; + } else if (ei->fofs >= en->ei.fofs + en->ei.len) { + if (__is_back_mergeable(ei, &en->ei)) { + f2fs_bug_on(sbi, !den); + en->ei.len += ei->len; + *den = __try_front_merge(sbi, et, en); + goto update_out; + } + p = &(*p)->rb_right; + } else { + f2fs_bug_on(sbi, 1); + } + } + + en = __attach_extent_node(sbi, et, ei, parent, p); + if (!en) + return NULL; +update_out: + if (en->ei.len > et->largest.len) + et->largest = en->ei; + et->cached_en = en; + return en; +} + +static unsigned int __free_extent_tree(struct f2fs_sb_info *sbi, + struct extent_tree *et, bool free_all) +{ + struct rb_node *node, *next; + struct extent_node *en; + unsigned int count = et->count; + + node = rb_first(&et->root); + while (node) { + next = rb_next(node); + en = rb_entry(node, struct extent_node, rb_node); + + if (free_all) { + spin_lock(&sbi->extent_lock); + if (!list_empty(&en->list)) + list_del_init(&en->list); + spin_unlock(&sbi->extent_lock); + } + + if (free_all || list_empty(&en->list)) { + __detach_extent_node(sbi, et, en); + kmem_cache_free(extent_node_slab, en); + } + node = next; + } + + return count - et->count; +} + +void f2fs_drop_largest_extent(struct inode *inode, pgoff_t fofs) +{ + struct extent_info *largest = &F2FS_I(inode)->extent_tree->largest; + + if (largest->fofs <= fofs && largest->fofs + largest->len > fofs) + largest->len = 0; +} + +void f2fs_init_extent_tree(struct inode *inode, struct 
f2fs_extent *i_ext) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct extent_tree *et; + struct extent_node *en; + struct extent_info ei; + + if (!f2fs_may_extent_tree(inode)) + return; + + et = __grab_extent_tree(inode); + + if (!i_ext || le32_to_cpu(i_ext->len) < F2FS_MIN_EXTENT_LEN) + return; + + set_extent_info(&ei, le32_to_cpu(i_ext->fofs), + le32_to_cpu(i_ext->blk), le32_to_cpu(i_ext->len)); + + write_lock(&et->lock); + if (et->count) + goto out; + + en = __insert_extent_tree(sbi, et, &ei, NULL); + if (en) { + spin_lock(&sbi->extent_lock); + list_add_tail(&en->list, &sbi->extent_list); + spin_unlock(&sbi->extent_lock); + } +out: + write_unlock(&et->lock); +} + +static bool f2fs_lookup_extent_tree(struct inode *inode, pgoff_t pgofs, + struct extent_info *ei) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct extent_tree *et = F2FS_I(inode)->extent_tree; + struct extent_node *en; + bool ret = false; + + f2fs_bug_on(sbi, !et); + + trace_f2fs_lookup_extent_tree_start(inode, pgofs); + + read_lock(&et->lock); + + if (et->largest.fofs <= pgofs && + et->largest.fofs + et->largest.len > pgofs) { + *ei = et->largest; + ret = true; + stat_inc_read_hit(sbi->sb); + goto out; + } + + en = __lookup_extent_tree(et, pgofs); + if (en) { + *ei = en->ei; + spin_lock(&sbi->extent_lock); + if (!list_empty(&en->list)) + list_move_tail(&en->list, &sbi->extent_list); + et->cached_en = en; + spin_unlock(&sbi->extent_lock); + ret = true; + stat_inc_read_hit(sbi->sb); + } +out: + stat_inc_total_hit(sbi->sb); + read_unlock(&et->lock); + + trace_f2fs_lookup_extent_tree_end(inode, pgofs, ei); + return ret; +} + +/* return true, if on-disk extent should be updated */ +static bool f2fs_update_extent_tree(struct inode *inode, pgoff_t fofs, + block_t blkaddr) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct extent_tree *et = F2FS_I(inode)->extent_tree; + struct extent_node *en = NULL, *en1 = NULL, *en2 = NULL, *en3 = NULL; + struct extent_node *den = NULL; + struct 
extent_info ei, dei, prev; + unsigned int endofs; + + if (!et) + return false; + + trace_f2fs_update_extent_tree(inode, fofs, blkaddr); + + write_lock(&et->lock); + + if (is_inode_flag_set(F2FS_I(inode), FI_NO_EXTENT)) { + write_unlock(&et->lock); + return false; + } + + prev = et->largest; + dei.len = 0; + + /* we do not guarantee that the largest extent is cached all the time */ + f2fs_drop_largest_extent(inode, fofs); + + /* 1. lookup and remove existing extent info in cache */ + en = __lookup_extent_tree(et, fofs); + if (!en) + goto update_extent; + + dei = en->ei; + __detach_extent_node(sbi, et, en); + + /* 2. if extent can be split more, split and insert the left part */ + if (dei.len > F2FS_MIN_EXTENT_LEN) { + /* insert left part of split extent into cache */ + if (fofs - dei.fofs >= F2FS_MIN_EXTENT_LEN) { + set_extent_info(&ei, dei.fofs, dei.blk, + fofs - dei.fofs); + en1 = __insert_extent_tree(sbi, et, &ei, NULL); + } + + /* insert right part of split extent into cache */ + endofs = dei.fofs + dei.len - 1; + if (endofs - fofs >= F2FS_MIN_EXTENT_LEN) { + set_extent_info(&ei, fofs + 1, + fofs - dei.fofs + dei.blk + 1, endofs - fofs); + en2 = __insert_extent_tree(sbi, et, &ei, NULL); + } + } + +update_extent: + /* 3. update extent in extent cache */ + if (blkaddr) { + set_extent_info(&ei, fofs, blkaddr, 1); + en3 = __insert_extent_tree(sbi, et, &ei, &den); + + /* give up extent_cache, if split and small updates happen */ + if (dei.len >= 1 && + prev.len < F2FS_MIN_EXTENT_LEN && + et->largest.len < F2FS_MIN_EXTENT_LEN) { + et->largest.len = 0; + set_inode_flag(F2FS_I(inode), FI_NO_EXTENT); + } + } + + /* 4. update in global extent list */ + spin_lock(&sbi->extent_lock); + if (en && !list_empty(&en->list)) + list_del(&en->list); + /* + * en1 and en2 split from en, they will become more and more smaller + * fragments after splitting several times. So if the length is smaller + * than F2FS_MIN_EXTENT_LEN, we will not add them into extent tree. 
+ */ + if (en1) + list_add_tail(&en1->list, &sbi->extent_list); + if (en2) + list_add_tail(&en2->list, &sbi->extent_list); + if (en3) { + if (list_empty(&en3->list)) + list_add_tail(&en3->list, &sbi->extent_list); + else + list_move_tail(&en3->list, &sbi->extent_list); + } + if (den && !list_empty(&den->list)) + list_del(&den->list); + spin_unlock(&sbi->extent_lock); + + /* 5. release extent node */ + if (en) + kmem_cache_free(extent_node_slab, en); + if (den) + kmem_cache_free(extent_node_slab, den); + + if (is_inode_flag_set(F2FS_I(inode), FI_NO_EXTENT)) + __free_extent_tree(sbi, et, true); + + write_unlock(&et->lock); + + return !__is_extent_same(&prev, &et->largest); +} + +unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink) +{ + struct extent_tree *treevec[EXT_TREE_VEC_SIZE]; + struct extent_node *en, *tmp; + unsigned long ino = F2FS_ROOT_INO(sbi); + struct radix_tree_root *root = &sbi->extent_tree_root; + unsigned int found; + unsigned int node_cnt = 0, tree_cnt = 0; + int remained; + + if (!test_opt(sbi, EXTENT_CACHE)) + return 0; + + if (!down_write_trylock(&sbi->extent_tree_lock)) + goto out; + + /* 1. remove unreferenced extent tree */ + while ((found = radix_tree_gang_lookup(root, + (void **)treevec, ino, EXT_TREE_VEC_SIZE))) { + unsigned i; + + ino = treevec[found - 1]->ino + 1; + for (i = 0; i < found; i++) { + struct extent_tree *et = treevec[i]; + + if (!atomic_read(&et->refcount)) { + write_lock(&et->lock); + node_cnt += __free_extent_tree(sbi, et, true); + write_unlock(&et->lock); + + radix_tree_delete(root, et->ino); + kmem_cache_free(extent_tree_slab, et); + sbi->total_ext_tree--; + tree_cnt++; + + if (node_cnt + tree_cnt >= nr_shrink) + goto unlock_out; + } + } + } + up_write(&sbi->extent_tree_lock); + + /* 2. 
remove LRU extent entries */ + if (!down_write_trylock(&sbi->extent_tree_lock)) + goto out; + + remained = nr_shrink - (node_cnt + tree_cnt); + + spin_lock(&sbi->extent_lock); + list_for_each_entry_safe(en, tmp, &sbi->extent_list, list) { + if (!remained--) + break; + list_del_init(&en->list); + } + spin_unlock(&sbi->extent_lock); + + while ((found = radix_tree_gang_lookup(root, + (void **)treevec, ino, EXT_TREE_VEC_SIZE))) { + unsigned i; + + ino = treevec[found - 1]->ino + 1; + for (i = 0; i < found; i++) { + struct extent_tree *et = treevec[i]; + + write_lock(&et->lock); + node_cnt += __free_extent_tree(sbi, et, false); + write_unlock(&et->lock); + + if (node_cnt + tree_cnt >= nr_shrink) + break; + } + } +unlock_out: + up_write(&sbi->extent_tree_lock); +out: + trace_f2fs_shrink_extent_tree(sbi, node_cnt, tree_cnt); + + return node_cnt + tree_cnt; +} + +unsigned int f2fs_destroy_extent_node(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct extent_tree *et = F2FS_I(inode)->extent_tree; + unsigned int node_cnt = 0; + + if (!et) + return 0; + + write_lock(&et->lock); + node_cnt = __free_extent_tree(sbi, et, true); + write_unlock(&et->lock); + + return node_cnt; +} + +void f2fs_destroy_extent_tree(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct extent_tree *et = F2FS_I(inode)->extent_tree; + unsigned int node_cnt = 0; + + if (!et) + return; + + if (inode->i_nlink && !is_bad_inode(inode) && et->count) { + atomic_dec(&et->refcount); + return; + } + + /* free all extent info belong to this extent tree */ + node_cnt = f2fs_destroy_extent_node(inode); + + /* delete extent tree entry in radix tree */ + down_write(&sbi->extent_tree_lock); + atomic_dec(&et->refcount); + f2fs_bug_on(sbi, atomic_read(&et->refcount) || et->count); + radix_tree_delete(&sbi->extent_tree_root, inode->i_ino); + kmem_cache_free(extent_tree_slab, et); + sbi->total_ext_tree--; + up_write(&sbi->extent_tree_lock); + + 
F2FS_I(inode)->extent_tree = NULL; + + trace_f2fs_destroy_extent_tree(inode, node_cnt); +} + +bool f2fs_lookup_extent_cache(struct inode *inode, pgoff_t pgofs, + struct extent_info *ei) +{ + if (!f2fs_may_extent_tree(inode)) + return false; + + return f2fs_lookup_extent_tree(inode, pgofs, ei); +} + +void f2fs_update_extent_cache(struct dnode_of_data *dn) +{ + struct f2fs_inode_info *fi = F2FS_I(dn->inode); + pgoff_t fofs; + + if (!f2fs_may_extent_tree(dn->inode)) + return; + + f2fs_bug_on(F2FS_I_SB(dn->inode), dn->data_blkaddr == NEW_ADDR); + + fofs = start_bidx_of_node(ofs_of_node(dn->node_page), fi) + + dn->ofs_in_node; + + if (f2fs_update_extent_tree(dn->inode, fofs, dn->data_blkaddr)) + sync_inode_page(dn); +} + +void init_extent_cache_info(struct f2fs_sb_info *sbi) +{ + INIT_RADIX_TREE(&sbi->extent_tree_root, GFP_NOIO); + init_rwsem(&sbi->extent_tree_lock); + INIT_LIST_HEAD(&sbi->extent_list); + spin_lock_init(&sbi->extent_lock); + sbi->total_ext_tree = 0; + atomic_set(&sbi->total_ext_node, 0); +} + +int __init create_extent_cache(void) +{ + extent_tree_slab = f2fs_kmem_cache_create("f2fs_extent_tree", + sizeof(struct extent_tree)); + if (!extent_tree_slab) + return -ENOMEM; + extent_node_slab = f2fs_kmem_cache_create("f2fs_extent_node", + sizeof(struct extent_node)); + if (!extent_node_slab) { + kmem_cache_destroy(extent_tree_slab); + return -ENOMEM; + } + return 0; +} + +void destroy_extent_cache(void) +{ + kmem_cache_destroy(extent_node_slab); + kmem_cache_destroy(extent_tree_slab); +} diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 1e6f54d8b464..88b05cba3d4a 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1766,20 +1766,12 @@ void f2fs_submit_page_mbio(struct f2fs_io_info *); void set_data_blkaddr(struct dnode_of_data *); int reserve_new_block(struct dnode_of_data *); int f2fs_reserve_block(struct dnode_of_data *, pgoff_t); -unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *, int); -void f2fs_init_extent_tree(struct inode *, struct 
f2fs_extent *); -unsigned int f2fs_destroy_extent_node(struct inode *); -void f2fs_destroy_extent_tree(struct inode *); -void f2fs_update_extent_cache(struct dnode_of_data *); struct page *get_read_data_page(struct inode *, pgoff_t, int); struct page *find_data_page(struct inode *, pgoff_t); struct page *get_lock_data_page(struct inode *, pgoff_t); struct page *get_new_data_page(struct inode *, struct page *, pgoff_t, bool); int do_write_data_page(struct f2fs_io_info *); int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *, u64, u64); -void init_extent_cache_info(struct f2fs_sb_info *); -int __init create_extent_cache(void); -void destroy_extent_cache(void); void f2fs_invalidate_page(struct page *, unsigned int, unsigned int); int f2fs_release_page(struct page *, gfp_t); @@ -1976,6 +1968,20 @@ unsigned long f2fs_shrink_scan(struct shrinker *, struct shrink_control *); void f2fs_join_shrinker(struct f2fs_sb_info *); void f2fs_leave_shrinker(struct f2fs_sb_info *); +/* + * extent_cache.c + */ +unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *, int); +void f2fs_drop_largest_extent(struct inode *, pgoff_t); +void f2fs_init_extent_tree(struct inode *, struct f2fs_extent *); +unsigned int f2fs_destroy_extent_node(struct inode *); +void f2fs_destroy_extent_tree(struct inode *); +bool f2fs_lookup_extent_cache(struct inode *, pgoff_t, struct extent_info *); +void f2fs_update_extent_cache(struct dnode_of_data *); +void init_extent_cache_info(struct f2fs_sb_info *); +int __init create_extent_cache(void); +void destroy_extent_cache(void); + /* * crypto support */ -- cgit v1.2.3 From c1c1b58359d45e1a9f236ce5a40d50720c07c70e Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 10 Jul 2015 18:08:10 +0800 Subject: f2fs: add new ioctl F2FS_IOC_GARBAGE_COLLECT When background gc is off, the only way to trigger gc is executing a force gc in some operations who wants to grab space in disk. 
The executing condition is limited: to execute force gc, we should wait for the time when there is almost no more free section for LFS allocation. This seems not reasonable for our user who wants to control triggering gc by himself. This patch introduces F2FS_IOC_GARBAGE_COLLECT interface for triggering garbage collection by using ioctl. It provides our users one more option to trigger gc. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 1 + fs/f2fs/file.c | 32 ++++++++++++++++++++++++++++++++ fs/f2fs/gc.h | 6 ++++++ 3 files changed, 39 insertions(+) (limited to 'fs/f2fs') diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 88b05cba3d4a..673623b36901 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -228,6 +228,7 @@ static inline bool __has_cursum_space(struct f2fs_summary_block *sum, int size, #define F2FS_IOC_START_VOLATILE_WRITE _IO(F2FS_IOCTL_MAGIC, 3) #define F2FS_IOC_RELEASE_VOLATILE_WRITE _IO(F2FS_IOCTL_MAGIC, 4) #define F2FS_IOC_ABORT_VOLATILE_WRITE _IO(F2FS_IOCTL_MAGIC, 5) +#define F2FS_IOC_GARBAGE_COLLECT _IO(F2FS_IOCTL_MAGIC, 6) #define F2FS_IOC_SET_ENCRYPTION_POLICY \ _IOR('f', 19, struct f2fs_encryption_policy) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index fe8398f1d627..dcc01137fca0 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -27,6 +27,7 @@ #include "segment.h" #include "xattr.h" #include "acl.h" +#include "gc.h" #include "trace.h" #include @@ -1558,6 +1559,35 @@ got_it: return 0; } +static int f2fs_ioc_gc(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + __u32 i, count; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (get_user(count, (__u32 __user *)arg)) + return -EFAULT; + + if (!count || count > F2FS_BATCH_GC_MAX_NUM) + return -EINVAL; + + for (i = 0; i < count; i++) { + if (!mutex_trylock(&sbi->gc_mutex)) + break; + + if (f2fs_gc(sbi)) + break; + } + + if (put_user(i, (__u32 __user *)arg)) + return -EFAULT; + + return 0; 
+} + long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { switch (cmd) { @@ -1587,6 +1617,8 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return f2fs_ioc_get_encryption_policy(filp, arg); case F2FS_IOC_GET_ENCRYPTION_PWSALT: return f2fs_ioc_get_encryption_pwsalt(filp, arg); + case F2FS_IOC_GARBAGE_COLLECT: + return f2fs_ioc_gc(filp, arg); default: return -ENOTTY; } diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h index b4a65be9f7d3..c5a055b3376e 100644 --- a/fs/f2fs/gc.h +++ b/fs/f2fs/gc.h @@ -19,6 +19,12 @@ #define LIMIT_INVALID_BLOCK 40 /* percentage over total user space */ #define LIMIT_FREE_BLOCK 40 /* percentage over invalid + free space */ +/* + * with this macro, we can control the max time we do garbage collection, + * when user triggers batch mode gc by ioctl. + */ +#define F2FS_BATCH_GC_MAX_NUM 16 + /* Search max. number of dirty segments to select a victim segment */ #define DEF_MAX_VICTIM_SEARCH 4096 /* covers 8GB */ -- cgit v1.2.3 From 5b3391244d1c89bb4c8e1b4e4916fb4965fb71f9 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 13 Jul 2015 17:43:19 +0800 Subject: f2fs: warm up cold page after mmaped write With cost-benefit method, background gc will consider old section with fewer valid blocks as candidate victim, these old blocks in section will be treated as cold data, and later will be moved into cold segment. But if the gcing page is attached by user through buffered or mmaped write, we should reset the page as non-cold one, because this page may have more opportunity for further updating. So fix to add clearing code for the missed 'mmap' case. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs/f2fs') diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index dcc01137fca0..9c40f8cfb77c 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -86,6 +86,8 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma, mapped: /* fill the page */ f2fs_wait_on_page_writeback(page, DATA); + /* if gced page is attached, don't write to cold segment */ + clear_cold_data(page); out: sb_end_pagefault(inode->i_sb); return block_page_mkwrite_return(err); -- cgit v1.2.3 From bd936f840779366b61300c0f4f752dd1b52b1ca3 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 13 Jul 2015 17:44:25 +0800 Subject: f2fs: cleanup write_orphan_inodes Previously, since 'commit 4531929e3922 ("f2fs: move grabing orphan pages out of protection region")' was committed, in write_orphan_inodes(), we will grab all meta page in a batch before we use them under spinlock, so that we can avoid large time delay of grabbing meta pages under spinlock. Now, 'commit d6c67a4fee86 ("f2fs: revmove spin_lock for write_orphan_inodes")' remove the spinlock in write_orphan_inodes, so there is no issue we describe above, we'd better recover to move the grab operation to original place for readability. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index de7a0d6a371a..60327027137f 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -504,7 +504,7 @@ static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t start_blk) struct list_head *head; struct f2fs_orphan_block *orphan_blk = NULL; unsigned int nentries = 0; - unsigned short index; + unsigned short index = 1; unsigned short orphan_blocks; struct page *page = NULL; struct ino_entry *orphan = NULL; @@ -512,11 +512,6 @@ static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t start_blk) orphan_blocks = GET_ORPHAN_BLOCKS(im->ino_num); - for (index = 0; index < orphan_blocks; index++) - grab_meta_page(sbi, start_blk + index); - - index = 1; - /* * we don't need to do spin_lock(&im->ino_lock) here, since all the * orphan inode operations are covered under f2fs_lock_op(). 
@@ -527,12 +522,10 @@ static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t start_blk) /* loop for each orphan inode entry and write them in Jornal block */ list_for_each_entry(orphan, head, list) { if (!page) { - page = find_get_page(META_MAPPING(sbi), start_blk++); - f2fs_bug_on(sbi, !page); + page = grab_meta_page(sbi, start_blk++); orphan_blk = (struct f2fs_orphan_block *)page_address(page); memset(orphan_blk, 0, sizeof(*orphan_blk)); - f2fs_put_page(page, 0); } orphan_blk->ino[nentries++] = cpu_to_le32(orphan->ino); -- cgit v1.2.3 From 037fe70c9a6cebe11ae13402994b844e907ebe0c Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 13 Jul 2015 17:45:19 +0800 Subject: f2fs: correct return value of ->setxattr This patch fixes to return correct error number of ->setxattr, which is reported by xfstest tests/generic/026 as below: generic/026 - output mismatch --- tests/generic/026.out +++ results/generic/026.out.bad @@ -4,6 +4,6 @@ 1 below acl max acl max 1 above acl max -chacl: cannot set access acl on "largeaclfile": Argument list too long +chacl: cannot set access acl on "largeaclfile": Numerical result out of range use 16 aces use 17 aces ... 
Ran: generic/026 Failures: generic/026 Failed 1 of 1 tests Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/xattr.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 07449b980acb..4de2286c0e4d 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -499,9 +499,12 @@ static int __f2fs_setxattr(struct inode *inode, int index, len = strlen(name); - if (len > F2FS_NAME_LEN || size > MAX_VALUE_LEN(inode)) + if (len > F2FS_NAME_LEN) return -ERANGE; + if (size > MAX_VALUE_LEN(inode)) + return -E2BIG; + base_addr = read_all_xattrs(inode, ipage); if (!base_addr) goto exit; -- cgit v1.2.3 From 8f46dcaea8d9d1552f4071f1ddeeca4427c1d83a Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 14 Jul 2015 18:56:10 +0800 Subject: f2fs: expose f2fs_write_cache_pages If there are gced dirty pages and normal dirty pages in the mapping of one inode, we might writeback them alternately with discontinuous block address, resulting in low performance. This patch introduces f2fs_write_cache_pages with codes copied from write_cache_pages in mm/page-writeback.c. In this function, we refactor flow with two steps: 1) writeback all cold type pages. 2) writeback all non-cold type pages. By using this method, f2fs will writeback dirty pages with the same temperature in bunch mode, it makes writeouted block being with more continuous address, so they can be merged as much as possible in f2fs bio cache, and also it will reduce the chance of submitting small IO from block layer. Test environment: 8g nokia sd card (very old sd card, but it shows better effect when testing with this patch, and with a 32g kingston sd card, I didn't see much more improvement). Test step: 1. touch testfile; 2. truncate -s 512K testfile; 3. write all pages with odd index; 4. trigger gc by ioctl; 5. write all pages with even index; 6. time fsync testfile. 
before: real 0m0.402s user 0m0.000s sys 0m0.000s after: real 0m0.143s user 0m0.004s sys 0m0.004s Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 136 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 135 insertions(+), 1 deletion(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index ef30b59756c6..e58562e70da0 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -1127,6 +1128,139 @@ static int __f2fs_writepage(struct page *page, struct writeback_control *wbc, return ret; } +/* + * This function was copied from write_cche_pages from mm/page-writeback.c. + * The major change is making write step of cold data page separately from + * warm/hot data page. + */ +static int f2fs_write_cache_pages(struct address_space *mapping, + struct writeback_control *wbc, writepage_t writepage, + void *data) +{ + int ret = 0; + int done = 0; + struct pagevec pvec; + int nr_pages; + pgoff_t uninitialized_var(writeback_index); + pgoff_t index; + pgoff_t end; /* Inclusive */ + pgoff_t done_index; + int cycled; + int range_whole = 0; + int tag; + int step = 0; + + pagevec_init(&pvec, 0); +next: + if (wbc->range_cyclic) { + writeback_index = mapping->writeback_index; /* prev offset */ + index = writeback_index; + if (index == 0) + cycled = 1; + else + cycled = 0; + end = -1; + } else { + index = wbc->range_start >> PAGE_CACHE_SHIFT; + end = wbc->range_end >> PAGE_CACHE_SHIFT; + if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) + range_whole = 1; + cycled = 1; /* ignore range_cyclic tests */ + } + if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) + tag = PAGECACHE_TAG_TOWRITE; + else + tag = PAGECACHE_TAG_DIRTY; +retry: + if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) + tag_pages_for_writeback(mapping, index, end); + done_index = index; + while (!done && (index <= end)) { + int i; + + nr_pages = 
pagevec_lookup_tag(&pvec, mapping, &index, tag, + min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1); + if (nr_pages == 0) + break; + + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + + if (page->index > end) { + done = 1; + break; + } + + done_index = page->index; + + lock_page(page); + + if (unlikely(page->mapping != mapping)) { +continue_unlock: + unlock_page(page); + continue; + } + + if (!PageDirty(page)) { + /* someone wrote it for us */ + goto continue_unlock; + } + + if (step == 0 && !is_cold_data(page)) + goto continue_unlock; + if (step == 1 && is_cold_data(page)) + goto continue_unlock; + + if (PageWriteback(page)) { + if (wbc->sync_mode != WB_SYNC_NONE) + f2fs_wait_on_page_writeback(page, DATA); + else + goto continue_unlock; + } + + BUG_ON(PageWriteback(page)); + if (!clear_page_dirty_for_io(page)) + goto continue_unlock; + + ret = (*writepage)(page, wbc, data); + if (unlikely(ret)) { + if (ret == AOP_WRITEPAGE_ACTIVATE) { + unlock_page(page); + ret = 0; + } else { + done_index = page->index + 1; + done = 1; + break; + } + } + + if (--wbc->nr_to_write <= 0 && + wbc->sync_mode == WB_SYNC_NONE) { + done = 1; + break; + } + } + pagevec_release(&pvec); + cond_resched(); + } + + if (step < 1) { + step++; + goto next; + } + + if (!cycled && !done) { + cycled = 1; + index = 0; + end = writeback_index - 1; + goto retry; + } + if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) + mapping->writeback_index = done_index; + + return ret; +} + static int f2fs_write_data_pages(struct address_space *mapping, struct writeback_control *wbc) { @@ -1157,7 +1291,7 @@ static int f2fs_write_data_pages(struct address_space *mapping, mutex_lock(&sbi->writepages); locked = true; } - ret = write_cache_pages(mapping, wbc, __f2fs_writepage, mapping); + ret = f2fs_write_cache_pages(mapping, wbc, __f2fs_writepage, mapping); f2fs_submit_merged_bio(sbi, DATA, WRITE); if (locked) mutex_unlock(&sbi->writepages); -- cgit v1.2.3 From 
1b77c416e7dfe317277057c32baa67ea9e486ae7 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 13 Jul 2015 18:31:24 -0700 Subject: f2fs: use a page temporarily for encrypted gced page That encrypted page is used temporarily, so we don't need to mark it accessed. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 2701e05af991..fcb263af58b3 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -552,7 +552,10 @@ static void move_encrypted_block(struct inode *inode, block_t bidx) fio.page = page; fio.blk_addr = dn.data_blkaddr; - fio.encrypted_page = grab_cache_page(META_MAPPING(fio.sbi), fio.blk_addr); + fio.encrypted_page = pagecache_get_page(META_MAPPING(fio.sbi), + fio.blk_addr, + FGP_LOCK|FGP_CREAT, + GFP_NOFS); if (!fio.encrypted_page) goto put_out; -- cgit v1.2.3 From d5e8f6c9800c382cc55d8df801775d51311f8f21 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 15 Jul 2015 17:28:53 +0800 Subject: f2fs: stat inline xattr inode number This patch adds to stat the number of inline xattr inode for showing in debugfs. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/debug.c | 4 ++++ fs/f2fs/f2fs.h | 16 +++++++++++++++- fs/f2fs/inode.c | 2 ++ fs/f2fs/namei.c | 1 + 4 files changed, 22 insertions(+), 1 deletion(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 75176e0dd6c8..2aeaf4e214db 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -49,6 +49,7 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->valid_count = valid_user_blocks(sbi); si->valid_node_count = valid_node_count(sbi); si->valid_inode_count = valid_inode_count(sbi); + si->inline_xattr = atomic_read(&sbi->inline_xattr); si->inline_inode = atomic_read(&sbi->inline_inode); si->inline_dir = atomic_read(&sbi->inline_dir); si->utilization = utilization(sbi); @@ -226,6 +227,8 @@ static int stat_show(struct seq_file *s, void *v) seq_printf(s, "Other: %u)\n - Data: %u\n", si->valid_node_count - si->valid_inode_count, si->valid_count - si->valid_node_count); + seq_printf(s, " - Inline_xattr Inode: %u\n", + si->inline_xattr); seq_printf(s, " - Inline_data Inode: %u\n", si->inline_inode); seq_printf(s, " - Inline_dentry Inode: %u\n", @@ -366,6 +369,7 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi) si->sbi = sbi; sbi->stat_info = si; + atomic_set(&sbi->inline_xattr, 0); atomic_set(&sbi->inline_inode, 0); atomic_set(&sbi->inline_dir, 0); atomic_set(&sbi->inplace_count, 0); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 673623b36901..b18b85267711 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -782,6 +782,7 @@ struct f2fs_sb_info { unsigned int block_count[2]; /* # of allocated blocks */ atomic_t inplace_count; /* # of inplace update */ int total_hit_ext, read_hit_ext; /* extent cache hit ratio */ + atomic_t inline_xattr; /* # of inline_xattr inodes */ atomic_t inline_inode; /* # of inline_data inodes */ atomic_t inline_dir; /* # of inline_dentry inodes */ int bg_gc; /* background gc calls */ @@ -1804,7 +1805,8 @@ struct f2fs_stat_info { int ndirty_node, 
ndirty_dent, ndirty_dirs, ndirty_meta; int nats, dirty_nats, sits, dirty_sits, fnids; int total_count, utilization; - int bg_gc, inline_inode, inline_dir, inmem_pages, wb_pages; + int bg_gc, inmem_pages, wb_pages; + int inline_xattr, inline_inode, inline_dir; unsigned int valid_count, valid_node_count, valid_inode_count; unsigned int bimodal, avg_vblocks; int util_free, util_valid, util_invalid; @@ -1837,6 +1839,16 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) #define stat_dec_dirty_dir(sbi) ((sbi)->n_dirty_dirs--) #define stat_inc_total_hit(sb) ((F2FS_SB(sb))->total_hit_ext++) #define stat_inc_read_hit(sb) ((F2FS_SB(sb))->read_hit_ext++) +#define stat_inc_inline_xattr(inode) \ + do { \ + if (f2fs_has_inline_xattr(inode)) \ + (atomic_inc(&F2FS_I_SB(inode)->inline_xattr)); \ + } while (0) +#define stat_dec_inline_xattr(inode) \ + do { \ + if (f2fs_has_inline_xattr(inode)) \ + (atomic_dec(&F2FS_I_SB(inode)->inline_xattr)); \ + } while (0) #define stat_inc_inline_inode(inode) \ do { \ if (f2fs_has_inline_data(inode)) \ @@ -1907,6 +1919,8 @@ void f2fs_destroy_root_stats(void); #define stat_dec_dirty_dir(sbi) #define stat_inc_total_hit(sb) #define stat_inc_read_hit(sb) +#define stat_inc_inline_xattr(inode) +#define stat_dec_inline_xattr(inode) #define stat_inc_inline_inode(inode) #define stat_dec_inline_inode(inode) #define stat_inc_inline_dir(inode) diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 978a7261a791..5b7547f0bdea 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -155,6 +155,7 @@ static int do_read_inode(struct inode *inode) f2fs_put_page(node_page, 1); + stat_inc_inline_xattr(inode); stat_inc_inline_inode(inode); stat_inc_inline_dir(inode); @@ -350,6 +351,7 @@ void f2fs_evict_inode(struct inode *inode) sb_end_intwrite(inode->i_sb); no_delete: + stat_dec_inline_xattr(inode); stat_dec_inline_dir(inode); stat_dec_inline_inode(inode); diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 1856d5ecd809..97e97c41b979 100644 --- 
a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -67,6 +67,7 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) f2fs_init_extent_tree(inode, NULL); + stat_inc_inline_xattr(inode); stat_inc_inline_inode(inode); stat_inc_inline_dir(inode); -- cgit v1.2.3 From 727edac572034557d207b293a47de25145e3d58c Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 15 Jul 2015 17:29:49 +0800 Subject: f2fs: use atomic_t to record hit ratio info of extent cache Variables for recording extent cache ratio info were updated without protection, this patch tries to alter them to atomic_t type for more accurate stat. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/debug.c | 7 +++++-- fs/f2fs/extent_cache.c | 6 +++--- fs/f2fs/f2fs.h | 7 ++++--- 3 files changed, 12 insertions(+), 8 deletions(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 2aeaf4e214db..bc215fd6c402 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -33,8 +33,8 @@ static void update_general_status(struct f2fs_sb_info *sbi) int i; /* validation check of the segment numbers */ - si->hit_ext = sbi->read_hit_ext; - si->total_ext = sbi->total_hit_ext; + si->hit_ext = atomic_read(&sbi->read_hit_ext); + si->total_ext = atomic_read(&sbi->total_hit_ext); si->ext_tree = sbi->total_ext_tree; si->ext_node = atomic_read(&sbi->total_ext_node); si->ndirty_node = get_pages(sbi, F2FS_DIRTY_NODES); @@ -369,6 +369,9 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi) si->sbi = sbi; sbi->stat_info = si; + atomic_set(&sbi->total_hit_ext, 0); + atomic_set(&sbi->read_hit_ext, 0); + atomic_set(&sbi->inline_xattr, 0); atomic_set(&sbi->inline_inode, 0); atomic_set(&sbi->inline_dir, 0); diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index 5f78fc1e818a..362df8cd54d4 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -279,7 +279,7 @@ static bool f2fs_lookup_extent_tree(struct inode *inode, pgoff_t pgofs, et->largest.fofs + et->largest.len > pgofs) { *ei = 
et->largest; ret = true; - stat_inc_read_hit(sbi->sb); + stat_inc_read_hit(sbi); goto out; } @@ -292,10 +292,10 @@ static bool f2fs_lookup_extent_tree(struct inode *inode, pgoff_t pgofs, et->cached_en = en; spin_unlock(&sbi->extent_lock); ret = true; - stat_inc_read_hit(sbi->sb); + stat_inc_read_hit(sbi); } out: - stat_inc_total_hit(sbi->sb); + stat_inc_total_hit(sbi); read_unlock(&et->lock); trace_f2fs_lookup_extent_tree_end(inode, pgofs, ei); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index b18b85267711..38ba525c3d6f 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -781,7 +781,8 @@ struct f2fs_sb_info { unsigned int segment_count[2]; /* # of allocated segments */ unsigned int block_count[2]; /* # of allocated blocks */ atomic_t inplace_count; /* # of inplace update */ - int total_hit_ext, read_hit_ext; /* extent cache hit ratio */ + atomic_t total_hit_ext; /* # of lookup extent cache */ + atomic_t read_hit_ext; /* # of hit extent cache */ atomic_t inline_xattr; /* # of inline_xattr inodes */ atomic_t inline_inode; /* # of inline_data inodes */ atomic_t inline_dir; /* # of inline_dentry inodes */ @@ -1837,8 +1838,8 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) #define stat_inc_bggc_count(sbi) ((sbi)->bg_gc++) #define stat_inc_dirty_dir(sbi) ((sbi)->n_dirty_dirs++) #define stat_dec_dirty_dir(sbi) ((sbi)->n_dirty_dirs--) -#define stat_inc_total_hit(sb) ((F2FS_SB(sb))->total_hit_ext++) -#define stat_inc_read_hit(sb) ((F2FS_SB(sb))->read_hit_ext++) +#define stat_inc_total_hit(sbi) (atomic_inc(&(sbi)->total_hit_ext)) +#define stat_inc_read_hit(sbi) (atomic_inc(&(sbi)->read_hit_ext)) #define stat_inc_inline_xattr(inode) \ do { \ if (f2fs_has_inline_xattr(inode)) \ -- cgit v1.2.3 From 86531d6b84bc096d5d9dbc23333df0ab8d347763 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 15 Jul 2015 13:08:21 -0700 Subject: f2fs: callers take care of the page from bio error This patch changes for a caller to handle the page after its bio gets an 
error. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 4 +++- fs/f2fs/data.c | 27 +++++++++++++-------------- fs/f2fs/node.c | 21 ++++++++++----------- 3 files changed, 26 insertions(+), 26 deletions(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 60327027137f..6fb696da42e8 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -69,8 +69,10 @@ repeat: fio.page = page; - if (f2fs_submit_page_bio(&fio)) + if (f2fs_submit_page_bio(&fio)) { + f2fs_put_page(page, 1); goto repeat; + } lock_page(page); if (unlikely(page->mapping != mapping)) { diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index e58562e70da0..7f51296fbbf6 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -156,7 +156,6 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio) if (bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) < PAGE_CACHE_SIZE) { bio_put(bio); - f2fs_put_page(page, 1); return -EFAULT; } @@ -292,15 +291,13 @@ struct page *get_read_data_page(struct inode *inode, pgoff_t index, int rw) set_new_dnode(&dn, inode, NULL, NULL, 0); err = get_dnode_of_data(&dn, index, LOOKUP_NODE); - if (err) { - f2fs_put_page(page, 1); - return ERR_PTR(err); - } + if (err) + goto put_err; f2fs_put_dnode(&dn); if (unlikely(dn.data_blkaddr == NULL_ADDR)) { - f2fs_put_page(page, 1); - return ERR_PTR(-ENOENT); + err = -ENOENT; + goto put_err; } got_it: if (PageUptodate(page)) { @@ -325,8 +322,12 @@ got_it: fio.page = page; err = f2fs_submit_page_bio(&fio); if (err) - return ERR_PTR(err); + goto put_err; return page; + +put_err: + f2fs_put_page(page, 1); + return ERR_PTR(err); } struct page *find_data_page(struct inode *inode, pgoff_t index) @@ -1322,7 +1323,8 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping, { struct inode *inode = mapping->host; struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct page *page, *ipage; + struct page *page = NULL; + struct page *ipage; pgoff_t index = ((unsigned long long) pos) >> 
PAGE_CACHE_SHIFT; struct dnode_of_data dn; int err = 0; @@ -1412,7 +1414,6 @@ put_next: lock_page(page); if (unlikely(!PageUptodate(page))) { - f2fs_put_page(page, 1); err = -EIO; goto fail; } @@ -1424,10 +1425,8 @@ put_next: /* avoid symlink page */ if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) { err = f2fs_decrypt_one(inode, page); - if (err) { - f2fs_put_page(page, 1); + if (err) goto fail; - } } } out_update: @@ -1440,8 +1439,8 @@ put_fail: f2fs_put_dnode(&dn); unlock_fail: f2fs_unlock_op(sbi); - f2fs_put_page(page, 1); fail: + f2fs_put_page(page, 1); f2fs_write_failed(mapping, pos + len); return err; } diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index a05eb35a372c..7dd2b9d78a45 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -991,8 +991,7 @@ fail: /* * Caller should do after getting the following values. * 0: f2fs_put_page(page, 0) - * LOCKED_PAGE: f2fs_put_page(page, 1) - * error: nothing + * LOCKED_PAGE or error: f2fs_put_page(page, 1) */ static int read_node_page(struct page *page, int rw) { @@ -1010,7 +1009,6 @@ static int read_node_page(struct page *page, int rw) if (unlikely(ni.blk_addr == NULL_ADDR)) { ClearPageUptodate(page); - f2fs_put_page(page, 1); return -ENOENT; } @@ -1041,10 +1039,7 @@ void ra_node_page(struct f2fs_sb_info *sbi, nid_t nid) return; err = read_node_page(apage, READA); - if (err == 0) - f2fs_put_page(apage, 0); - else if (err == LOCKED_PAGE) - f2fs_put_page(apage, 1); + f2fs_put_page(apage, err ? 
1 : 0); } struct page *get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid) @@ -1057,10 +1052,12 @@ repeat: return ERR_PTR(-ENOMEM); err = read_node_page(page, READ_SYNC); - if (err < 0) + if (err < 0) { + f2fs_put_page(page, 1); return ERR_PTR(err); - else if (err != LOCKED_PAGE) + } else if (err != LOCKED_PAGE) { lock_page(page); + } if (unlikely(!PageUptodate(page) || nid != nid_of_node(page))) { ClearPageUptodate(page); @@ -1096,10 +1093,12 @@ repeat: return ERR_PTR(-ENOMEM); err = read_node_page(page, READ_SYNC); - if (err < 0) + if (err < 0) { + f2fs_put_page(page, 1); return ERR_PTR(err); - else if (err == LOCKED_PAGE) + } else if (err == LOCKED_PAGE) { goto page_hit; + } blk_start_plug(&plug); -- cgit v1.2.3 From 0f825ee6e873ac0daf5394c5ec76ca2f3d540370 Mon Sep 17 00:00:00 2001 From: Fan Li Date: Wed, 15 Jul 2015 18:05:17 +0800 Subject: f2fs: add new interfaces for extent tree Add a lookup and an insertion interface for extent tree. The new lookup return the insert position and the prev/next extents closest to the offset we lookup when find no match. The new insertion uses above parameters to improve performance. There are three possible insertions after the lookup in f2fs_update_extent_tree, two of them insert parts of removed extent back to tree, since no merge happens during this process, new insertion skips the merge check in this scenario; the other insertion inserts a new extent to tree, new insertion uses prev/next extent and insert position to insert this extent directly, and save the time of searching down the tree. As long as tree remains unchanged between lookup and insertion, this would work fine. And the new lookup would be useful when add multi-blocks extent support for insertion interface. 
Signed-off-by: Fan li Signed-off-by: Jaegeuk Kim --- fs/f2fs/extent_cache.c | 139 ++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 132 insertions(+), 7 deletions(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index 362df8cd54d4..32fae8ad5b7e 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -302,6 +302,126 @@ out: return ret; } + +/* + * lookup extent at @fofs, if hit, return the extent + * if not, return NULL and + * @prev_ex: extent before fofs + * @next_ex: extent after fofs + * @insert_p: insert point for new extent at fofs + * in order to simpfy the insertion after. + * tree must stay unchanged between lookup and insertion. + */ +static struct extent_node *__lookup_extent_tree_ret(struct extent_tree *et, + unsigned int fofs, struct extent_node **prev_ex, + struct extent_node **next_ex, + struct rb_node ***insert_p, + struct rb_node **insert_parent) +{ + struct rb_node **pnode = &et->root.rb_node; + struct rb_node *parent = NULL, *tmp_node; + struct extent_node *en; + + if (et->cached_en) { + struct extent_info *cei = &et->cached_en->ei; + + if (cei->fofs <= fofs && cei->fofs + cei->len > fofs) + return et->cached_en; + } + + while (*pnode) { + parent = *pnode; + en = rb_entry(*pnode, struct extent_node, rb_node); + + if (fofs < en->ei.fofs) + pnode = &(*pnode)->rb_left; + else if (fofs >= en->ei.fofs + en->ei.len) + pnode = &(*pnode)->rb_right; + else + return en; + } + + *insert_p = pnode; + *insert_parent = parent; + + en = rb_entry(parent, struct extent_node, rb_node); + tmp_node = parent; + if (parent && fofs > en->ei.fofs) + tmp_node = rb_next(parent); + *next_ex = tmp_node ? + rb_entry(tmp_node, struct extent_node, rb_node) : NULL; + + tmp_node = parent; + if (parent && fofs < en->ei.fofs) + tmp_node = rb_prev(parent); + *prev_ex = tmp_node ? 
+ rb_entry(tmp_node, struct extent_node, rb_node) : NULL; + + return NULL; +} + +static struct extent_node *__insert_extent_tree_ret(struct f2fs_sb_info *sbi, + struct extent_tree *et, struct extent_info *ei, + struct extent_node **den, + struct extent_node *prev_ex, + struct extent_node *next_ex, + struct rb_node **insert_p, + struct rb_node *insert_parent) +{ + struct rb_node **p = &et->root.rb_node; + struct rb_node *parent = NULL; + struct extent_node *en = NULL; + int merged = 0; + + if (prev_ex && __is_back_mergeable(ei, &prev_ex->ei)) { + f2fs_bug_on(sbi, !den); + merged = 1; + prev_ex->ei.len += ei->len; + ei = &prev_ex->ei; + en = prev_ex; + } + if (next_ex && __is_front_mergeable(ei, &next_ex->ei)) { + f2fs_bug_on(sbi, !den); + if (merged++) { + __detach_extent_node(sbi, et, prev_ex); + *den = prev_ex; + } + next_ex->ei.fofs = ei->fofs; + next_ex->ei.blk = ei->blk; + next_ex->ei.len += ei->len; + en = next_ex; + } + if (merged) + goto update_out; + + if (insert_p && insert_parent) { + parent = insert_parent; + p = insert_p; + goto do_insert; + } + + while (*p) { + parent = *p; + en = rb_entry(parent, struct extent_node, rb_node); + + if (ei->fofs < en->ei.fofs) + p = &(*p)->rb_left; + else if (ei->fofs >= en->ei.fofs + en->ei.len) + p = &(*p)->rb_right; + else + f2fs_bug_on(sbi, 1); + } +do_insert: + en = __attach_extent_node(sbi, et, ei, parent, p); + if (!en) + return NULL; +update_out: + if (en->ei.len > et->largest.len) + et->largest = en->ei; + et->cached_en = en; + return en; +} + /* return true, if on-disk extent should be updated */ static bool f2fs_update_extent_tree(struct inode *inode, pgoff_t fofs, block_t blkaddr) @@ -309,8 +429,9 @@ static bool f2fs_update_extent_tree(struct inode *inode, pgoff_t fofs, struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct extent_tree *et = F2FS_I(inode)->extent_tree; struct extent_node *en = NULL, *en1 = NULL, *en2 = NULL, *en3 = NULL; - struct extent_node *den = NULL; + struct extent_node *den = NULL, 
*prev_ex = NULL, *next_ex = NULL; struct extent_info ei, dei, prev; + struct rb_node **insert_p = NULL, *insert_parent = NULL; unsigned int endofs; if (!et) @@ -332,20 +453,22 @@ static bool f2fs_update_extent_tree(struct inode *inode, pgoff_t fofs, f2fs_drop_largest_extent(inode, fofs); /* 1. lookup and remove existing extent info in cache */ - en = __lookup_extent_tree(et, fofs); + en = __lookup_extent_tree_ret(et, fofs, &prev_ex, &next_ex, + &insert_p, &insert_parent); if (!en) goto update_extent; dei = en->ei; __detach_extent_node(sbi, et, en); - /* 2. if extent can be split more, split and insert the left part */ + /* 2. if extent can be split, try to split it */ if (dei.len > F2FS_MIN_EXTENT_LEN) { /* insert left part of split extent into cache */ if (fofs - dei.fofs >= F2FS_MIN_EXTENT_LEN) { set_extent_info(&ei, dei.fofs, dei.blk, - fofs - dei.fofs); - en1 = __insert_extent_tree(sbi, et, &ei, NULL); + fofs - dei.fofs); + en1 = __insert_extent_tree_ret(sbi, et, &ei, NULL, + NULL, NULL, NULL, NULL); } /* insert right part of split extent into cache */ @@ -353,7 +476,8 @@ static bool f2fs_update_extent_tree(struct inode *inode, pgoff_t fofs, if (endofs - fofs >= F2FS_MIN_EXTENT_LEN) { set_extent_info(&ei, fofs + 1, fofs - dei.fofs + dei.blk + 1, endofs - fofs); - en2 = __insert_extent_tree(sbi, et, &ei, NULL); + en2 = __insert_extent_tree_ret(sbi, et, &ei, NULL, + NULL, NULL, NULL, NULL); } } @@ -361,7 +485,8 @@ update_extent: /* 3. 
update extent in extent cache */ if (blkaddr) { set_extent_info(&ei, fofs, blkaddr, 1); - en3 = __insert_extent_tree(sbi, et, &ei, &den); + en3 = __insert_extent_tree_ret(sbi, et, &ei, &den, + prev_ex, next_ex, insert_p, insert_parent); /* give up extent_cache, if split and small updates happen */ if (dei.len >= 1 && -- cgit v1.2.3 From ecbaa4068f88f96a8ffde37d532e618508394b53 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 16 Jul 2015 18:18:11 +0800 Subject: f2fs: reduce region of cp_rwsem covered in f2fs_do_collapse In f2fs_do_collapse, region cp_rwsem covered is large, since it will be held until all blocks are left shifted, so if we try to collapse small area at the beginning of large file, checkpoint who want to grab writer's lock of cp_rwsem will be delayed for long time. In order to avoid this condition, altering to lock/unlock cp_rwsem each shift operation. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 9c40f8cfb77c..d0114710648e 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -800,11 +800,11 @@ static int f2fs_do_collapse(struct inode *inode, pgoff_t start, pgoff_t end) pgoff_t nrpages = (i_size_read(inode) + PAGE_SIZE - 1) / PAGE_SIZE; int ret = 0; - f2fs_lock_op(sbi); - for (; end < nrpages; start++, end++) { block_t new_addr, old_addr; + f2fs_lock_op(sbi); + set_new_dnode(&dn, inode, NULL, NULL, 0); ret = get_dnode_of_data(&dn, end, LOOKUP_NODE_RA); if (ret && ret != -ENOENT) { @@ -820,13 +820,16 @@ static int f2fs_do_collapse(struct inode *inode, pgoff_t start, pgoff_t end) if (new_addr == NULL_ADDR) { set_new_dnode(&dn, inode, NULL, NULL, 0); ret = get_dnode_of_data(&dn, start, LOOKUP_NODE_RA); - if (ret && ret != -ENOENT) + if (ret && ret != -ENOENT) { goto out; - else if (ret == -ENOENT) + } else if (ret == -ENOENT) { + f2fs_unlock_op(sbi); continue; + } if (dn.data_blkaddr == 
NULL_ADDR) { f2fs_put_dnode(&dn); + f2fs_unlock_op(sbi); continue; } else { truncate_data_blocks_range(&dn, 1); @@ -865,8 +868,9 @@ static int f2fs_do_collapse(struct inode *inode, pgoff_t start, pgoff_t end) f2fs_put_dnode(&dn); } + f2fs_unlock_op(sbi); } - ret = 0; + return 0; out: f2fs_unlock_op(sbi); return ret; -- cgit v1.2.3 From 55f57d2c4259a9a4048cf4629a2c6ba53729188d Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 16 Jul 2015 18:19:02 +0800 Subject: f2fs: fix double lock in handle_failed_inode In handle_failed_inode, there is a potential deadlock which can happen in below call path: - f2fs_create - f2fs_lock_op down_read(cp_rwsem) - f2fs_add_link - __f2fs_add_link - init_inode_metadata - f2fs_init_security failed - truncate_blocks failed - handle_failed_inode - f2fs_truncate - truncate_blocks(..,true) - write_checkpoint - block_operations - f2fs_lock_all down_write(cp_rwsem) - f2fs_lock_op down_read(cp_rwsem) So in this path, we pass parameter to f2fs_truncate to make sure cp_rwsem in truncate_blocks will not be locked again. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 2 +- fs/f2fs/file.c | 6 +++--- fs/f2fs/inode.c | 4 ++-- fs/f2fs/super.c | 2 +- 4 files changed, 7 insertions(+), 7 deletions(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 38ba525c3d6f..e73f2e2453f9 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1579,7 +1579,7 @@ static inline bool f2fs_may_extent_tree(struct inode *inode) int f2fs_sync_file(struct file *, loff_t, loff_t, int); void truncate_data_blocks(struct dnode_of_data *); int truncate_blocks(struct inode *, u64, bool); -void f2fs_truncate(struct inode *); +void f2fs_truncate(struct inode *, bool); int f2fs_getattr(struct vfsmount *, struct dentry *, struct kstat *); int f2fs_setattr(struct dentry *, struct iattr *); int truncate_hole(struct inode *, pgoff_t, pgoff_t); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index d0114710648e..15df014aadc7 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -579,7 +579,7 @@ out: return err; } -void f2fs_truncate(struct inode *inode) +void f2fs_truncate(struct inode *inode, bool lock) { if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))) @@ -593,7 +593,7 @@ void f2fs_truncate(struct inode *inode) return; } - if (!truncate_blocks(inode, i_size_read(inode), true)) { + if (!truncate_blocks(inode, i_size_read(inode), lock)) { inode->i_mtime = inode->i_ctime = CURRENT_TIME; mark_inode_dirty(inode); } @@ -656,7 +656,7 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) if (attr->ia_size <= i_size_read(inode)) { truncate_setsize(inode, attr->ia_size); - f2fs_truncate(inode); + f2fs_truncate(inode, true); f2fs_balance_fs(F2FS_I_SB(inode)); } else { /* diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 5b7547f0bdea..cc4f1082419a 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -343,7 +343,7 @@ void f2fs_evict_inode(struct inode *inode) i_size_write(inode, 0); if (F2FS_HAS_BLOCKS(inode)) - f2fs_truncate(inode); + 
f2fs_truncate(inode, true); f2fs_lock_op(sbi); remove_inode_page(inode); @@ -385,7 +385,7 @@ void handle_failed_inode(struct inode *inode) i_size_write(inode, 0); if (F2FS_HAS_BLOCKS(inode)) - f2fs_truncate(inode); + f2fs_truncate(inode, false); remove_inode_page(inode); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 0083b8559c9b..12eb69dd38af 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -467,7 +467,7 @@ static int f2fs_drop_inode(struct inode *inode) i_size_write(inode, 0); if (F2FS_HAS_BLOCKS(inode)) - f2fs_truncate(inode); + f2fs_truncate(inode, true); sb_end_intwrite(inode->i_sb); -- cgit v1.2.3 From 737f18992ee81cab897336e84c5c7f4e179dfd61 Mon Sep 17 00:00:00 2001 From: Tiezhu Yang Date: Fri, 17 Jul 2015 12:56:00 +0800 Subject: f2fs: optimize f2fs_write_cache_pages The if statement "goto continue_unlock" is exactly the same when each if condition is true that is depended on the value of both "step" and "is_cold_data(page)" are 0 or 1. That means when the value of "step" equals to "is_cold_data(page)", the if condition is true and the if statement "goto continue_unlock" appears only once, so it can be optimized to reduce the duplicated code. 
Signed-off-by: Tiezhu Yang Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 7f51296fbbf6..801b0b0b08f4 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1207,9 +1207,7 @@ continue_unlock: goto continue_unlock; } - if (step == 0 && !is_cold_data(page)) - goto continue_unlock; - if (step == 1 && is_cold_data(page)) + if (step == is_cold_data(page)) goto continue_unlock; if (PageWriteback(page)) { -- cgit v1.2.3 From 6a2905443cf27f9c14889428f14fccfb98ed97f4 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 17 Jul 2015 18:02:39 +0800 Subject: f2fs: skip writing in ->writepages when no dirty pages exist When flushing comes from background, if there is no dirty page in the mapping of inode, we'd better to skip seeking dirty page from mapping for writebacking. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs/f2fs') diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 801b0b0b08f4..e4081fc91012 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1275,6 +1275,10 @@ static int f2fs_write_data_pages(struct address_space *mapping, if (!mapping->a_ops->writepage) return 0; + /* skip writing if there is no dirty page in this inode */ + if (!get_dirty_pages(inode) && wbc->sync_mode == WB_SYNC_NONE) + return 0; + if (S_ISDIR(inode->i_mode) && wbc->sync_mode == WB_SYNC_NONE && get_dirty_pages(inode) < nr_pages_to_skip(sbi, DATA) && available_free_memory(sbi, DIRTY_DENTS)) -- cgit v1.2.3 From a5f64b6aa69b5cc05e198291811a2f3faf95b463 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 17 Jul 2015 18:05:21 +0800 Subject: f2fs: fix to wait all atomic written pages writeback This patch fixes the incorrect range (0, LONG_MAX) which is used in ranged fsync. 
If we use LONG_MAX as the parameter indicating the end of the file range we want to synchronize, then on a 32-bit architecture machine, data after the 4GB offset may not be persisted in storage after ->fsync returns. Here, we alter LONG_MAX to LLONG_MAX to fix this issue. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 15df014aadc7..d4da7fec757d 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1357,7 +1357,7 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp) commit_inmem_pages(inode, false); } - ret = f2fs_sync_file(filp, 0, LONG_MAX, 0); + ret = f2fs_sync_file(filp, 0, LLONG_MAX, 0); mnt_drop_write_file(filp); return ret; } -- cgit v1.2.3 From f4c9c743acedc2f083e6a1d4e186df6a2c12b2fd Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 17 Jul 2015 18:06:35 +0800 Subject: f2fs: convert inline data before set atomic/volatile flag In f2fs_ioc_start_{atomic,volatile}_write, if we fail to convert inline data, we report the error to the user, but the atomic/volatile flag still remains set in the inode, which will impact further writes to this file. Fix it.
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index d4da7fec757d..25d1a2f501dc 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1323,6 +1323,7 @@ static int f2fs_ioc_getversion(struct file *filp, unsigned long arg) static int f2fs_ioc_start_atomic_write(struct file *filp) { struct inode *inode = file_inode(filp); + int ret; if (!inode_owner_or_capable(inode)) return -EACCES; @@ -1332,9 +1333,12 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) if (f2fs_is_atomic_file(inode)) return 0; - set_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE); + ret = f2fs_convert_inline_inode(inode); + if (ret) + return ret; - return f2fs_convert_inline_inode(inode); + set_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE); + return 0; } static int f2fs_ioc_commit_atomic_write(struct file *filp) @@ -1365,6 +1369,7 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp) static int f2fs_ioc_start_volatile_write(struct file *filp) { struct inode *inode = file_inode(filp); + int ret; if (!inode_owner_or_capable(inode)) return -EACCES; @@ -1372,9 +1377,12 @@ static int f2fs_ioc_start_volatile_write(struct file *filp) if (f2fs_is_volatile_file(inode)) return 0; - set_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE); + ret = f2fs_convert_inline_inode(inode); + if (ret) + return ret; - return f2fs_convert_inline_inode(inode); + set_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE); + return 0; } static int f2fs_ioc_release_volatile_write(struct file *filp) -- cgit v1.2.3 From e4e762723a90109c968c6c58f7d9bf4541c22928 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 24 Jul 2015 18:24:45 +0800 Subject: f2fs: fix inline data/dentry stat number leak If we clear inline data/dentry flag in handle_failed_inode, we will fail to decline the stat count of inline data/dentry in f2fs_evict_inode due to no flag in inode. So remove the wrong clearing. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/inode.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index cc4f1082419a..83354433d4d1 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -390,8 +390,6 @@ void handle_failed_inode(struct inode *inode) remove_inode_page(inode); set_inode_flag(F2FS_I(inode), FI_FREE_NID); - clear_inode_flag(F2FS_I(inode), FI_INLINE_DATA); - clear_inode_flag(F2FS_I(inode), FI_INLINE_DENTRY); f2fs_unlock_op(sbi); /* iput will drop the inode object */ -- cgit v1.2.3 From a6d494b6d84697f954aaade204e8a5843078a94f Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 24 Jul 2015 18:26:26 +0800 Subject: f2fs: fix to build free nids from readaheaded nat pages When there is no enough free nids in free nid cache, we will try to readahead FREE_NID_PAGES:4 nat pages into page cache of meta_inode, then, reading nat entries in nat page for adding free nids to free nid cache. But when traversing all nat pages we readaheaded in a circulation, our exit condition is not set right, one more nat page will be scanned without readaheading, resulting worse read performance. This patch fixes to read the correct number nat pages to avoid bad performance. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 7dd2b9d78a45..ac9110788b17 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1532,7 +1532,7 @@ static void build_free_nids(struct f2fs_sb_info *sbi) if (unlikely(nid >= nm_i->max_nid)) nid = 0; - if (i++ == FREE_NID_PAGES) + if (++i >= FREE_NID_PAGES) break; } -- cgit v1.2.3 From edb27deea7cabfff8feb8c62aae647b7673be734 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Sat, 25 Jul 2015 00:52:52 -0700 Subject: f2fs: handle error cases in commit_inmem_pages This patch adds to handle error cases in commit_inmem_pages. 
If an error occurs, it stops to write the pages and return the error right away. Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 2 +- fs/f2fs/file.c | 7 +++++-- fs/f2fs/segment.c | 10 ++++++++-- 3 files changed, 14 insertions(+), 5 deletions(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index e73f2e2453f9..58b05b541a4e 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1697,7 +1697,7 @@ void destroy_node_manager_caches(void); * segment.c */ void register_inmem_page(struct inode *, struct page *); -void commit_inmem_pages(struct inode *, bool); +int commit_inmem_pages(struct inode *, bool); void f2fs_balance_fs(struct f2fs_sb_info *); void f2fs_balance_fs_bg(struct f2fs_sb_info *); int f2fs_issue_flush(struct f2fs_sb_info *); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 25d1a2f501dc..be69a01060a6 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1358,10 +1358,13 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp) if (f2fs_is_atomic_file(inode)) { clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE); - commit_inmem_pages(inode, false); + ret = commit_inmem_pages(inode, false); + if (ret) + goto err_out; } ret = f2fs_sync_file(filp, 0, LLONG_MAX, 0); +err_out: mnt_drop_write_file(filp); return ret; } @@ -1418,7 +1421,7 @@ static int f2fs_ioc_abort_volatile_write(struct file *filp) if (f2fs_is_atomic_file(inode)) { clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE); - commit_inmem_pages(inode, false); + commit_inmem_pages(inode, true); } if (f2fs_is_volatile_file(inode)) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index f7bfc3b7d934..509a2c4bb7d3 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -227,7 +227,7 @@ retry: trace_f2fs_register_inmem_page(page, INMEM); } -void commit_inmem_pages(struct inode *inode, bool abort) +int commit_inmem_pages(struct inode *inode, bool abort) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_inode_info *fi = F2FS_I(inode); @@ -239,6 +239,7 @@ void commit_inmem_pages(struct 
inode *inode, bool abort) .rw = WRITE_SYNC | REQ_PRIO, .encrypted_page = NULL, }; + int err = 0; /* * The abort is true only when f2fs_evict_inode is called. @@ -263,8 +264,12 @@ void commit_inmem_pages(struct inode *inode, bool abort) inode_dec_dirty_pages(inode); trace_f2fs_commit_inmem_page(cur->page, INMEM); fio.page = cur->page; - do_write_data_page(&fio); + err = do_write_data_page(&fio); submit_bio = true; + if (err) { + unlock_page(cur->page); + break; + } } f2fs_put_page(cur->page, 1); } else { @@ -283,6 +288,7 @@ void commit_inmem_pages(struct inode *inode, bool abort) if (submit_bio) f2fs_submit_merged_bio(sbi, DATA, WRITE); } + return err; } /* -- cgit v1.2.3 From 5768dcdd7f7675f9540e648428c8a1cd7208a0fe Mon Sep 17 00:00:00 2001 From: Fan Li Date: Tue, 4 Aug 2015 13:27:51 +0800 Subject: f2fs: change the timing of f2fs_wait_on_page_writeback some backing devices need pages to be stable during writeback. It doesn't matter if the page is completely overwritten or already uptodate, it needs to wait before write. 
Signed-off-by: Fan li Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index e4081fc91012..2692848e7f75 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1383,13 +1383,13 @@ put_next: f2fs_put_dnode(&dn); f2fs_unlock_op(sbi); + f2fs_wait_on_page_writeback(page, DATA); + if (len == PAGE_CACHE_SIZE) goto out_update; if (PageUptodate(page)) goto out_clear; - f2fs_wait_on_page_writeback(page, DATA); - if ((pos & PAGE_CACHE_MASK) >= i_size_read(inode)) { unsigned start = pos & (PAGE_CACHE_SIZE - 1); unsigned end = start + len; -- cgit v1.2.3 From f3f338caad3428fbc4bb563828efc6ecce4d956b Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 29 Jul 2015 17:33:13 +0800 Subject: f2fs: freeze filesystem when fail to update meta page due to IO error In get_meta_page, we guarantee no failure for the returned page, but sometimes, IO error from device will incur returning a non-updated page. Then, we still use this page as an updated one, so an exception could happen when using this kind of page. So in this condition, we'd better freeze fs by making fs readonly and stop doing checkpoint. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'fs/f2fs') diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 6fb696da42e8..9c1acf69bfbb 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -79,6 +79,14 @@ repeat: f2fs_put_page(page, 1); goto repeat; } + + /* + * if there is any IO error when accessing device, make our filesystem + * readonly and make sure do not write checkpoint with non-uptodate + * meta page.
+ */ + if (unlikely(!PageUptodate(page))) + f2fs_stop_checkpoint(sbi); out: return page; } -- cgit v1.2.3 From 7a04f64d4d5367ade827d75388d66054b535e201 Mon Sep 17 00:00:00 2001 From: Liu Xue Date: Mon, 27 Jul 2015 10:17:59 +0000 Subject: f2fs: unify f2fs_bug_on when check blocks and segment Replace BUG_ON with f2fs_bug_on to deal with block and segment validity check failed. Signed-off-by: Xue Liu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.h | 45 ++++++++------------------------------------- 1 file changed, 8 insertions(+), 37 deletions(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 79e7b879a753..230f9cd9fa2a 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -555,16 +555,15 @@ static inline unsigned short curseg_blkoff(struct f2fs_sb_info *sbi, int type) return curseg->next_blkoff; } -#ifdef CONFIG_F2FS_CHECK_FS static inline void check_seg_range(struct f2fs_sb_info *sbi, unsigned int segno) { - BUG_ON(segno > TOTAL_SEGS(sbi) - 1); + f2fs_bug_on(sbi, segno > TOTAL_SEGS(sbi) - 1); } static inline void verify_block_addr(struct f2fs_sb_info *sbi, block_t blk_addr) { - BUG_ON(blk_addr < SEG0_BLKADDR(sbi)); - BUG_ON(blk_addr >= MAX_BLKADDR(sbi)); + f2fs_bug_on(sbi, blk_addr < SEG0_BLKADDR(sbi) + || blk_addr >= MAX_BLKADDR(sbi)); } /* @@ -577,12 +576,11 @@ static inline void check_block_count(struct f2fs_sb_info *sbi, int valid_blocks = 0; int cur_pos = 0, next_pos; - /* check segment usage */ - BUG_ON(GET_SIT_VBLOCKS(raw_sit) > sbi->blocks_per_seg); - - /* check boundary of a given segment number */ - BUG_ON(segno > TOTAL_SEGS(sbi) - 1); + /* check segment usage, and check boundary of a given segment number */ + f2fs_bug_on(sbi, GET_SIT_VBLOCKS(raw_sit) > sbi->blocks_per_seg + || segno > TOTAL_SEGS(sbi) - 1); +#ifdef CONFIG_F2FS_CHECK_FS /* check bitmap with valid block count */ do { if (is_valid) { @@ -598,35 +596,8 @@ static inline void check_block_count(struct f2fs_sb_info *sbi, is_valid = !is_valid; } while (cur_pos < 
sbi->blocks_per_seg); BUG_ON(GET_SIT_VBLOCKS(raw_sit) != valid_blocks); -} -#else -static inline void check_seg_range(struct f2fs_sb_info *sbi, unsigned int segno) -{ - if (segno > TOTAL_SEGS(sbi) - 1) - set_sbi_flag(sbi, SBI_NEED_FSCK); -} - -static inline void verify_block_addr(struct f2fs_sb_info *sbi, block_t blk_addr) -{ - if (blk_addr < SEG0_BLKADDR(sbi) || blk_addr >= MAX_BLKADDR(sbi)) - set_sbi_flag(sbi, SBI_NEED_FSCK); -} - -/* - * Summary block is always treated as an invalid block - */ -static inline void check_block_count(struct f2fs_sb_info *sbi, - int segno, struct f2fs_sit_entry *raw_sit) -{ - /* check segment usage */ - if (GET_SIT_VBLOCKS(raw_sit) > sbi->blocks_per_seg) - set_sbi_flag(sbi, SBI_NEED_FSCK); - - /* check boundary of a given segment number */ - if (segno > TOTAL_SEGS(sbi) - 1) - set_sbi_flag(sbi, SBI_NEED_FSCK); -} #endif +} static inline pgoff_t current_sit_addr(struct f2fs_sb_info *sbi, unsigned int start) -- cgit v1.2.3 From 470f00e9686f0b338a457568229fe7b7d44b8e6a Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 14 Jul 2015 18:14:06 +0800 Subject: f2fs: fix to release inode page correctly In the following call path, we will pass a locked and referenced ipage pointer to get_new_data_page: - init_inode_metadata - make_empty_dir - get_new_data_page There are two exit paths in get_new_data_page when an error occurs: 1) grab_cache_page fails, ipage will not be released; 2) f2fs_reserve_block fails, ipage will be released in callee. So, the error handling in get_new_data_page is not consistent. For f2fs_reserve_block, it's not very easy to change the rule of error handling, since it's already complicated. Here we decide to choose an easy way to fix this issue: if any error occurs in get_new_data_page, we will ensure ipage is released in this function. The same issue is in f2fs_convert_inline_dir, fix that too.
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 11 +++++++++-- fs/f2fs/inline.c | 13 ++++++++++--- 2 files changed, 19 insertions(+), 5 deletions(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 2692848e7f75..f8f93db437ce 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -388,7 +388,8 @@ repeat: * * Also, caller should grab and release a rwsem by calling f2fs_lock_op() and * f2fs_unlock_op(). - * Note that, ipage is set only by make_empty_dir. + * Note that, ipage is set only by make_empty_dir, and if any error occur, + * ipage should be released by this function. */ struct page *get_new_data_page(struct inode *inode, struct page *ipage, pgoff_t index, bool new_i_size) @@ -399,8 +400,14 @@ struct page *get_new_data_page(struct inode *inode, int err; repeat: page = grab_cache_page(mapping, index); - if (!page) + if (!page) { + /* + * before exiting, we should make sure ipage will be released + * if any error occur. + */ + f2fs_put_page(ipage, 1); return ERR_PTR(-ENOMEM); + } set_new_dnode(&dn, inode, ipage, NULL, 0); err = f2fs_reserve_block(&dn, index); diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index a13ffcc32992..79d18d5c1fae 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -360,6 +360,10 @@ int make_empty_inline_dir(struct inode *inode, struct inode *parent, return 0; } +/* + * NOTE: ipage is grabbed by caller, but if any error occurs, we should + * release ipage in this function. 
+ */ static int f2fs_convert_inline_dir(struct inode *dir, struct page *ipage, struct f2fs_inline_dentry *inline_dentry) { @@ -369,8 +373,10 @@ static int f2fs_convert_inline_dir(struct inode *dir, struct page *ipage, int err; page = grab_cache_page(dir->i_mapping, 0); - if (!page) + if (!page) { + f2fs_put_page(ipage, 1); return -ENOMEM; + } set_new_dnode(&dn, dir, ipage, NULL, 0); err = f2fs_reserve_block(&dn, 0); @@ -434,8 +440,9 @@ int f2fs_add_inline_entry(struct inode *dir, const struct qstr *name, slots, NR_INLINE_DENTRY); if (bit_pos >= NR_INLINE_DENTRY) { err = f2fs_convert_inline_dir(dir, ipage, dentry_blk); - if (!err) - err = -EAGAIN; + if (err) + return err; + err = -EAGAIN; goto out; } -- cgit v1.2.3 From e90c2d2850d9d034e814a328725a4b15878f0357 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 28 Jul 2015 18:36:47 +0800 Subject: f2fs: invalidate temporary meta page To avoid meeting garbage data in the next free node block at the end of the warm node chain when doing recovery, we will try to zero out that invalid block. If the device does not support discard, our way of zeroing out the block is: grabbing a temporary zeroed page in meta inode, then issuing a write request with this page. But we forget to release that temporary page, so our memory usage will increase without gaining any hit ratio benefit, so it's better to free it to save memory.
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 13 ++++++++++++- fs/f2fs/f2fs.h | 2 +- fs/f2fs/recovery.c | 11 ++++++++++- fs/f2fs/segment.c | 9 ++++++--- 4 files changed, 29 insertions(+), 6 deletions(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 9c1acf69bfbb..c3111769d382 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -896,12 +896,15 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) __u32 crc32 = 0; int i; int cp_payload_blks = __cp_payload(sbi); + block_t discard_blk = NEXT_FREE_BLKADDR(sbi, curseg); + bool invalidate = false; /* * This avoids to conduct wrong roll-forward operations and uses * metapages, so should be called prior to sync_meta_pages below. */ - discard_next_dnode(sbi, NEXT_FREE_BLKADDR(sbi, curseg)); + if (discard_next_dnode(sbi, discard_blk)) + invalidate = true; /* Flush all the NAT/SIT pages */ while (get_pages(sbi, F2FS_DIRTY_META)) { @@ -1030,6 +1033,14 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* wait for previous submitted meta pages writeback */ wait_on_all_pages_writeback(sbi); + /* + * invalidate meta page which is used temporarily for zeroing out + * block at the end of warm node chain. 
+ */ + if (invalidate) + invalidate_mapping_pages(META_MAPPING(sbi), discard_blk, + discard_blk); + release_dirty_inode(sbi); if (unlikely(f2fs_cp_error(sbi))) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 58b05b541a4e..34a524d007ec 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1707,7 +1707,7 @@ void invalidate_blocks(struct f2fs_sb_info *, block_t); void refresh_sit_entry(struct f2fs_sb_info *, block_t, block_t); void clear_prefree_segments(struct f2fs_sb_info *, struct cp_control *); void release_discard_addrs(struct f2fs_sb_info *); -void discard_next_dnode(struct f2fs_sb_info *, block_t); +bool discard_next_dnode(struct f2fs_sb_info *, block_t); int npages_for_summary_flush(struct f2fs_sb_info *, bool); void allocate_new_segments(struct f2fs_sb_info *); int f2fs_trim_fs(struct f2fs_sb_info *, struct fstrim_range *); diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 24a8c1d4f45f..07a36e413ace 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -561,11 +561,20 @@ out: clear_sbi_flag(sbi, SBI_POR_DOING); if (err) { - discard_next_dnode(sbi, blkaddr); + bool invalidate = false; + + if (discard_next_dnode(sbi, blkaddr)) + invalidate = true; /* Flush all the NAT/SIT pages */ while (get_pages(sbi, F2FS_DIRTY_META)) sync_meta_pages(sbi, META, LONG_MAX); + + /* invalidate temporary meta page */ + if (invalidate) + invalidate_mapping_pages(META_MAPPING(sbi), + blkaddr, blkaddr); + set_ckpt_flags(sbi->ckpt, CP_ERROR_FLAG); mutex_unlock(&sbi->cp_mutex); } else if (need_writecp) { diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 509a2c4bb7d3..1f1200487c44 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -514,7 +514,7 @@ static int f2fs_issue_discard(struct f2fs_sb_info *sbi, return blkdev_issue_discard(sbi->sb->s_bdev, start, len, GFP_NOFS, 0); } -void discard_next_dnode(struct f2fs_sb_info *sbi, block_t blkaddr) +bool discard_next_dnode(struct f2fs_sb_info *sbi, block_t blkaddr) { int err = -ENOTSUPP; @@ -524,13 +524,16 @@ void 
discard_next_dnode(struct f2fs_sb_info *sbi, block_t blkaddr) unsigned int offset = GET_BLKOFF_FROM_SEG0(sbi, blkaddr); if (f2fs_test_bit(offset, se->discard_map)) - return; + return false; err = f2fs_issue_discard(sbi, blkaddr, 1); } - if (err) + if (err) { update_meta_page(sbi, NULL, blkaddr); + return true; + } + return false; } static void __add_discard_entry(struct f2fs_sb_info *sbi, -- cgit v1.2.3 From 759af1c9c16fec5323111b799ce25a3d8864df7e Mon Sep 17 00:00:00 2001 From: Fan Li Date: Wed, 5 Aug 2015 15:52:16 +0800 Subject: f2fs: use extent cache to optimize f2fs_reserve_block In some cases, we only need the block address when we call f2fs_reserve_block, other fields of struct dnode_of_data aren't necessary. We can try extent cache first for such cases in order to speed up the process. Signed-off-by: Fan li Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 16 +++++++++++++++- fs/f2fs/f2fs.h | 1 + 2 files changed, 16 insertions(+), 1 deletion(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index f8f93db437ce..4fabdd47490a 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -263,6 +263,19 @@ int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index) return err; } +int f2fs_get_block(struct dnode_of_data *dn, pgoff_t index) +{ + struct extent_info ei; + struct inode *inode = dn->inode; + + if (f2fs_lookup_extent_cache(inode, index, &ei)) { + dn->data_blkaddr = ei.blk + index - ei.fofs; + return 0; + } + + return f2fs_reserve_block(dn, index); +} + struct page *get_read_data_page(struct inode *inode, pgoff_t index, int rw) { struct address_space *mapping = inode->i_mapping; @@ -1383,7 +1396,8 @@ repeat: if (err) goto put_fail; } - err = f2fs_reserve_block(&dn, index); + + err = f2fs_get_block(&dn, index); if (err) goto put_fail; put_next: diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 34a524d007ec..09cb365a07cc 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1768,6 +1768,7 @@ int f2fs_submit_page_bio(struct f2fs_io_info *); 
void f2fs_submit_page_mbio(struct f2fs_io_info *); void set_data_blkaddr(struct dnode_of_data *); int reserve_new_block(struct dnode_of_data *); +int f2fs_get_block(struct dnode_of_data *, pgoff_t); int f2fs_reserve_block(struct dnode_of_data *, pgoff_t); struct page *get_read_data_page(struct inode *, pgoff_t, int); struct page *find_data_page(struct inode *, pgoff_t); -- cgit v1.2.3 From 12a8343e99a8af50b2a1cd8da72d34b6e860da0f Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 5 Aug 2015 17:23:54 +0800 Subject: f2fs: recover invalid/reserved block address for fsynced file When testing with generic/101 in xfstests, the following error message is output: --- tests/generic/101.out +++ results//generic/101.out.bad @@ -10,10 +10,14 @@ File foo content after log replay: 0000000 aa aa aa aa aa aa aa aa aa aa aa aa aa aa aa aa * -0200000 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 +0200000 bb bb bb bb bb bb bb bb bb bb bb bb bb bb bb bb * 0372000 ... (Run 'diff -u tests/generic/101.out results/generic/101.out.bad' to see the entire diff) The test flow is like below: 1. pwrite foo -S 0xaa 0 64K 2. pwrite foo -S 0xbb 64K 61K 3. sync 4. truncate foo 64K 5. truncate foo 125K 6. fsync foo 7. flakey drop writes 8. umount After this test, we expect the recovered file to have its first 64k of data filled with value 0xaa and the next 61k of data filled with value 0x00, because we have fsynced it before dropping writes in dm. In f2fs, during recovery, we only recover the valid block addresses in a direct node page if it is marked as a fsynced dnode, but block addresses which mean invalid/reserved (with value NULL_ADDR/NEW_ADDR) are not recovered. So, the recovered file shows the incorrect data 0xbb in the range [64k, 125k]. In this patch, we fix this by also recovering invalid/reserved block addresses during the recovery flow.
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/recovery.c | 28 ++++++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 07a36e413ace..d2ef0c9f53e7 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -399,14 +399,35 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, f2fs_bug_on(sbi, ni.ino != ino_of_node(page)); f2fs_bug_on(sbi, ofs_of_node(dn.node_page) != ofs_of_node(page)); - for (; start < end; start++) { + for (; start < end; start++, dn.ofs_in_node++) { block_t src, dest; src = datablock_addr(dn.node_page, dn.ofs_in_node); dest = datablock_addr(page, dn.ofs_in_node); - if (src != dest && dest != NEW_ADDR && dest != NULL_ADDR && - is_valid_blkaddr(sbi, dest, META_POR)) { + /* skip recovering if dest is the same as src */ + if (src == dest) + continue; + + /* dest is invalid, just invalidate src block */ + if (dest == NULL_ADDR) { + truncate_data_blocks_range(&dn, 1); + continue; + } + + /* + * dest is reserved block, invalidate src block + * and then reserve one new block in dnode page. 
+ */ + if (dest == NEW_ADDR) { + truncate_data_blocks_range(&dn, 1); + err = reserve_new_block(&dn); + f2fs_bug_on(sbi, err); + continue; + } + + /* dest is valid block, try to recover from src to dest */ + if (is_valid_blkaddr(sbi, dest, META_POR)) { if (src == NULL_ADDR) { err = reserve_new_block(&dn); @@ -424,7 +445,6 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, ni.version, false); recovered++; } - dn.ofs_in_node++; } if (IS_INODE(dn.node_page)) -- cgit v1.2.3 From 6394328ab8a2ab6b127ae85f716943d92595878d Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 7 Aug 2015 18:36:06 +0800 Subject: f2fs: report error of fill_zero fill_zero can fail due to a lot of reason, but previously we do not handle its return value, so its callers such as punch_hole/f2fs_zero_range may report success, but actually can fail because of error occurs inside fill_zero. This patch fixes to report correct return value of fill_zero. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 56 ++++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 38 insertions(+), 18 deletions(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index be69a01060a6..016ed3ba2ca4 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -695,14 +695,14 @@ const struct inode_operations f2fs_file_inode_operations = { .fiemap = f2fs_fiemap, }; -static void fill_zero(struct inode *inode, pgoff_t index, +static int fill_zero(struct inode *inode, pgoff_t index, loff_t start, loff_t len) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct page *page; if (!len) - return; + return 0; f2fs_balance_fs(sbi); @@ -710,12 +710,14 @@ static void fill_zero(struct inode *inode, pgoff_t index, page = get_new_data_page(inode, NULL, index, false); f2fs_unlock_op(sbi); - if (!IS_ERR(page)) { - f2fs_wait_on_page_writeback(page, DATA); - zero_user(page, start, len); - set_page_dirty(page); - f2fs_put_page(page, 1); - } + if (IS_ERR(page)) + return 
PTR_ERR(page); + + f2fs_wait_on_page_writeback(page, DATA); + zero_user(page, start, len); + set_page_dirty(page); + f2fs_put_page(page, 1); + return 0; } int truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end) @@ -763,14 +765,22 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len) off_end = (offset + len) & (PAGE_CACHE_SIZE - 1); if (pg_start == pg_end) { - fill_zero(inode, pg_start, off_start, + ret = fill_zero(inode, pg_start, off_start, off_end - off_start); + if (ret) + return ret; } else { - if (off_start) - fill_zero(inode, pg_start++, off_start, - PAGE_CACHE_SIZE - off_start); - if (off_end) - fill_zero(inode, pg_end, 0, off_end); + if (off_start) { + ret = fill_zero(inode, pg_start++, off_start, + PAGE_CACHE_SIZE - off_start); + if (ret) + return ret; + } + if (off_end) { + ret = fill_zero(inode, pg_end, 0, off_end); + if (ret) + return ret; + } if (pg_start < pg_end) { struct address_space *mapping = inode->i_mapping; @@ -961,14 +971,21 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, off_end = (offset + len) & (PAGE_CACHE_SIZE - 1); if (pg_start == pg_end) { - fill_zero(inode, pg_start, off_start, off_end - off_start); + ret = fill_zero(inode, pg_start, off_start, + off_end - off_start); + if (ret) + return ret; + if (offset + len > new_size) new_size = offset + len; new_size = max_t(loff_t, new_size, offset + len); } else { if (off_start) { - fill_zero(inode, pg_start++, off_start, - PAGE_CACHE_SIZE - off_start); + ret = fill_zero(inode, pg_start++, off_start, + PAGE_CACHE_SIZE - off_start); + if (ret) + return ret; + new_size = max_t(loff_t, new_size, pg_start << PAGE_CACHE_SHIFT); } @@ -1010,7 +1027,10 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, } if (off_end) { - fill_zero(inode, pg_end, 0, off_end); + ret = fill_zero(inode, pg_end, 0, off_end); + if (ret) + goto out; + new_size = max_t(loff_t, new_size, offset + len); } } -- cgit v1.2.3 From 
c15e8599ffe1b4f866691424d07037c467c23a2f Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 7 Aug 2015 18:39:32 +0800 Subject: f2fs: report EINVAL for unalignment direct IO We ran the ltp testcase with f2fs and obtained a TFAIL in diotest4; the detailed result is as follows: dio04 <<>> tag=dio04 stime=1432278894 cmdline="diotest4" contacts="" analysis=exit <<>> diotest4 1 TPASS : Negative Offset diotest4 2 TPASS : removed diotest4 3 TFAIL : diotest4.c:129: write allows odd count.returns 1: Success diotest4 4 TFAIL : diotest4.c:183: Odd count of read and write diotest4 5 TPASS : Read beyond the file size ...... the result of ext4 with same environment: dio04 <<>> tag=dio04 stime=1432259643 cmdline="diotest4" contacts="" analysis=exit <<>> diotest4 1 TPASS : Negative Offset diotest4 2 TPASS : removed diotest4 3 TPASS : Odd count of read and write diotest4 4 TPASS : Read beyond the file size ...... The reason is that when triggering DIO in f2fs, we return zero in ->direct_IO if the writer's buffer offset, file offset and transfer size are not aligned to the block size of the filesystem, resulting in a fallback to buffered write instead of returning -EINVAL. This patch fixes that problem by returning the correct error number for the above case, and by removing the judgement condition in check_direct_IO to make sure the verification is enabled for direct readers too. Besides, Jaegeuk Kim pointed out that there are exceptional cases where direct-io should always fall back to buffered write, such as dio in an encrypted file.
Signed-off-by: Yunlei He [Chao Yu make small change and add detail description in commit message] Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 4fabdd47490a..7ea8eda8f137 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1494,9 +1494,6 @@ static int check_direct_IO(struct inode *inode, struct iov_iter *iter, { unsigned blocksize_mask = inode->i_sb->s_blocksize - 1; - if (iov_iter_rw(iter) == READ) - return 0; - if (offset & blocksize_mask) return -EINVAL; @@ -1525,8 +1522,9 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) return 0; - if (check_direct_IO(inode, iter, offset)) - return 0; + err = check_direct_IO(inode, iter, offset); + if (err) + return err; trace_f2fs_direct_IO_enter(inode, offset, count, iov_iter_rw(iter)); -- cgit v1.2.3 From decd36b6c43a1051bab97571cf4c0ec8450268b0 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 7 Aug 2015 18:42:09 +0800 Subject: f2fs: remove inmem radix tree Previously, we use radix tree to index all registered page entries for atomic file, but now we only use radix tree to see whether current page is indexed or not, since the other user of radix tree is gone in commit 042b7816aaeb ("f2fs: remove unnecessary call to invalidate inmemory pages"). So in this patch, we try to use one more efficient way: Introducing a macro ATOMIC_WRITTEN_PAGE, and setting it as page private value to indicate page indexing status. By using this way, we can save memory and lookup time. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 20 ++++++++++++++++++-- fs/f2fs/f2fs.h | 1 - fs/f2fs/segment.c | 25 +++++++++---------------- fs/f2fs/segment.h | 9 +++++++++ fs/f2fs/super.c | 1 - 5 files changed, 36 insertions(+), 20 deletions(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 7ea8eda8f137..cad9ebe45692 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1558,6 +1558,11 @@ void f2fs_invalidate_page(struct page *page, unsigned int offset, else inode_dec_dirty_pages(inode); } + + /* This is atomic written page, keep Private */ + if (IS_ATOMIC_WRITTEN_PAGE(page)) + return; + ClearPagePrivate(page); } @@ -1567,6 +1572,10 @@ int f2fs_release_page(struct page *page, gfp_t wait) if (PageDirty(page)) return 0; + /* This is atomic written page, keep Private */ + if (IS_ATOMIC_WRITTEN_PAGE(page)) + return 0; + ClearPagePrivate(page); return 1; } @@ -1581,8 +1590,15 @@ static int f2fs_set_data_page_dirty(struct page *page) SetPageUptodate(page); if (f2fs_is_atomic_file(inode)) { - register_inmem_page(inode, page); - return 1; + if (!IS_ATOMIC_WRITTEN_PAGE(page)) { + register_inmem_page(inode, page); + return 1; + } + /* + * Previously, this page has been registered, we just + * return here. 
+ */ + return 0; } if (!PageDirty(page)) { diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 09cb365a07cc..38847942edeb 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -424,7 +424,6 @@ struct f2fs_inode_info { unsigned long long xattr_ver; /* cp version of xattr modification */ struct inode_entry *dirty_dir; /* the pointer of dirty dir */ - struct radix_tree_root inmem_root; /* radix tree for inmem pages */ struct list_head inmem_pages; /* inmemory pages managed by f2fs */ struct mutex inmem_lock; /* lock for inmemory pages */ diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 1f1200487c44..7d53cb44c617 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -197,28 +197,20 @@ void register_inmem_page(struct inode *inode, struct page *page) { struct f2fs_inode_info *fi = F2FS_I(inode); struct inmem_pages *new; - int err; - SetPagePrivate(page); f2fs_trace_pid(page); + set_page_private(page, (unsigned long)ATOMIC_WRITTEN_PAGE); + SetPagePrivate(page); + new = f2fs_kmem_cache_alloc(inmem_entry_slab, GFP_NOFS); /* add atomic page indices to the list */ new->page = page; INIT_LIST_HEAD(&new->list); -retry: + /* increase reference count with clean state */ mutex_lock(&fi->inmem_lock); - err = radix_tree_insert(&fi->inmem_root, page->index, new); - if (err == -EEXIST) { - mutex_unlock(&fi->inmem_lock); - kmem_cache_free(inmem_entry_slab, new); - return; - } else if (err) { - mutex_unlock(&fi->inmem_lock); - goto retry; - } get_page(page); list_add_tail(&new->list, &fi->inmem_pages); inc_page_count(F2FS_I_SB(inode), F2FS_INMEM_PAGES); @@ -255,8 +247,8 @@ int commit_inmem_pages(struct inode *inode, bool abort) mutex_lock(&fi->inmem_lock); list_for_each_entry_safe(cur, tmp, &fi->inmem_pages, list) { + lock_page(cur->page); if (!abort) { - lock_page(cur->page); if (cur->page->mapping == inode->i_mapping) { set_page_dirty(cur->page); f2fs_wait_on_page_writeback(cur->page, DATA); @@ -271,12 +263,13 @@ int commit_inmem_pages(struct inode *inode, bool abort) break; 
} } - f2fs_put_page(cur->page, 1); } else { trace_f2fs_commit_inmem_page(cur->page, INMEM_DROP); - put_page(cur->page); } - radix_tree_delete(&fi->inmem_root, cur->page->index); + set_page_private(cur->page, 0); + ClearPagePrivate(cur->page); + f2fs_put_page(cur->page, 1); + list_del(&cur->list); kmem_cache_free(inmem_entry_slab, cur); dec_page_count(F2FS_I_SB(inode), F2FS_INMEM_PAGES); diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 230f9cd9fa2a..d0bd952b7065 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -177,6 +177,15 @@ struct segment_allocation { void (*allocate_segment)(struct f2fs_sb_info *, int, bool); }; +/* + * this value is set in page as a private data which indicate that + * the page is atomically written, and it is in inmem_pages list. + */ +#define ATOMIC_WRITTEN_PAGE 0x0000ffff + +#define IS_ATOMIC_WRITTEN_PAGE(page) \ + (page_private(page) == (unsigned long)ATOMIC_WRITTEN_PAGE) + struct inmem_pages { struct list_head list; struct page *page; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 12eb69dd38af..a79b6b5a4eeb 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -423,7 +423,6 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) fi->i_current_depth = 1; fi->i_advise = 0; init_rwsem(&fi->i_sem); - INIT_RADIX_TREE(&fi->inmem_root, GFP_NOFS); INIT_LIST_HEAD(&fi->inmem_pages); mutex_init(&fi->inmem_lock); -- cgit v1.2.3 From 47e70ca46f9074efe6573263c0de5bef0af829de Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 11 Aug 2015 10:17:27 -0700 Subject: f2fs: do not assign a new segment for dio under space shortage If there is not enough free segment, we should not assign a new segment explicitly. Otherwise, we can run out of free segment. 
Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 7d53cb44c617..bf1605dbce93 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1225,7 +1225,8 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, mutex_lock(&sit_i->sentry_lock); /* direct_io'ed data is aligned to the segment for better performance */ - if (direct_io && curseg->next_blkoff) + if (direct_io && curseg->next_blkoff && + !has_not_enough_free_secs(sbi, 0)) __allocate_new_segments(sbi, type); *new_blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); -- cgit v1.2.3 From 8c14bfadeac2a01b305ef4434907295b81b58db2 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 7 Aug 2015 17:58:43 +0800 Subject: f2fs: handle error of f2fs_iget correctly In recover_orphan_inode, whenever f2fs_iget fail, we will make kernel panic, but it's not reasonable, because f2fs_iget can fail due to a lot of reasons including out of memory. So we change error handling method as below: a) when finding no entry for the orphan inode, bug_on for catching bugs; b) for other reasons, report it to caller. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 31 ++++++++++++++++++++++++------- fs/f2fs/f2fs.h | 2 +- fs/f2fs/super.c | 4 +++- 3 files changed, 28 insertions(+), 9 deletions(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index c3111769d382..0958c8399d8e 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -468,22 +468,34 @@ void remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) __remove_ino_entry(sbi, ino, ORPHAN_INO); } -static void recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) +static int recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) { - struct inode *inode = f2fs_iget(sbi->sb, ino); - f2fs_bug_on(sbi, IS_ERR(inode)); + struct inode *inode; + + inode = f2fs_iget(sbi->sb, ino); + if (IS_ERR(inode)) { + /* + * there should be a bug that we can't find the entry + * to orphan inode. + */ + f2fs_bug_on(sbi, PTR_ERR(inode) == -ENOENT); + return PTR_ERR(inode); + } + clear_nlink(inode); /* truncate all the data during iput */ iput(inode); + return 0; } -void recover_orphan_inodes(struct f2fs_sb_info *sbi) +int recover_orphan_inodes(struct f2fs_sb_info *sbi) { block_t start_blk, orphan_blocks, i, j; + int err; if (!is_set_ckpt_flags(F2FS_CKPT(sbi), CP_ORPHAN_PRESENT_FLAG)) - return; + return 0; set_sbi_flag(sbi, SBI_POR_DOING); @@ -499,14 +511,19 @@ void recover_orphan_inodes(struct f2fs_sb_info *sbi) orphan_blk = (struct f2fs_orphan_block *)page_address(page); for (j = 0; j < le32_to_cpu(orphan_blk->entry_count); j++) { nid_t ino = le32_to_cpu(orphan_blk->ino[j]); - recover_orphan_inode(sbi, ino); + err = recover_orphan_inode(sbi, ino); + if (err) { + f2fs_put_page(page, 1); + clear_sbi_flag(sbi, SBI_POR_DOING); + return err; + } } f2fs_put_page(page, 1); } /* clear Orphan Flag */ clear_ckpt_flags(F2FS_CKPT(sbi), CP_ORPHAN_PRESENT_FLAG); clear_sbi_flag(sbi, SBI_POR_DOING); - return; + return 0; } static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t 
start_blk) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 38847942edeb..cc07b1595a92 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1748,7 +1748,7 @@ int acquire_orphan_inode(struct f2fs_sb_info *); void release_orphan_inode(struct f2fs_sb_info *); void add_orphan_inode(struct f2fs_sb_info *, nid_t); void remove_orphan_inode(struct f2fs_sb_info *, nid_t); -void recover_orphan_inodes(struct f2fs_sb_info *); +int recover_orphan_inodes(struct f2fs_sb_info *); int get_valid_checkpoint(struct f2fs_sb_info *); void update_dirty_page(struct inode *, struct page *); void add_dirty_dir_inode(struct inode *); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index a79b6b5a4eeb..4db5cd9fb4b9 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1245,7 +1245,9 @@ try_onemore: f2fs_join_shrinker(sbi); /* if there are nt orphan nodes free them */ - recover_orphan_inodes(sbi); + err = recover_orphan_inodes(sbi); + if (err) + goto free_node_inode; /* read root inode and dentry */ root = f2fs_iget(sb, F2FS_ROOT_INO(sbi)); -- cgit v1.2.3 From 4c278394b0feb7aadc538be12ab0474b106a7255 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 11 Aug 2015 16:01:30 -0700 Subject: f2fs: avoid a build warning If F2FS_CHECK_FS is turned off, we can get a build warning for unused variable. Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.h | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index d0bd952b7065..b6e4ed15c698 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -581,15 +581,11 @@ static inline void verify_block_addr(struct f2fs_sb_info *sbi, block_t blk_addr) static inline void check_block_count(struct f2fs_sb_info *sbi, int segno, struct f2fs_sit_entry *raw_sit) { +#ifdef CONFIG_F2FS_CHECK_FS bool is_valid = test_bit_le(0, raw_sit->valid_map) ? 
true : false; int valid_blocks = 0; int cur_pos = 0, next_pos; - /* check segment usage, and check boundary of a given segment number */ - f2fs_bug_on(sbi, GET_SIT_VBLOCKS(raw_sit) > sbi->blocks_per_seg - || segno > TOTAL_SEGS(sbi) - 1); - -#ifdef CONFIG_F2FS_CHECK_FS /* check bitmap with valid block count */ do { if (is_valid) { @@ -606,6 +602,9 @@ static inline void check_block_count(struct f2fs_sb_info *sbi, } while (cur_pos < sbi->blocks_per_seg); BUG_ON(GET_SIT_VBLOCKS(raw_sit) != valid_blocks); #endif + /* check segment usage, and check boundary of a given segment number */ + f2fs_bug_on(sbi, GET_SIT_VBLOCKS(raw_sit) > sbi->blocks_per_seg + || segno > TOTAL_SEGS(sbi) - 1); } static inline pgoff_t current_sit_addr(struct f2fs_sb_info *sbi, -- cgit v1.2.3 From 315df8398e36360c0be62e6fdd3f2708fc3a2567 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 11 Aug 2015 12:45:39 -0700 Subject: f2fs: do not write any node pages related to orphan inodes We should not write node pages when deleting orphan inodes. In order to do that, we can easily set the POR_DOING flag earlier, before entering the orphan inode routine.
Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 4 ---- fs/f2fs/recovery.c | 4 +--- fs/f2fs/super.c | 6 +++++- 3 files changed, 6 insertions(+), 8 deletions(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 0958c8399d8e..890e4d4c39d7 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -497,8 +497,6 @@ int recover_orphan_inodes(struct f2fs_sb_info *sbi) if (!is_set_ckpt_flags(F2FS_CKPT(sbi), CP_ORPHAN_PRESENT_FLAG)) return 0; - set_sbi_flag(sbi, SBI_POR_DOING); - start_blk = __start_cp_addr(sbi) + 1 + __cp_payload(sbi); orphan_blocks = __start_sum_addr(sbi) - 1 - __cp_payload(sbi); @@ -514,7 +512,6 @@ int recover_orphan_inodes(struct f2fs_sb_info *sbi) err = recover_orphan_inode(sbi, ino); if (err) { f2fs_put_page(page, 1); - clear_sbi_flag(sbi, SBI_POR_DOING); return err; } } @@ -522,7 +519,6 @@ int recover_orphan_inodes(struct f2fs_sb_info *sbi) } /* clear Orphan Flag */ clear_ckpt_flags(F2FS_CKPT(sbi), CP_ORPHAN_PRESENT_FLAG); - clear_sbi_flag(sbi, SBI_POR_DOING); return 0; } diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index d2ef0c9f53e7..faec2ca004b9 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -545,14 +545,12 @@ int recover_fsync_data(struct f2fs_sb_info *sbi) INIT_LIST_HEAD(&inode_list); - /* step #1: find fsynced inode numbers */ - set_sbi_flag(sbi, SBI_POR_DOING); - /* prevent checkpoint */ mutex_lock(&sbi->cp_mutex); blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); + /* step #1: find fsynced inode numbers */ err = find_fsync_dnodes(sbi, &inode_list); if (err) goto out; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 4db5cd9fb4b9..cfe3f9579934 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1165,7 +1165,9 @@ try_onemore: mutex_init(&sbi->writepages); mutex_init(&sbi->cp_mutex); init_rwsem(&sbi->node_write); - clear_sbi_flag(sbi, SBI_POR_DOING); + + /* disallow all the data/node/meta page writes */ + set_sbi_flag(sbi, SBI_POR_DOING); 
spin_lock_init(&sbi->stat_lock); init_rwsem(&sbi->read_io.io_rwsem); @@ -1309,6 +1311,8 @@ try_onemore: goto free_kobj; } } + /* recover_fsync_data() cleared this already */ + clear_sbi_flag(sbi, SBI_POR_DOING); /* * If filesystem is not mounted as read-only then -- cgit v1.2.3 From 206e61be29624499af46546076e835da93e6bde5 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 12 Aug 2015 17:48:21 +0800 Subject: f2fs: avoid clear valid page In f2fs_delete_entry, if the last dirent is removed from the dentry page, we will try to punch that page since it has no valid data in it. But truncate_hole, which is used for punching, could fail because of a memory shortage or an IO error; if that happens, we'd better skip clearing this valid dentry page. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/dir.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index a34ebd8312ab..8f15fc134040 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -718,8 +718,8 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, if (inode) f2fs_drop_nlink(dir, inode, NULL); - if (bit_pos == NR_DENTRY_IN_BLOCK) { - truncate_hole(dir, page->index, page->index + 1); + if (bit_pos == NR_DENTRY_IN_BLOCK && + !truncate_hole(dir, page->index, page->index + 1)) { clear_page_dirty_for_io(page); ClearPagePrivate(page); ClearPageUptodate(page); -- cgit v1.2.3 From 31696580bf4c042a0f7b06d855e04441488d18b1 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 28 Jul 2015 18:33:46 +0800 Subject: f2fs: shrink free_nids entries This patch introduces __count_free_nids/try_to_free_nids and registers them in slab shrinker for shrinking under memory pressure.
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 1 + fs/f2fs/node.c | 28 ++++++++++++++++++++++++++++ fs/f2fs/segment.c | 3 +++ fs/f2fs/shrinker.c | 14 ++++++++++++++ 4 files changed, 46 insertions(+) (limited to 'fs/f2fs') diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index cc07b1595a92..23bfc0ccaf10 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1681,6 +1681,7 @@ int sync_node_pages(struct f2fs_sb_info *, nid_t, struct writeback_control *); bool alloc_nid(struct f2fs_sb_info *, nid_t *); void alloc_nid_done(struct f2fs_sb_info *, nid_t); void alloc_nid_failed(struct f2fs_sb_info *, nid_t); +int try_to_free_nids(struct f2fs_sb_info *, int); void recover_inline_xattr(struct inode *, struct page *); void recover_xattr_data(struct inode *, struct page *, block_t); int recover_inode_page(struct f2fs_sb_info *, struct page *); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index ac9110788b17..6e10c2a08ec6 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1635,6 +1635,34 @@ void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid) kmem_cache_free(free_nid_slab, i); } +int try_to_free_nids(struct f2fs_sb_info *sbi, int nr_shrink) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + struct free_nid *i, *next; + int nr = nr_shrink; + + if (!mutex_trylock(&nm_i->build_lock)) + return 0; + + spin_lock(&nm_i->free_nid_list_lock); + list_for_each_entry_safe(i, next, &nm_i->free_nid_list, list) { + if (nr_shrink <= 0 || nm_i->fcnt <= NAT_ENTRY_PER_BLOCK) + break; + if (i->state == NID_ALLOC) + continue; + __del_from_free_nid_list(nm_i, i); + nm_i->fcnt--; + spin_unlock(&nm_i->free_nid_list_lock); + kmem_cache_free(free_nid_slab, i); + nr_shrink--; + spin_lock(&nm_i->free_nid_list_lock); + } + spin_unlock(&nm_i->free_nid_list_lock); + mutex_unlock(&nm_i->build_lock); + + return nr - nr_shrink; +} + void recover_inline_xattr(struct inode *inode, struct page *page) { void *src_addr, *dst_addr; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 
bf1605dbce93..1b4265639f07 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -310,6 +310,9 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi) if (!available_free_memory(sbi, NAT_ENTRIES)) try_to_free_nats(sbi, NAT_ENTRY_PER_BLOCK); + if (!available_free_memory(sbi, FREE_NIDS)) + try_to_free_nids(sbi, NAT_ENTRY_PER_BLOCK * FREE_NID_PAGES); + /* checkpoint is the only way to shrink partial cached entries */ if (!available_free_memory(sbi, NAT_ENTRIES) || excess_prefree_segs(sbi) || diff --git a/fs/f2fs/shrinker.c b/fs/f2fs/shrinker.c index 9aa4235cd304..da0d8e0b55a5 100644 --- a/fs/f2fs/shrinker.c +++ b/fs/f2fs/shrinker.c @@ -23,6 +23,13 @@ static unsigned long __count_nat_entries(struct f2fs_sb_info *sbi) return NM_I(sbi)->nat_cnt - NM_I(sbi)->dirty_nat_cnt; } +static unsigned long __count_free_nids(struct f2fs_sb_info *sbi) +{ + if (NM_I(sbi)->fcnt > NAT_ENTRY_PER_BLOCK) + return NM_I(sbi)->fcnt - NAT_ENTRY_PER_BLOCK; + return 0; +} + static unsigned long __count_extent_cache(struct f2fs_sb_info *sbi) { return sbi->total_ext_tree + atomic_read(&sbi->total_ext_node); @@ -53,6 +60,9 @@ unsigned long f2fs_shrink_count(struct shrinker *shrink, /* shrink clean nat cache entries */ count += __count_nat_entries(sbi); + /* count free nids cache entries */ + count += __count_free_nids(sbi); + spin_lock(&f2fs_list_lock); p = p->next; mutex_unlock(&sbi->umount_mutex); @@ -97,6 +107,10 @@ unsigned long f2fs_shrink_scan(struct shrinker *shrink, if (freed < nr) freed += try_to_free_nats(sbi, nr - freed); + /* shrink free nids cache entries */ + if (freed < nr) + freed += try_to_free_nids(sbi, nr - freed); + spin_lock(&f2fs_list_lock); p = p->next; list_move_tail(&sbi->s_list, &f2fs_list); -- cgit v1.2.3 From 798c1b16d1a6171587ff46c74ede8092e66f72f7 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 11 Aug 2015 21:59:49 -0700 Subject: f2fs: skip checkpoint if there is no dirty and prefree segments We should avoid needless checkpoints when there is no dirty and 
prefree segment. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index fcb263af58b3..81de28d8326f 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -792,7 +792,8 @@ static void do_garbage_collect(struct f2fs_sb_info *sbi, unsigned int segno, int f2fs_gc(struct f2fs_sb_info *sbi) { - unsigned int segno, i; + unsigned int segno = NULL_SEGNO; + unsigned int i; int gc_type = BG_GC; int nfree = 0; int ret = -1; @@ -811,10 +812,11 @@ gc_more: if (gc_type == BG_GC && has_not_enough_free_secs(sbi, nfree)) { gc_type = FG_GC; - write_checkpoint(sbi, &cpc); + if (__get_victim(sbi, &segno, gc_type) || prefree_segments(sbi)) + write_checkpoint(sbi, &cpc); } - if (!__get_victim(sbi, &segno, gc_type)) + if (segno == NULL_SEGNO && !__get_victim(sbi, &segno, gc_type)) goto stop; ret = 0; -- cgit v1.2.3 From a6db67f06fd9f6b1ddb11bcf4d7e8e8a86908d01 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 10 Aug 2015 15:01:12 -0700 Subject: f2fs: increase the number of max hard links This patch increases the number of maximum hard links for one file. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 23bfc0ccaf10..830848836da5 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -321,7 +321,7 @@ enum { */ }; -#define F2FS_LINK_MAX 32000 /* maximum link count per file */ +#define F2FS_LINK_MAX 0xffffffff /* maximum link count per file */ #define MAX_DIR_RA_PAGES 4 /* maximum ra pages of dir */ -- cgit v1.2.3 From 740432f835608d11b5386321ab5aa8f61e07fb27 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 14 Aug 2015 11:43:56 -0700 Subject: f2fs: handle failed bio allocation As the below comment of bio_alloc_bioset, f2fs can allocate multiple bios at the same time. 
So, we can't guarantee that bio is allocated all the time. " * When @bs is not NULL, if %__GFP_WAIT is set then bio_alloc will always be * able to allocate a bio. This is due to the mempool guarantees. To make this * work, callers must never allocate more than 1 bio at a time from this pool. * Callers that need to allocate more than 1 bio must always submit the * previously allocated bio for IO before attempting to allocate a new one. * Failure to do so can cause deadlocks under memory pressure. " Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 3 +-- fs/f2fs/f2fs.h | 15 +++++++++++++++ fs/f2fs/segment.c | 15 ++++++++++++--- 3 files changed, 28 insertions(+), 5 deletions(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index cad9ebe45692..726e58b76295 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -90,8 +90,7 @@ static struct bio *__bio_alloc(struct f2fs_sb_info *sbi, block_t blk_addr, { struct bio *bio; - /* No failure on bio allocation */ - bio = bio_alloc(GFP_NOIO, npages); + bio = f2fs_bio_alloc(npages); bio->bi_bdev = sbi->sb->s_bdev; bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(blk_addr); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 830848836da5..00591f725744 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -19,6 +19,7 @@ #include #include #include +#include #ifdef CONFIG_F2FS_CHECK_FS #define f2fs_bug_on(sbi, condition) BUG_ON(condition) @@ -1253,6 +1254,20 @@ retry: return entry; } +static inline struct bio *f2fs_bio_alloc(int npages) +{ + struct bio *bio; + + /* No failure on bio allocation */ +retry: + bio = bio_alloc(GFP_NOIO, npages); + if (!bio) { + cond_resched(); + goto retry; + } + return bio; +} + static inline void f2fs_radix_tree_insert(struct radix_tree_root *root, unsigned long index, void *item) { diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 1b4265639f07..6273e2cde93e 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -330,10 +330,12 @@ repeat: return 0; if (!llist_empty(&fcc->issue_list)) { 
- struct bio *bio = bio_alloc(GFP_NOIO, 0); + struct bio *bio; struct flush_cmd *cmd, *next; int ret; + bio = f2fs_bio_alloc(0); + fcc->dispatch_list = llist_del_all(&fcc->issue_list); fcc->dispatch_list = llist_reverse_order(fcc->dispatch_list); @@ -365,8 +367,15 @@ int f2fs_issue_flush(struct f2fs_sb_info *sbi) if (test_opt(sbi, NOBARRIER)) return 0; - if (!test_opt(sbi, FLUSH_MERGE)) - return blkdev_issue_flush(sbi->sb->s_bdev, GFP_KERNEL, NULL); + if (!test_opt(sbi, FLUSH_MERGE)) { + struct bio *bio = f2fs_bio_alloc(0); + int ret; + + bio->bi_bdev = sbi->sb->s_bdev; + ret = submit_bio_wait(WRITE_FLUSH, bio); + bio_put(bio); + return ret; + } init_completion(&cmd.wait); -- cgit v1.2.3 From 26d5859974bb817f7615be90199a8e82e3f0a0ed Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 14 Aug 2015 14:37:50 -0700 Subject: f2fs: avoid garbage collecting already moved node blocks If node blocks were already moved, we don't need to move them again. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'fs/f2fs') diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 81de28d8326f..0a5d573e2574 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -396,14 +396,18 @@ static void gc_node_segment(struct f2fs_sb_info *sbi, { bool initial = true; struct f2fs_summary *entry; + block_t start_addr; int off; + start_addr = START_BLOCK(sbi, segno); + next_step: entry = sum; for (off = 0; off < sbi->blocks_per_seg; off++, entry++) { nid_t nid = le32_to_cpu(entry->nid); struct page *node_page; + struct node_info ni; /* stop BG_GC if there is not enough free sections. 
*/ if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0)) @@ -426,6 +430,12 @@ next_step: continue; } + get_node_info(sbi, nid, &ni); + if (ni.blk_addr != start_addr + off) { + f2fs_put_page(node_page, 1); + continue; + } + /* set page dirty and write it */ if (gc_type == FG_GC) { f2fs_wait_on_page_writeback(node_page, NODE); -- cgit v1.2.3 From 268344664603706b6f156548f9d7482665222f87 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 14 Aug 2015 17:57:29 -0700 Subject: f2fs: reuse nids more aggressively If we can reuse nids as many as possible, we can mitigate producing obsolete node pages in the page cache. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs/f2fs') diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 6e10c2a08ec6..3cc32b8f8204 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -306,6 +306,10 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni, if (nat_get_blkaddr(e) != NEW_ADDR && new_blkaddr == NULL_ADDR) { unsigned char version = nat_get_version(e); nat_set_version(e, inc_node_version(version)); + + /* in order to reuse the nid */ + if (nm_i->next_scan_nid > ni->nid) + nm_i->next_scan_nid = ni->nid; } /* change address */ -- cgit v1.2.3 From 2286c0205d1478d4bece6e733cbaf15535fba09d Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Sat, 15 Aug 2015 21:51:05 -0700 Subject: f2fs: fix to cover lock_op for update_inode_page Previously, update_inode_page is not called under f2fs_lock_op. Instead we should call with f2fs_write_inode. 
Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 016ed3ba2ca4..7faafb5043e0 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -206,8 +206,8 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) } /* if the inode is dirty, let's recover all the time */ - if (!datasync && is_inode_flag_set(fi, FI_DIRTY_INODE)) { - update_inode_page(inode); + if (!datasync) { + f2fs_write_inode(inode, NULL); goto go_write; } -- cgit v1.2.3 From 5ee5293c3290a8e710d75977418f954e62c3dfdf Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Sat, 15 Aug 2015 22:06:08 -0700 Subject: f2fs: retry gc if one section is not successfully reclaimed If FG_GC failed to reclaim one section, let's retry with another section from the start, since we can get another good candidate. Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 46 ++++++++++++++++++++-------------------------- 1 file changed, 20 insertions(+), 26 deletions(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 0a5d573e2574..782b8e72c094 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -391,7 +391,7 @@ static int check_valid_map(struct f2fs_sb_info *sbi, * On validity, copy that node with cold status, otherwise (invalid node) * ignore that. */ -static void gc_node_segment(struct f2fs_sb_info *sbi, +static int gc_node_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, unsigned int segno, int gc_type) { bool initial = true; @@ -411,7 +411,7 @@ next_step: /* stop BG_GC if there is not enough free sections. */ if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0)) - return; + return 0; if (check_valid_map(sbi, segno, off) == 0) continue; @@ -461,13 +461,11 @@ next_step: }; sync_node_pages(sbi, 0, &wbc); - /* - * In the case of FG_GC, it'd be better to reclaim this victim - * completely. 
- */ - if (get_valid_blocks(sbi, segno, 1) != 0) - goto next_step; + /* return 1 only if FG_GC succefully reclaimed one */ + if (get_valid_blocks(sbi, segno, 1) == 0) + return 1; } + return 0; } /* @@ -649,7 +647,7 @@ out: * If the parent node is not valid or the data block address is different, * the victim data block is ignored. */ -static void gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, +static int gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, struct gc_inode_list *gc_list, unsigned int segno, int gc_type) { struct super_block *sb = sbi->sb; @@ -672,7 +670,7 @@ next_step: /* stop BG_GC if there is not enough free sections. */ if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0)) - return; + return 0; if (check_valid_map(sbi, segno, off) == 0) continue; @@ -737,15 +735,11 @@ next_step: if (gc_type == FG_GC) { f2fs_submit_merged_bio(sbi, DATA, WRITE); - /* - * In the case of FG_GC, it'd be better to reclaim this victim - * completely. - */ - if (get_valid_blocks(sbi, segno, 1) != 0) { - phase = 2; - goto next_step; - } + /* return 1 only if FG_GC succefully reclaimed one */ + if (get_valid_blocks(sbi, segno, 1) == 0) + return 1; } + return 0; } static int __get_victim(struct f2fs_sb_info *sbi, unsigned int *victim, @@ -761,12 +755,13 @@ static int __get_victim(struct f2fs_sb_info *sbi, unsigned int *victim, return ret; } -static void do_garbage_collect(struct f2fs_sb_info *sbi, unsigned int segno, +static int do_garbage_collect(struct f2fs_sb_info *sbi, unsigned int segno, struct gc_inode_list *gc_list, int gc_type) { struct page *sum_page; struct f2fs_summary_block *sum; struct blk_plug plug; + int nfree = 0; /* read segment summary of victim */ sum_page = get_sum_page(sbi, segno); @@ -786,10 +781,11 @@ static void do_garbage_collect(struct f2fs_sb_info *sbi, unsigned int segno, switch (GET_SUM_TYPE((&sum->footer))) { case SUM_TYPE_NODE: - gc_node_segment(sbi, sum->entries, segno, gc_type); + nfree = 
gc_node_segment(sbi, sum->entries, segno, gc_type); break; case SUM_TYPE_DATA: - gc_data_segment(sbi, sum->entries, gc_list, segno, gc_type); + nfree = gc_data_segment(sbi, sum->entries, gc_list, + segno, gc_type); break; } blk_finish_plug(&plug); @@ -798,6 +794,7 @@ static void do_garbage_collect(struct f2fs_sb_info *sbi, unsigned int segno, stat_inc_call_count(sbi->stat_info); f2fs_put_page(sum_page, 0); + return nfree; } int f2fs_gc(struct f2fs_sb_info *sbi) @@ -836,13 +833,10 @@ gc_more: META_SSA); for (i = 0; i < sbi->segs_per_sec; i++) - do_garbage_collect(sbi, segno + i, &gc_list, gc_type); + nfree += do_garbage_collect(sbi, segno + i, &gc_list, gc_type); - if (gc_type == FG_GC) { + if (gc_type == FG_GC) sbi->cur_victim_sec = NULL_SEGNO; - nfree++; - WARN_ON(get_valid_blocks(sbi, segno, sbi->segs_per_sec)); - } if (has_not_enough_free_secs(sbi, nfree)) goto gc_more; -- cgit v1.2.3 From a21c20f0c812925085204fced932ac95f2a76bf0 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Sun, 16 Aug 2015 12:38:15 -0700 Subject: f2fs: go out for insert_inode_locked failure We should not call unlock_new_inode when insert_inode_locked failed. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/namei.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 97e97c41b979..a680bf38e4f0 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -53,7 +53,7 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) if (err) { err = -EINVAL; nid_free = true; - goto out; + goto fail; } /* If the directory encrypted, then we should encrypt the inode. 
*/ @@ -75,9 +75,6 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) mark_inode_dirty(inode); return inode; -out: - clear_nlink(inode); - unlock_new_inode(inode); fail: trace_f2fs_new_inode(inode, err); make_bad_inode(inode); -- cgit v1.2.3 From 24928634f81b1592e83b37dcd89ed45c28f12feb Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Sun, 16 Aug 2015 13:04:50 -0700 Subject: f2fs: check the node block address of newly allocated nid This patch adds a routine which checks the block address of newly allocated nid. If an nid has already been allocated by another thread due to subtle data races, it will result in filesystem corruption. So, it needs to check whether its block address was already allocated or not prior to nid allocation, as a last chance. Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'fs/f2fs') diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 3cc32b8f8204..6bef5a2788b4 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1573,6 +1573,8 @@ retry: /* We should not use stale free nids created by build_free_nids */ if (nm_i->fcnt && !on_build_free_nids(nm_i)) { + struct node_info ni; + f2fs_bug_on(sbi, list_empty(&nm_i->free_nid_list)); list_for_each_entry(i, &nm_i->free_nid_list, list) if (i->state == NID_NEW) @@ -1583,6 +1585,13 @@ retry: i->state = NID_ALLOC; nm_i->fcnt--; spin_unlock(&nm_i->free_nid_list_lock); + + /* check nid is allocated already */ + get_node_info(sbi, *nid, &ni); + if (ni.blk_addr != NULL_ADDR) { + alloc_nid_done(sbi, *nid); + goto retry; + } return true; } spin_unlock(&nm_i->free_nid_list_lock); -- cgit v1.2.3 From 217940d4f0c4ec4f0852f7046fa419d0edf65c17 Mon Sep 17 00:00:00 2001 From: Junesung Lee Date: Tue, 18 Aug 2015 22:42:15 +0900 Subject: f2fs: fix typo Fix typo. 
Signed-off-by: Junesung Lee Signed-off-by: Jaegeuk Kim --- fs/f2fs/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig index c629762005bc..b0a9dc929f88 100644 --- a/fs/f2fs/Kconfig +++ b/fs/f2fs/Kconfig @@ -45,7 +45,7 @@ config F2FS_FS_POSIX_ACL default y help Posix Access Control Lists (ACLs) support permissions for users and - gourps beyond the owner/group/world scheme. + groups beyond the owner/group/world scheme. To learn more about Access Control Lists, visit the POSIX ACLs for Linux website . -- cgit v1.2.3 From f8b703da2c23f9bfda7299bd14e4f7201c2be3c8 Mon Sep 17 00:00:00 2001 From: Fan Li Date: Tue, 18 Aug 2015 17:13:13 +0800 Subject: f2fs: fix to update cached_en of extent tree properly In f2fs_lookup_extent_tree, et->cached_en was read and updated with only read lock held, it could cause __lookup_extent_tree within return entirely wrong extent_node, if other thread update et->cached_en just before __lookup_extent_tree return. However, there are two things about this patch that need to be noticed: 1. It does no good to arrange the order of concurrent read/write, the result would still be random in such case. 2. It's built on this assumption: the mix up of reads and writes on a single pointer would not make the pointer partially wrong at any time. Please let me know if I'm wrong, thx. 
Signed-off-by: Fan li Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/extent_cache.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index 32fae8ad5b7e..cea581353bc2 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -85,13 +85,13 @@ static struct extent_node *__lookup_extent_tree(struct extent_tree *et, unsigned int fofs) { struct rb_node *node = et->root.rb_node; - struct extent_node *en; + struct extent_node *en = et->cached_en; - if (et->cached_en) { - struct extent_info *cei = &et->cached_en->ei; + if (en) { + struct extent_info *cei = &en->ei; if (cei->fofs <= fofs && cei->fofs + cei->len > fofs) - return et->cached_en; + return en; } while (node) { -- cgit v1.2.3 From e2b4e2bc8865e03eecd49caa9713a2402a96bba9 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 19 Aug 2015 19:11:19 +0800 Subject: f2fs: fix incorrect mapping for bmap The test step is like below: 1. touch file 2. truncate -s $((1024*1024)) file 3. fallocate -o 0 -l $((1024*1024)) file 4. fibmap.f2fs file Our result of fibmap.f2fs showed below is not correct: file_pos start_blk end_blk blks 0 -937166132 -937166132 1 4096 -937166132 -937166132 1 8192 -937166132 -937166132 1 12288 -937166132 -937166132 1 16384 -937166132 -937166132 1 20480 -937166132 -937166132 1 ... 1040384 -937166132 -937166132 1 1044480 -937166132 -937166132 1 This is because f2fs_map_blocks will return with no error when meeting a hole or preallocated block, the caller __get_data_block will map the uninitialized variable value to bh->b_blocknr. Unfortunately generic_block_bmap will neither check the return value of get_data() nor check mapping info of buffer_head, result in returning the random block address. 
After fixing the issue, our result shows correctly: file_pos start_blk end_blk blks 0 0 0 256 Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 48 ++++++++++++++++++++++++++++++++++++------------ fs/f2fs/f2fs.h | 6 ++++++ 2 files changed, 42 insertions(+), 12 deletions(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 726e58b76295..73713bbd4646 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -561,7 +561,7 @@ out: * c. give the block addresses to blockdev */ static int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, - int create, bool fiemap) + int create, int flag) { unsigned int maxblocks = map->m_len; struct dnode_of_data dn; @@ -595,8 +595,19 @@ static int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, err = 0; goto unlock_out; } - if (dn.data_blkaddr == NEW_ADDR && !fiemap) - goto put_out; + if (dn.data_blkaddr == NEW_ADDR) { + if (flag == F2FS_GET_BLOCK_BMAP) { + err = -ENOENT; + goto put_out; + } else if (flag == F2FS_GET_BLOCK_READ || + flag == F2FS_GET_BLOCK_DIO) { + goto put_out; + } + /* + * if it is in fiemap call path (flag = F2FS_GET_BLOCK_FIEMAP), + * mark it as mapped and unwritten block. 
+ */ + } if (dn.data_blkaddr != NULL_ADDR) { map->m_flags = F2FS_MAP_MAPPED; @@ -611,6 +622,8 @@ static int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, map->m_flags = F2FS_MAP_NEW | F2FS_MAP_MAPPED; map->m_pblk = dn.data_blkaddr; } else { + if (flag == F2FS_GET_BLOCK_BMAP) + err = -ENOENT; goto put_out; } @@ -633,7 +646,9 @@ get_next: err = 0; goto unlock_out; } - if (dn.data_blkaddr == NEW_ADDR && !fiemap) + + if (dn.data_blkaddr == NEW_ADDR && + flag != F2FS_GET_BLOCK_FIEMAP) goto put_out; end_offset = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode)); @@ -675,7 +690,7 @@ out: } static int __get_data_block(struct inode *inode, sector_t iblock, - struct buffer_head *bh, int create, bool fiemap) + struct buffer_head *bh, int create, int flag) { struct f2fs_map_blocks map; int ret; @@ -683,7 +698,7 @@ static int __get_data_block(struct inode *inode, sector_t iblock, map.m_lblk = iblock; map.m_len = bh->b_size >> inode->i_blkbits; - ret = f2fs_map_blocks(inode, &map, create, fiemap); + ret = f2fs_map_blocks(inode, &map, create, flag); if (!ret) { map_bh(bh, inode->i_sb, map.m_pblk); bh->b_state = (bh->b_state & ~F2FS_MAP_FLAGS) | map.m_flags; @@ -693,15 +708,23 @@ static int __get_data_block(struct inode *inode, sector_t iblock, } static int get_data_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create, int flag) +{ + return __get_data_block(inode, iblock, bh_result, create, flag); +} + +static int get_data_block_dio(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { - return __get_data_block(inode, iblock, bh_result, create, false); + return __get_data_block(inode, iblock, bh_result, create, + F2FS_GET_BLOCK_DIO); } -static int get_data_block_fiemap(struct inode *inode, sector_t iblock, +static int get_data_block_bmap(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { - return __get_data_block(inode, iblock, bh_result, create, true); + return 
__get_data_block(inode, iblock, bh_result, create, + F2FS_GET_BLOCK_BMAP); } static inline sector_t logical_to_blk(struct inode *inode, loff_t offset) @@ -745,7 +768,8 @@ next: memset(&map_bh, 0, sizeof(struct buffer_head)); map_bh.b_size = len; - ret = get_data_block_fiemap(inode, start_blk, &map_bh, 0); + ret = get_data_block(inode, start_blk, &map_bh, 0, + F2FS_GET_BLOCK_FIEMAP); if (ret) goto out; @@ -1530,7 +1554,7 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, if (iov_iter_rw(iter) == WRITE) __allocate_data_blocks(inode, offset, count); - err = blockdev_direct_IO(iocb, inode, iter, offset, get_data_block); + err = blockdev_direct_IO(iocb, inode, iter, offset, get_data_block_dio); if (err < 0 && iov_iter_rw(iter) == WRITE) f2fs_write_failed(mapping, offset + count); @@ -1618,7 +1642,7 @@ static sector_t f2fs_bmap(struct address_space *mapping, sector_t block) if (err) return err; } - return generic_block_bmap(mapping, block, get_data_block); + return generic_block_bmap(mapping, block, get_data_block_bmap); } const struct address_space_operations f2fs_dblock_aops = { diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 00591f725744..51dfa8fcc505 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -375,6 +375,12 @@ struct f2fs_map_blocks { unsigned int m_flags; }; +/* for flag in get_data_block */ +#define F2FS_GET_BLOCK_READ 0 +#define F2FS_GET_BLOCK_DIO 1 +#define F2FS_GET_BLOCK_FIEMAP 2 +#define F2FS_GET_BLOCK_BMAP 3 + /* * i_advise uses FADVISE_XXX_BIT. We can add additional hints later. */ -- cgit v1.2.3 From 91c481fff92c705dd382f1f53c01e6b6b88507d0 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 19 Aug 2015 19:12:20 +0800 Subject: f2fs: add largest/cached stat in extent cache This patch adds to stat the hit count of largest/cached node for showing in debugfs. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/debug.c | 9 +++++++-- fs/f2fs/extent_cache.c | 14 +++++++++----- fs/f2fs/f2fs.h | 8 +++++++- 3 files changed, 23 insertions(+), 8 deletions(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index bc215fd6c402..1a1a4c67a9bf 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -33,6 +33,8 @@ static void update_general_status(struct f2fs_sb_info *sbi) int i; /* validation check of the segment numbers */ + si->hit_largest = atomic_read(&sbi->read_hit_largest); + si->hit_cached = atomic_read(&sbi->read_hit_cached); si->hit_ext = atomic_read(&sbi->read_hit_ext); si->total_ext = atomic_read(&sbi->total_hit_ext); si->ext_tree = sbi->total_ext_tree; @@ -279,8 +281,9 @@ static int stat_show(struct seq_file *s, void *v) si->bg_data_blks); seq_printf(s, " - node blocks : %d (%d)\n", si->node_blks, si->bg_node_blks); - seq_printf(s, "\nExtent Hit Ratio: %d / %d\n", - si->hit_ext, si->total_ext); + seq_printf(s, "\nExtent Hit Ratio: L1-1:%d L1-2:%d L2:%d / %d\n", + si->hit_largest, si->hit_cached, + si->hit_ext, si->total_ext); seq_printf(s, "\nExtent Tree Count: %d\n", si->ext_tree); seq_printf(s, "\nExtent Node Count: %d\n", si->ext_node); seq_puts(s, "\nBalancing F2FS Async:\n"); @@ -371,6 +374,8 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi) atomic_set(&sbi->total_hit_ext, 0); atomic_set(&sbi->read_hit_ext, 0); + atomic_set(&sbi->read_hit_largest, 0); + atomic_set(&sbi->read_hit_cached, 0); atomic_set(&sbi->inline_xattr, 0); atomic_set(&sbi->inline_inode, 0); diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index cea581353bc2..5cf217faed1f 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -81,8 +81,8 @@ static struct extent_tree *__grab_extent_tree(struct inode *inode) return et; } -static struct extent_node *__lookup_extent_tree(struct extent_tree *et, - unsigned int fofs) +static struct extent_node *__lookup_extent_tree(struct f2fs_sb_info *sbi, + struct 
extent_tree *et, unsigned int fofs) { struct rb_node *node = et->root.rb_node; struct extent_node *en = et->cached_en; @@ -90,8 +90,10 @@ static struct extent_node *__lookup_extent_tree(struct extent_tree *et, if (en) { struct extent_info *cei = &en->ei; - if (cei->fofs <= fofs && cei->fofs + cei->len > fofs) + if (cei->fofs <= fofs && cei->fofs + cei->len > fofs) { + stat_inc_cached_node_hit(sbi); return en; + } } while (node) { @@ -280,10 +282,11 @@ static bool f2fs_lookup_extent_tree(struct inode *inode, pgoff_t pgofs, *ei = et->largest; ret = true; stat_inc_read_hit(sbi); + stat_inc_largest_node_hit(sbi); goto out; } - en = __lookup_extent_tree(et, pgofs); + en = __lookup_extent_tree(sbi, et, pgofs); if (en) { *ei = en->ei; spin_lock(&sbi->extent_lock); @@ -313,7 +316,8 @@ out: * tree must stay unchanged between lookup and insertion. */ static struct extent_node *__lookup_extent_tree_ret(struct extent_tree *et, - unsigned int fofs, struct extent_node **prev_ex, + unsigned int fofs, + struct extent_node **prev_ex, struct extent_node **next_ex, struct rb_node ***insert_p, struct rb_node **insert_parent) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 51dfa8fcc505..de20387ae225 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -789,6 +789,8 @@ struct f2fs_sb_info { atomic_t inplace_count; /* # of inplace update */ atomic_t total_hit_ext; /* # of lookup extent cache */ atomic_t read_hit_ext; /* # of hit extent cache */ + atomic_t read_hit_largest; /* # of hit largest extent node */ + atomic_t read_hit_cached; /* # of hit cached extent node */ atomic_t inline_xattr; /* # of inline_xattr inodes */ atomic_t inline_inode; /* # of inline_data inodes */ atomic_t inline_dir; /* # of inline_dentry inodes */ @@ -1824,7 +1826,7 @@ struct f2fs_stat_info { struct f2fs_sb_info *sbi; int all_area_segs, sit_area_segs, nat_area_segs, ssa_area_segs; int main_area_segs, main_area_sections, main_area_zones; - int hit_ext, total_ext, ext_tree, ext_node; + int hit_largest, 
hit_cached, hit_ext, total_ext, ext_tree, ext_node; int ndirty_node, ndirty_dent, ndirty_dirs, ndirty_meta; int nats, dirty_nats, sits, dirty_sits, fnids; int total_count, utilization; @@ -1862,6 +1864,8 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) #define stat_dec_dirty_dir(sbi) ((sbi)->n_dirty_dirs--) #define stat_inc_total_hit(sbi) (atomic_inc(&(sbi)->total_hit_ext)) #define stat_inc_read_hit(sbi) (atomic_inc(&(sbi)->read_hit_ext)) +#define stat_inc_largest_node_hit(sbi) (atomic_inc(&(sbi)->read_hit_largest)) +#define stat_inc_cached_node_hit(sbi) (atomic_inc(&(sbi)->read_hit_cached)) #define stat_inc_inline_xattr(inode) \ do { \ if (f2fs_has_inline_xattr(inode)) \ @@ -1942,6 +1946,8 @@ void f2fs_destroy_root_stats(void); #define stat_dec_dirty_dir(sbi) #define stat_inc_total_hit(sb) #define stat_inc_read_hit(sb) +#define stat_inc_largest_node_hit(sbi) +#define stat_inc_cached_node_hit(sbi) #define stat_inc_inline_xattr(inode) #define stat_dec_inline_xattr(inode) #define stat_inc_inline_inode(inode) -- cgit v1.2.3 From 029e13cc3221be4bc46909225142277fee52c37e Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 19 Aug 2015 19:13:25 +0800 Subject: f2fs: adjust showing of extent cache stat This patch alters to replace total hit stat with rbtree hit stat, and then adjust showing of extent cache stat: Hit Count: L1-1: for largest node hit count; L1-2: for last cached node hit count; L2: for extent node hit after lookuping in rbtree. Hit Ratio: ratio (hit count / total lookup count) Inner Struct Count: tree count, node count. 
Before: Extent Hit Ratio: 0 / 2 Extent Tree Count: 3 Extent Node Count: 2 Patched: Extent Cache: - Hit Count: L1-1:4871 L1-2:2074 L2:208 - Hit Ratio: 1% (7153 / 550751) - Inner Struct Count: tree: 26560, node: 11824 Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/debug.c | 18 ++++++++++++------ fs/f2fs/extent_cache.c | 10 +++++----- fs/f2fs/f2fs.h | 9 +++++---- 3 files changed, 22 insertions(+), 15 deletions(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 1a1a4c67a9bf..d013d8479753 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -35,7 +35,8 @@ static void update_general_status(struct f2fs_sb_info *sbi) /* validation check of the segment numbers */ si->hit_largest = atomic_read(&sbi->read_hit_largest); si->hit_cached = atomic_read(&sbi->read_hit_cached); - si->hit_ext = atomic_read(&sbi->read_hit_ext); + si->hit_rbtree = atomic_read(&sbi->read_hit_rbtree); + si->hit_total = si->hit_largest + si->hit_cached + si->hit_rbtree; si->total_ext = atomic_read(&sbi->total_hit_ext); si->ext_tree = sbi->total_ext_tree; si->ext_node = atomic_read(&sbi->total_ext_node); @@ -281,11 +282,16 @@ static int stat_show(struct seq_file *s, void *v) si->bg_data_blks); seq_printf(s, " - node blocks : %d (%d)\n", si->node_blks, si->bg_node_blks); - seq_printf(s, "\nExtent Hit Ratio: L1-1:%d L1-2:%d L2:%d / %d\n", + seq_puts(s, "\nExtent Cache:\n"); + seq_printf(s, " - Hit Count: L1-1:%d L1-2:%d L2:%d\n", si->hit_largest, si->hit_cached, - si->hit_ext, si->total_ext); - seq_printf(s, "\nExtent Tree Count: %d\n", si->ext_tree); - seq_printf(s, "\nExtent Node Count: %d\n", si->ext_node); + si->hit_rbtree); + seq_printf(s, " - Hit Ratio: %d%% (%d / %d)\n", + !si->total_ext ? 
0 : + (si->hit_total * 100) / si->total_ext, + si->hit_total, si->total_ext); + seq_printf(s, " - Inner Struct Count: tree: %d, node: %d\n", + si->ext_tree, si->ext_node); seq_puts(s, "\nBalancing F2FS Async:\n"); seq_printf(s, " - inmem: %4d, wb: %4d\n", si->inmem_pages, si->wb_pages); @@ -373,7 +379,7 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi) sbi->stat_info = si; atomic_set(&sbi->total_hit_ext, 0); - atomic_set(&sbi->read_hit_ext, 0); + atomic_set(&sbi->read_hit_rbtree, 0); atomic_set(&sbi->read_hit_largest, 0); atomic_set(&sbi->read_hit_cached, 0); diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index 5cf217faed1f..d11735aa3cac 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -99,12 +99,14 @@ static struct extent_node *__lookup_extent_tree(struct f2fs_sb_info *sbi, while (node) { en = rb_entry(node, struct extent_node, rb_node); - if (fofs < en->ei.fofs) + if (fofs < en->ei.fofs) { node = node->rb_left; - else if (fofs >= en->ei.fofs + en->ei.len) + } else if (fofs >= en->ei.fofs + en->ei.len) { node = node->rb_right; - else + } else { + stat_inc_rbtree_node_hit(sbi); return en; + } } return NULL; } @@ -281,7 +283,6 @@ static bool f2fs_lookup_extent_tree(struct inode *inode, pgoff_t pgofs, et->largest.fofs + et->largest.len > pgofs) { *ei = et->largest; ret = true; - stat_inc_read_hit(sbi); stat_inc_largest_node_hit(sbi); goto out; } @@ -295,7 +296,6 @@ static bool f2fs_lookup_extent_tree(struct inode *inode, pgoff_t pgofs, et->cached_en = en; spin_unlock(&sbi->extent_lock); ret = true; - stat_inc_read_hit(sbi); } out: stat_inc_total_hit(sbi); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index de20387ae225..66410178aba1 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -788,7 +788,7 @@ struct f2fs_sb_info { unsigned int block_count[2]; /* # of allocated blocks */ atomic_t inplace_count; /* # of inplace update */ atomic_t total_hit_ext; /* # of lookup extent cache */ - atomic_t read_hit_ext; /* # of hit extent cache */ + 
atomic_t read_hit_rbtree; /* # of hit rbtree extent node */ atomic_t read_hit_largest; /* # of hit largest extent node */ atomic_t read_hit_cached; /* # of hit cached extent node */ atomic_t inline_xattr; /* # of inline_xattr inodes */ @@ -1826,7 +1826,8 @@ struct f2fs_stat_info { struct f2fs_sb_info *sbi; int all_area_segs, sit_area_segs, nat_area_segs, ssa_area_segs; int main_area_segs, main_area_sections, main_area_zones; - int hit_largest, hit_cached, hit_ext, total_ext, ext_tree, ext_node; + int hit_largest, hit_cached, hit_rbtree, hit_total, total_ext; + int ext_tree, ext_node; int ndirty_node, ndirty_dent, ndirty_dirs, ndirty_meta; int nats, dirty_nats, sits, dirty_sits, fnids; int total_count, utilization; @@ -1863,7 +1864,7 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) #define stat_inc_dirty_dir(sbi) ((sbi)->n_dirty_dirs++) #define stat_dec_dirty_dir(sbi) ((sbi)->n_dirty_dirs--) #define stat_inc_total_hit(sbi) (atomic_inc(&(sbi)->total_hit_ext)) -#define stat_inc_read_hit(sbi) (atomic_inc(&(sbi)->read_hit_ext)) +#define stat_inc_rbtree_node_hit(sbi) (atomic_inc(&(sbi)->read_hit_rbtree)) #define stat_inc_largest_node_hit(sbi) (atomic_inc(&(sbi)->read_hit_largest)) #define stat_inc_cached_node_hit(sbi) (atomic_inc(&(sbi)->read_hit_cached)) #define stat_inc_inline_xattr(inode) \ @@ -1945,7 +1946,7 @@ void f2fs_destroy_root_stats(void); #define stat_inc_dirty_dir(sbi) #define stat_dec_dirty_dir(sbi) #define stat_inc_total_hit(sb) -#define stat_inc_read_hit(sb) +#define stat_inc_rbtree_node_hit(sb) #define stat_inc_largest_node_hit(sbi) #define stat_inc_cached_node_hit(sbi) #define stat_inc_inline_xattr(inode) -- cgit v1.2.3 From a6f7834594a284316b38d0885b2ee1ab47899dbc Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 19 Aug 2015 19:14:15 +0800 Subject: f2fs: kill dead code in __insert_extent_tree After commit 0f825ee6e873 ("f2fs: add new interfaces for extent tree"), f2fs_init_extent_tree becomes the only caller of 
__insert_extent_tree, and in f2fs_init_extent_tree, we will only insert extent node in an empty tree, so __try_{back,front}_merge in __insert_extent_tree will never be called. This patch removes these dead codes, besides, rename __insert_extent_tree to __init_extent_tree for readability. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/extent_cache.c | 82 ++++---------------------------------------------- 1 file changed, 6 insertions(+), 76 deletions(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index d11735aa3cac..5b6139f57841 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -111,87 +111,17 @@ static struct extent_node *__lookup_extent_tree(struct f2fs_sb_info *sbi, return NULL; } -static struct extent_node *__try_back_merge(struct f2fs_sb_info *sbi, - struct extent_tree *et, struct extent_node *en) -{ - struct extent_node *prev; - struct rb_node *node; - - node = rb_prev(&en->rb_node); - if (!node) - return NULL; - - prev = rb_entry(node, struct extent_node, rb_node); - if (__is_back_mergeable(&en->ei, &prev->ei)) { - en->ei.fofs = prev->ei.fofs; - en->ei.blk = prev->ei.blk; - en->ei.len += prev->ei.len; - __detach_extent_node(sbi, et, prev); - return prev; - } - return NULL; -} - -static struct extent_node *__try_front_merge(struct f2fs_sb_info *sbi, - struct extent_tree *et, struct extent_node *en) -{ - struct extent_node *next; - struct rb_node *node; - - node = rb_next(&en->rb_node); - if (!node) - return NULL; - - next = rb_entry(node, struct extent_node, rb_node); - if (__is_front_mergeable(&en->ei, &next->ei)) { - en->ei.len += next->ei.len; - __detach_extent_node(sbi, et, next); - return next; - } - return NULL; -} - -static struct extent_node *__insert_extent_tree(struct f2fs_sb_info *sbi, - struct extent_tree *et, struct extent_info *ei, - struct extent_node **den) +static struct extent_node *__init_extent_tree(struct f2fs_sb_info *sbi, + struct extent_tree *et, struct extent_info 
*ei) { struct rb_node **p = &et->root.rb_node; - struct rb_node *parent = NULL; struct extent_node *en; - while (*p) { - parent = *p; - en = rb_entry(parent, struct extent_node, rb_node); - - if (ei->fofs < en->ei.fofs) { - if (__is_front_mergeable(ei, &en->ei)) { - f2fs_bug_on(sbi, !den); - en->ei.fofs = ei->fofs; - en->ei.blk = ei->blk; - en->ei.len += ei->len; - *den = __try_back_merge(sbi, et, en); - goto update_out; - } - p = &(*p)->rb_left; - } else if (ei->fofs >= en->ei.fofs + en->ei.len) { - if (__is_back_mergeable(ei, &en->ei)) { - f2fs_bug_on(sbi, !den); - en->ei.len += ei->len; - *den = __try_front_merge(sbi, et, en); - goto update_out; - } - p = &(*p)->rb_right; - } else { - f2fs_bug_on(sbi, 1); - } - } - - en = __attach_extent_node(sbi, et, ei, parent, p); + en = __attach_extent_node(sbi, et, ei, NULL, p); if (!en) return NULL; -update_out: - if (en->ei.len > et->largest.len) - et->largest = en->ei; + + et->largest = en->ei; et->cached_en = en; return en; } @@ -255,7 +185,7 @@ void f2fs_init_extent_tree(struct inode *inode, struct f2fs_extent *i_ext) if (et->count) goto out; - en = __insert_extent_tree(sbi, et, &ei, NULL); + en = __init_extent_tree(sbi, et, &ei); if (en) { spin_lock(&sbi->extent_lock); list_add_tail(&en->list, &sbi->extent_list); -- cgit v1.2.3 From ef05e221995057a8588cad675992ca2cb47e9891 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 19 Aug 2015 19:15:09 +0800 Subject: f2fs: split __insert_extent_tree_ret for readability This patch splits __insert_extent_tree_ret into __try_merge_extent_node & __insert_extent_tree for code readability. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/extent_cache.c | 49 +++++++++++++++++++++++++++++-------------------- 1 file changed, 29 insertions(+), 20 deletions(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index 5b6139f57841..ab26728736eb 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -294,29 +294,22 @@ static struct extent_node *__lookup_extent_tree_ret(struct extent_tree *et, return NULL; } -static struct extent_node *__insert_extent_tree_ret(struct f2fs_sb_info *sbi, +static struct extent_node *__try_merge_extent_node(struct f2fs_sb_info *sbi, struct extent_tree *et, struct extent_info *ei, struct extent_node **den, struct extent_node *prev_ex, - struct extent_node *next_ex, - struct rb_node **insert_p, - struct rb_node *insert_parent) + struct extent_node *next_ex) { - struct rb_node **p = &et->root.rb_node; - struct rb_node *parent = NULL; struct extent_node *en = NULL; - int merged = 0; if (prev_ex && __is_back_mergeable(ei, &prev_ex->ei)) { - f2fs_bug_on(sbi, !den); - merged = 1; prev_ex->ei.len += ei->len; ei = &prev_ex->ei; en = prev_ex; } + if (next_ex && __is_front_mergeable(ei, &next_ex->ei)) { - f2fs_bug_on(sbi, !den); - if (merged++) { + if (en) { __detach_extent_node(sbi, et, prev_ex); *den = prev_ex; } @@ -325,8 +318,23 @@ static struct extent_node *__insert_extent_tree_ret(struct f2fs_sb_info *sbi, next_ex->ei.len += ei->len; en = next_ex; } - if (merged) - goto update_out; + + if (en) { + if (en->ei.len > et->largest.len) + et->largest = en->ei; + et->cached_en = en; + } + return en; +} + +static struct extent_node *__insert_extent_tree(struct f2fs_sb_info *sbi, + struct extent_tree *et, struct extent_info *ei, + struct rb_node **insert_p, + struct rb_node *insert_parent) +{ + struct rb_node **p = &et->root.rb_node; + struct rb_node *parent = NULL; + struct extent_node *en = NULL; if (insert_p && insert_parent) { parent = insert_parent; @@ -349,7 +357,7 @@ do_insert: 
en = __attach_extent_node(sbi, et, ei, parent, p); if (!en) return NULL; -update_out: + if (en->ei.len > et->largest.len) et->largest = en->ei; et->cached_en = en; @@ -401,8 +409,7 @@ static bool f2fs_update_extent_tree(struct inode *inode, pgoff_t fofs, if (fofs - dei.fofs >= F2FS_MIN_EXTENT_LEN) { set_extent_info(&ei, dei.fofs, dei.blk, fofs - dei.fofs); - en1 = __insert_extent_tree_ret(sbi, et, &ei, NULL, - NULL, NULL, NULL, NULL); + en1 = __insert_extent_tree(sbi, et, &ei, NULL, NULL); } /* insert right part of split extent into cache */ @@ -410,8 +417,7 @@ static bool f2fs_update_extent_tree(struct inode *inode, pgoff_t fofs, if (endofs - fofs >= F2FS_MIN_EXTENT_LEN) { set_extent_info(&ei, fofs + 1, fofs - dei.fofs + dei.blk + 1, endofs - fofs); - en2 = __insert_extent_tree_ret(sbi, et, &ei, NULL, - NULL, NULL, NULL, NULL); + en2 = __insert_extent_tree(sbi, et, &ei, NULL, NULL); } } @@ -419,8 +425,11 @@ update_extent: /* 3. update extent in extent cache */ if (blkaddr) { set_extent_info(&ei, fofs, blkaddr, 1); - en3 = __insert_extent_tree_ret(sbi, et, &ei, &den, - prev_ex, next_ex, insert_p, insert_parent); + en3 = __try_merge_extent_node(sbi, et, &ei, &den, + prev_ex, next_ex); + if (!en3) + en3 = __insert_extent_tree(sbi, et, &ei, + insert_p, insert_parent); /* give up extent_cache, if split and small updates happen */ if (dei.len >= 1 && -- cgit v1.2.3 From dac2ddefe62841efc0b6cdcb0bbf3e3594aa01bf Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 19 Aug 2015 19:16:09 +0800 Subject: f2fs: lookup neighbor extent nodes for merging later In __lookup_extent_tree_ret we will not try to find neighbor nodes if we find the target node, in this condition, we will lost the chance to merge the new mapping with exist extent node later. 
So the extent cache of an inode will become fragmented after overwriting an existing file; we can see the number of extent nodes increase sharply in the following test case: dd if=/dev/zero of=/mnt/f2fs/4m bs=4K count=1024 Extent Cache: - Hit Count: L1-1:0 L1-2:0 L2:0 - Hit Ratio: 0% (0 / 3072) - Inner Struct Count: tree: 1, node: 1 dd if=/dev/zero of=/mnt/f2fs/4m bs=4K count=1024 conv=notrunc Extent Cache: - Hit Count: L1-1:2048 L1-2:0 L2:0 - Hit Ratio: 33% (2048 / 6144) - Inner Struct Count: tree: 1, node: 961 This patch fixes this by looking up the neighbors of the target node for later merging. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/extent_cache.c | 34 ++++++++++++++++++++++++++++------ 1 file changed, 28 insertions(+), 6 deletions(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index ab26728736eb..dcfeb43a5975 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -254,13 +254,21 @@ static struct extent_node *__lookup_extent_tree_ret(struct extent_tree *et, { struct rb_node **pnode = &et->root.rb_node; struct rb_node *parent = NULL, *tmp_node; - struct extent_node *en; + struct extent_node *en = et->cached_en; - if (et->cached_en) { - struct extent_info *cei = &et->cached_en->ei; + *insert_p = NULL; + *insert_parent = NULL; + *prev_ex = NULL; + *next_ex = NULL; + + if (RB_EMPTY_ROOT(&et->root)) + return NULL; + + if (en) { + struct extent_info *cei = &en->ei; if (cei->fofs <= fofs && cei->fofs + cei->len > fofs) - return et->cached_en; + goto lookup_neighbors; } while (*pnode) { @@ -272,7 +280,7 @@ static struct extent_node *__lookup_extent_tree_ret(struct extent_tree *et, else if (fofs >= en->ei.fofs + en->ei.len) pnode = &(*pnode)->rb_right; else - return en; + goto lookup_neighbors; } *insert_p = pnode; @@ -290,8 +298,22 @@ static struct extent_node *__lookup_extent_tree_ret(struct extent_tree *et, tmp_node = rb_prev(parent); *prev_ex = tmp_node ? 
rb_entry(tmp_node, struct extent_node, rb_node) : NULL; - return NULL; + +lookup_neighbors: + if (fofs == en->ei.fofs) { + /* lookup prev node for merging backward later */ + tmp_node = rb_prev(&en->rb_node); + *prev_ex = tmp_node ? + rb_entry(tmp_node, struct extent_node, rb_node) : NULL; + } + if (fofs == en->ei.fofs + en->ei.len - 1) { + /* lookup next node for merging frontward later */ + tmp_node = rb_next(&en->rb_node); + *next_ex = tmp_node ? + rb_entry(tmp_node, struct extent_node, rb_node) : NULL; + } + return en; } static struct extent_node *__try_merge_extent_node(struct f2fs_sb_info *sbi, -- cgit v1.2.3 From 80c545055dc7c1f7f487176fe0aac17896a4b7af Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 20 Aug 2015 08:51:56 -0700 Subject: f2fs: use __GFP_NOFAIL to avoid infinite loop __GFP_NOFAIL can avoid retrying the whole path of kmem_cache_alloc and bio_alloc. And, it also fixes the use cases of GFP_ATOMIC correctly. Suggested-by: Chao Yu Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 21 ++++++++------------- fs/f2fs/f2fs.h | 16 +++++----------- fs/f2fs/node.c | 4 ++-- fs/f2fs/segment.c | 2 +- 4 files changed, 16 insertions(+), 27 deletions(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 890e4d4c39d7..c5a38e352a80 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -336,26 +336,18 @@ const struct address_space_operations f2fs_meta_aops = { static void __add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type) { struct inode_management *im = &sbi->im[type]; - struct ino_entry *e; + struct ino_entry *e, *tmp; + + tmp = f2fs_kmem_cache_alloc(ino_entry_slab, GFP_NOFS); retry: - if (radix_tree_preload(GFP_NOFS)) { - cond_resched(); - goto retry; - } + radix_tree_preload(GFP_NOFS | __GFP_NOFAIL); spin_lock(&im->ino_lock); - e = radix_tree_lookup(&im->ino_root, ino); if (!e) { - e = kmem_cache_alloc(ino_entry_slab, GFP_ATOMIC); - if (!e) { - spin_unlock(&im->ino_lock); - 
radix_tree_preload_end(); - goto retry; - } + e = tmp; if (radix_tree_insert(&im->ino_root, ino, e)) { spin_unlock(&im->ino_lock); - kmem_cache_free(ino_entry_slab, e); radix_tree_preload_end(); goto retry; } @@ -368,6 +360,9 @@ retry: } spin_unlock(&im->ino_lock); radix_tree_preload_end(); + + if (e != tmp) + kmem_cache_free(ino_entry_slab, tmp); } static void __remove_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 66410178aba1..ece5e704dfd0 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1252,13 +1252,10 @@ static inline void *f2fs_kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) { void *entry; -retry: - entry = kmem_cache_alloc(cachep, flags); - if (!entry) { - cond_resched(); - goto retry; - } + entry = kmem_cache_alloc(cachep, flags); + if (!entry) + entry = kmem_cache_alloc(cachep, flags | __GFP_NOFAIL); return entry; } @@ -1267,12 +1264,9 @@ static inline struct bio *f2fs_bio_alloc(int npages) struct bio *bio; /* No failure on bio allocation */ -retry: bio = bio_alloc(GFP_NOIO, npages); - if (!bio) { - cond_resched(); - goto retry; - } + if (!bio) + bio = bio_alloc(GFP_NOIO | __GFP_NOFAIL, npages); return bio; } diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 6bef5a2788b4..777066d29fa8 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -159,7 +159,7 @@ static void __set_nat_cache_dirty(struct f2fs_nm_info *nm_i, head = radix_tree_lookup(&nm_i->nat_set_root, set); if (!head) { - head = f2fs_kmem_cache_alloc(nat_entry_set_slab, GFP_ATOMIC); + head = f2fs_kmem_cache_alloc(nat_entry_set_slab, GFP_NOFS); INIT_LIST_HEAD(&head->entry_list); INIT_LIST_HEAD(&head->set_list); @@ -246,7 +246,7 @@ static struct nat_entry *grab_nat_entry(struct f2fs_nm_info *nm_i, nid_t nid) { struct nat_entry *new; - new = f2fs_kmem_cache_alloc(nat_entry_slab, GFP_ATOMIC); + new = f2fs_kmem_cache_alloc(nat_entry_slab, GFP_NOFS); f2fs_radix_tree_insert(&nm_i->nat_root, nid, new); memset(new, 0, sizeof(struct 
nat_entry)); nat_set_nid(new, nid); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 6273e2cde93e..78e6d0696847 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1753,7 +1753,7 @@ static struct page *get_next_sit_page(struct f2fs_sb_info *sbi, static struct sit_entry_set *grab_sit_entry_set(void) { struct sit_entry_set *ses = - f2fs_kmem_cache_alloc(sit_entry_set_slab, GFP_ATOMIC); + f2fs_kmem_cache_alloc(sit_entry_set_slab, GFP_NOFS); ses->entry_cnt = 0; INIT_LIST_HEAD(&ses->set_list); -- cgit v1.2.3 From f7409d0fae7a02ea6c8195f75ad73866d5dea617 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 21 Aug 2015 23:37:18 -0700 Subject: f2fs: fix wrong pointer access during try_to_free_nids If we release the lock in list_for_each_entry_safe, we can lose the tmp pointer by alloc_nid. Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 777066d29fa8..0867325e288f 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1664,11 +1664,9 @@ int try_to_free_nids(struct f2fs_sb_info *sbi, int nr_shrink) if (i->state == NID_ALLOC) continue; __del_from_free_nid_list(nm_i, i); - nm_i->fcnt--; - spin_unlock(&nm_i->free_nid_list_lock); kmem_cache_free(free_nid_slab, i); + nm_i->fcnt--; nr_shrink--; - spin_lock(&nm_i->free_nid_list_lock); } spin_unlock(&nm_i->free_nid_list_lock); mutex_unlock(&nm_i->build_lock); -- cgit v1.2.3 From 6a6788576dac56135bf98ad974a038b0afb1a499 Mon Sep 17 00:00:00 2001 From: Zhang Zhen Date: Mon, 24 Aug 2015 10:41:32 +0800 Subject: f2fs: atomically set inode->i_flags According to commit 5f16f3225b06 ("ext4: atomically set inode->i_flags in ext4_set_inode_flags()"). 
Signed-off-by: Zhang Zhen Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/inode.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 83354433d4d1..d1b03d01b7e3 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -12,7 +12,6 @@ #include #include #include -#include #include "f2fs.h" #include "node.h" @@ -34,8 +33,8 @@ void f2fs_set_inode_flags(struct inode *inode) new_fl |= S_NOATIME; if (flags & FS_DIRSYNC_FL) new_fl |= S_DIRSYNC; - set_mask_bits(&inode->i_flags, - S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC, new_fl); + inode_set_flags(inode, new_fl, + S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); } static void __get_inode_rdev(struct inode *inode, struct f2fs_inode *ri) -- cgit v1.2.3 From 4ec17d688d74b6b7cb10043c57ff4818cde2b0ca Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 24 Aug 2015 17:36:25 +0800 Subject: f2fs: avoid unneeded initializing when converting inline dentry When converting an inline dentry, we zero out the target dentry page before duplicating the data of the inline dentry into the target page, which becomes overhead since the inline dentry size is not small. So this patch tries to remove the unneeded initialization in the space of the target dentry page.
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/inline.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 79d18d5c1fae..3d143be42895 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -384,13 +384,21 @@ static int f2fs_convert_inline_dir(struct inode *dir, struct page *ipage, goto out; f2fs_wait_on_page_writeback(page, DATA); - zero_user_segment(page, 0, PAGE_CACHE_SIZE); + zero_user_segment(page, MAX_INLINE_DATA, PAGE_CACHE_SIZE); dentry_blk = kmap_atomic(page); /* copy data from inline dentry block to new dentry block */ memcpy(dentry_blk->dentry_bitmap, inline_dentry->dentry_bitmap, INLINE_DENTRY_BITMAP_SIZE); + memset(dentry_blk->dentry_bitmap + INLINE_DENTRY_BITMAP_SIZE, 0, + SIZE_OF_DENTRY_BITMAP - INLINE_DENTRY_BITMAP_SIZE); + /* + * we do not need to zero out remainder part of dentry and filename + * field, since we have used bitmap for marking the usage status of + * them, besides, we can also ignore copying/zeroing reserved space + * of dentry block, because them haven't been used so far. + */ memcpy(dentry_blk->dentry, inline_dentry->dentry, sizeof(struct f2fs_dir_entry) * NR_INLINE_DENTRY); memcpy(dentry_blk->filename, inline_dentry->filename, -- cgit v1.2.3 From b01548919c33767bc457390fa3c41aedc273bfff Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 24 Aug 2015 17:39:42 +0800 Subject: f2fs: handle f2fs_truncate error correctly This patch fixes to return error number of f2fs_truncate, so that we can handle the error correctly in callers. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 2 +- fs/f2fs/file.c | 26 +++++++++++++++++--------- 2 files changed, 18 insertions(+), 10 deletions(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index ece5e704dfd0..806439f1c886 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1595,7 +1595,7 @@ static inline bool f2fs_may_extent_tree(struct inode *inode) int f2fs_sync_file(struct file *, loff_t, loff_t, int); void truncate_data_blocks(struct dnode_of_data *); int truncate_blocks(struct inode *, u64, bool); -void f2fs_truncate(struct inode *, bool); +int f2fs_truncate(struct inode *, bool); int f2fs_getattr(struct vfsmount *, struct dentry *, struct kstat *); int f2fs_setattr(struct dentry *, struct iattr *); int truncate_hole(struct inode *, pgoff_t, pgoff_t); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 7faafb5043e0..86a5c76eb106 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -579,24 +579,30 @@ out: return err; } -void f2fs_truncate(struct inode *inode, bool lock) +int f2fs_truncate(struct inode *inode, bool lock) { + int err; + if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))) - return; + return 0; trace_f2fs_truncate(inode); /* we should check inline_data size */ if (f2fs_has_inline_data(inode) && !f2fs_may_inline_data(inode)) { - if (f2fs_convert_inline_inode(inode)) - return; + err = f2fs_convert_inline_inode(inode); + if (err) + return err; } - if (!truncate_blocks(inode, i_size_read(inode), lock)) { - inode->i_mtime = inode->i_ctime = CURRENT_TIME; - mark_inode_dirty(inode); - } + err = truncate_blocks(inode, i_size_read(inode), lock); + if (err) + return err; + + inode->i_mtime = inode->i_ctime = CURRENT_TIME; + mark_inode_dirty(inode); + return 0; } int f2fs_getattr(struct vfsmount *mnt, @@ -656,7 +662,9 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) if (attr->ia_size <= i_size_read(inode)) { truncate_setsize(inode, attr->ia_size); - 
f2fs_truncate(inode, true); + err = f2fs_truncate(inode, true); + if (err) + return err; f2fs_balance_fs(F2FS_I_SB(inode)); } else { /* -- cgit v1.2.3 From 13ec7297e5331f2754d7629a068c619c41f20e56 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 24 Aug 2015 17:40:45 +0800 Subject: f2fs: fix to release inode correctly In following call stack, if unfortunately we lose all chances to truncate inode page in remove_inode_page, eventually we will add the nid allocated previously into free nid cache, this nid is with NID_NEW status and with NEW_ADDR in its blkaddr pointer: - f2fs_create - f2fs_add_link - __f2fs_add_link - init_inode_metadata - new_inode_page - new_node_page - set_node_addr(, NEW_ADDR) - f2fs_init_acl failed - remove_inode_page failed - handle_failed_inode - remove_inode_page failed - iput - f2fs_evict_inode - remove_inode_page failed - alloc_nid_failed cache a nid with valid blkaddr: NEW_ADDR This may not only cause resource leak of previous inode, but also may cause incorrect use of the previous blkaddr which is located in NO.nid node entry when this nid is reused by others. This patch tries to add this inode to orphan list if we fail to truncate inode, so that we can obtain a second chance to release it in orphan recovery flow. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 2 +- fs/f2fs/inode.c | 53 ++++++++++++++++++++++++++++++++++++++++++++++------- fs/f2fs/node.c | 14 +++++++++----- 3 files changed, 56 insertions(+), 13 deletions(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 806439f1c886..69827ee8a0ee 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1687,7 +1687,7 @@ int get_dnode_of_data(struct dnode_of_data *, pgoff_t, int); int truncate_inode_blocks(struct inode *, pgoff_t); int truncate_xattr_node(struct inode *, struct page *); int wait_on_node_pages_writeback(struct f2fs_sb_info *, nid_t); -void remove_inode_page(struct inode *); +int remove_inode_page(struct inode *); struct page *new_inode_page(struct inode *); struct page *new_node_page(struct dnode_of_data *, unsigned int, struct page *); void ra_node_page(struct f2fs_sb_info *, nid_t); diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index d1b03d01b7e3..35aae65b3e5d 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -317,6 +317,7 @@ void f2fs_evict_inode(struct inode *inode) struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_inode_info *fi = F2FS_I(inode); nid_t xnid = fi->i_xattr_nid; + int err = 0; /* some remained atomic pages should discarded */ if (f2fs_is_atomic_file(inode)) @@ -342,11 +343,13 @@ void f2fs_evict_inode(struct inode *inode) i_size_write(inode, 0); if (F2FS_HAS_BLOCKS(inode)) - f2fs_truncate(inode, true); + err = f2fs_truncate(inode, true); - f2fs_lock_op(sbi); - remove_inode_page(inode); - f2fs_unlock_op(sbi); + if (!err) { + f2fs_lock_op(sbi); + err = remove_inode_page(inode); + f2fs_unlock_op(sbi); + } sb_end_intwrite(inode->i_sb); no_delete: @@ -362,9 +365,26 @@ no_delete: if (is_inode_flag_set(fi, FI_UPDATE_WRITE)) add_dirty_inode(sbi, inode->i_ino, UPDATE_INO); if (is_inode_flag_set(fi, FI_FREE_NID)) { - alloc_nid_failed(sbi, inode->i_ino); + if (err && err != -ENOENT) + alloc_nid_done(sbi, inode->i_ino); + else + 
alloc_nid_failed(sbi, inode->i_ino); clear_inode_flag(fi, FI_FREE_NID); } + + if (err && err != -ENOENT) { + if (!exist_written_data(sbi, inode->i_ino, ORPHAN_INO)) { + /* + * get here because we failed to release resource + * of inode previously, reminder our user to run fsck + * for fixing. + */ + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_msg(sbi->sb, KERN_WARNING, + "inode (ino:%lu) resource leak, run fsck " + "to fix this issue!", inode->i_ino); + } + } out_clear: #ifdef CONFIG_F2FS_FS_ENCRYPTION if (fi->i_crypt_info) @@ -377,6 +397,7 @@ out_clear: void handle_failed_inode(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + int err = 0; clear_nlink(inode); make_bad_inode(inode); @@ -384,9 +405,27 @@ void handle_failed_inode(struct inode *inode) i_size_write(inode, 0); if (F2FS_HAS_BLOCKS(inode)) - f2fs_truncate(inode, false); + err = f2fs_truncate(inode, false); + + if (!err) + err = remove_inode_page(inode); - remove_inode_page(inode); + /* + * if we skip truncate_node in remove_inode_page bacause we failed + * before, it's better to find another way to release resource of + * this inode (e.g. valid block count, node block or nid). Here we + * choose to add this inode to orphan list, so that we can call iput + * for releasing in orphan recovery flow. + * + * Note: we should add inode to orphan list before f2fs_unlock_op() + * so we can prevent losing this orphan when encoutering checkpoint + * and following suddenly power-off. + */ + if (err && err != -ENOENT) { + err = acquire_orphan_inode(sbi); + if (!err) + add_orphan_inode(sbi, inode->i_ino); + } set_inode_flag(F2FS_I(inode), FI_FREE_NID); f2fs_unlock_op(sbi); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 0867325e288f..27d1a74dd6f3 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -902,17 +902,20 @@ int truncate_xattr_node(struct inode *inode, struct page *page) * Caller should grab and release a rwsem by calling f2fs_lock_op() and * f2fs_unlock_op(). 
*/ -void remove_inode_page(struct inode *inode) +int remove_inode_page(struct inode *inode) { struct dnode_of_data dn; + int err; set_new_dnode(&dn, inode, NULL, NULL, inode->i_ino); - if (get_dnode_of_data(&dn, 0, LOOKUP_NODE)) - return; + err = get_dnode_of_data(&dn, 0, LOOKUP_NODE); + if (err) + return err; - if (truncate_xattr_node(inode, dn.inode_page)) { + err = truncate_xattr_node(inode, dn.inode_page); + if (err) { f2fs_put_dnode(&dn); - return; + return err; } /* remove potential inline_data blocks */ @@ -926,6 +929,7 @@ void remove_inode_page(struct inode *inode) /* will put inode & node pages */ truncate_node(&dn); + return 0; } struct page *new_inode_page(struct inode *inode) -- cgit v1.2.3 From 19b2c30d3cce928010138cae4b9e57c388aa065c Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 26 Aug 2015 20:34:48 +0800 Subject: f2fs: update extent tree in batches This patch introduce a new helper f2fs_update_extent_tree_range which can do extent mapping update at a specified range. The main idea is: 1) punch all mapping info in extent node(s) which are at a specified range; 2) try to merge new extent mapping with adjacent node, or failing that, insert the mapping into extent tree as a new node. In order to see the benefit, I add a function for stating time stamping count as below: uint64_t rdtsc(void) { uint32_t lo, hi; __asm__ __volatile__ ("rdtsc" : "=a" (lo), "=d" (hi)); return (uint64_t)hi << 32 | lo; } My test environment is: ubuntu, intel i7-3770, 16G memory, 256g micron ssd. truncation path: update extent cache from truncate_data_blocks_range non-truncataion path: update extent cache from other paths total: all update paths a) Removing 128MB file which has one extent node mapping whole range of file: 1. dd if=/dev/zero of=/mnt/f2fs/128M bs=1M count=128 2. sync 3. 
rm /mnt/f2fs/128M Before: total count average truncation: 7651022 32768 233.49 Patched: total count average truncation: 3321 33 100.64 b) fsstress: fsstress -d /mnt/f2fs -l 5 -n 100 -p 20 Test times: 5 times. Before: total count average truncation: 5812480.6 20911.6 277.95 non-truncation: 7783845.6 13440.8 579.12 total: 13596326.2 34352.4 395.79 Patched: total count average truncation: 1281283.0 3041.6 421.25 non-truncation: 7355844.4 13662.8 538.38 total: 8637127.4 16704.4 517.06 1) For the updates in truncation path: - we can see that updating in batches clearly reduces both the total tsc and the update count; - besides, a single batched update punches multiple extent nodes in a loop, which results in executing more operations, so our average tsc increases significantly. 2) For the updates in non-truncation path: - there is a little improvement, because for the scenario where we just need to update the head or tail of an extent node, the new interface updates the info in the extent node directly, rather than removing the original extent node for updating and then inserting the updated one into the cache as a new node.
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/extent_cache.c | 217 +++++++++++++++++++++++++++++++++++-------------- fs/f2fs/f2fs.h | 2 + fs/f2fs/file.c | 12 ++- 3 files changed, 170 insertions(+), 61 deletions(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index dcfeb43a5975..e6b245718ef0 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -386,23 +386,21 @@ do_insert: return en; } -/* return true, if on-disk extent should be updated */ -static bool f2fs_update_extent_tree(struct inode *inode, pgoff_t fofs, - block_t blkaddr) +unsigned int f2fs_update_extent_tree_range(struct inode *inode, + pgoff_t fofs, block_t blkaddr, unsigned int len) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct extent_tree *et = F2FS_I(inode)->extent_tree; struct extent_node *en = NULL, *en1 = NULL, *en2 = NULL, *en3 = NULL; - struct extent_node *den = NULL, *prev_ex = NULL, *next_ex = NULL; + struct extent_node *prev_en = NULL, *next_en = NULL; struct extent_info ei, dei, prev; struct rb_node **insert_p = NULL, *insert_parent = NULL; - unsigned int endofs; + unsigned int end = fofs + len; + unsigned int pos = (unsigned int)fofs; if (!et) return false; - trace_f2fs_update_extent_tree(inode, fofs, blkaddr); - write_lock(&et->lock); if (is_inode_flag_set(F2FS_I(inode), FI_NO_EXTENT)) { @@ -416,39 +414,143 @@ static bool f2fs_update_extent_tree(struct inode *inode, pgoff_t fofs, /* we do not guarantee that the largest extent is cached all the time */ f2fs_drop_largest_extent(inode, fofs); - /* 1. lookup and remove existing extent info in cache */ - en = __lookup_extent_tree_ret(et, fofs, &prev_ex, &next_ex, + /* 1. lookup first extent node in range [fofs, fofs + len - 1] */ + en = __lookup_extent_tree_ret(et, fofs, &prev_en, &next_en, &insert_p, &insert_parent); - if (!en) - goto update_extent; - - dei = en->ei; - __detach_extent_node(sbi, et, en); - - /* 2. 
if extent can be split, try to split it */ - if (dei.len > F2FS_MIN_EXTENT_LEN) { - /* insert left part of split extent into cache */ - if (fofs - dei.fofs >= F2FS_MIN_EXTENT_LEN) { - set_extent_info(&ei, dei.fofs, dei.blk, - fofs - dei.fofs); - en1 = __insert_extent_tree(sbi, et, &ei, NULL, NULL); + if (!en) { + if (next_en) { + en = next_en; + f2fs_bug_on(sbi, en->ei.fofs <= pos); + pos = en->ei.fofs; + } else { + /* + * skip searching in the tree since there is no + * larger extent node in the cache. + */ + goto update_extent; + } + } + + /* 2. invlidate all extent nodes in range [fofs, fofs + len - 1] */ + while (en) { + struct rb_node *node; + + if (pos >= end) + break; + + dei = en->ei; + en1 = en2 = NULL; + + node = rb_next(&en->rb_node); + + /* + * 2.1 there are four cases when we invalidate blkaddr in extent + * node, |V: valid address, X: will be invalidated| + */ + /* case#1, invalidate right part of extent node |VVVVVXXXXX| */ + if (pos > dei.fofs && end >= dei.fofs + dei.len) { + en->ei.len = pos - dei.fofs; + + if (en->ei.len < F2FS_MIN_EXTENT_LEN) { + __detach_extent_node(sbi, et, en); + insert_p = NULL; + insert_parent = NULL; + goto update; + } + + if (__is_extent_same(&dei, &et->largest)) + et->largest = en->ei; + goto next; + } + + /* case#2, invalidate left part of extent node |XXXXXVVVVV| */ + if (pos <= dei.fofs && end < dei.fofs + dei.len) { + en->ei.fofs = end; + en->ei.blk += end - dei.fofs; + en->ei.len -= end - dei.fofs; + + if (en->ei.len < F2FS_MIN_EXTENT_LEN) { + __detach_extent_node(sbi, et, en); + insert_p = NULL; + insert_parent = NULL; + goto update; + } + + if (__is_extent_same(&dei, &et->largest)) + et->largest = en->ei; + goto next; } - /* insert right part of split extent into cache */ - endofs = dei.fofs + dei.len - 1; - if (endofs - fofs >= F2FS_MIN_EXTENT_LEN) { - set_extent_info(&ei, fofs + 1, - fofs - dei.fofs + dei.blk + 1, endofs - fofs); - en2 = __insert_extent_tree(sbi, et, &ei, NULL, NULL); + __detach_extent_node(sbi, 
et, en); + + /* + * if we remove node in rb-tree, our parent node pointer may + * point the wrong place, discard them. + */ + insert_p = NULL; + insert_parent = NULL; + + /* case#3, invalidate entire extent node |XXXXXXXXXX| */ + if (pos <= dei.fofs && end >= dei.fofs + dei.len) { + if (__is_extent_same(&dei, &et->largest)) + et->largest.len = 0; + goto update; + } + + /* + * case#4, invalidate data in the middle of extent node + * |VVVXXXXVVV| + */ + if (dei.len > F2FS_MIN_EXTENT_LEN) { + unsigned int endofs; + + /* insert left part of split extent into cache */ + if (pos - dei.fofs >= F2FS_MIN_EXTENT_LEN) { + set_extent_info(&ei, dei.fofs, dei.blk, + pos - dei.fofs); + en1 = __insert_extent_tree(sbi, et, &ei, + NULL, NULL); + } + + /* insert right part of split extent into cache */ + endofs = dei.fofs + dei.len; + if (endofs - end >= F2FS_MIN_EXTENT_LEN) { + set_extent_info(&ei, end, + end - dei.fofs + dei.blk, + endofs - end); + en2 = __insert_extent_tree(sbi, et, &ei, + NULL, NULL); + } } +update: + /* 2.2 update in global extent list */ + spin_lock(&sbi->extent_lock); + if (en && !list_empty(&en->list)) + list_del(&en->list); + if (en1) + list_add_tail(&en1->list, &sbi->extent_list); + if (en2) + list_add_tail(&en2->list, &sbi->extent_list); + spin_unlock(&sbi->extent_lock); + + /* 2.3 release extent node */ + if (en) + kmem_cache_free(extent_node_slab, en); +next: + en = node ? rb_entry(node, struct extent_node, rb_node) : NULL; + next_en = en; + if (en) + pos = en->ei.fofs; } update_extent: /* 3. update extent in extent cache */ if (blkaddr) { - set_extent_info(&ei, fofs, blkaddr, 1); + struct extent_node *den = NULL; + + set_extent_info(&ei, fofs, blkaddr, len); en3 = __try_merge_extent_node(sbi, et, &ei, &den, - prev_ex, next_ex); + prev_en, next_en); if (!en3) en3 = __insert_extent_tree(sbi, et, &ei, insert_p, insert_parent); @@ -460,36 +562,21 @@ update_extent: et->largest.len = 0; set_inode_flag(F2FS_I(inode), FI_NO_EXTENT); } - } - /* 4. 
update in global extent list */ - spin_lock(&sbi->extent_lock); - if (en && !list_empty(&en->list)) - list_del(&en->list); - /* - * en1 and en2 split from en, they will become more and more smaller - * fragments after splitting several times. So if the length is smaller - * than F2FS_MIN_EXTENT_LEN, we will not add them into extent tree. - */ - if (en1) - list_add_tail(&en1->list, &sbi->extent_list); - if (en2) - list_add_tail(&en2->list, &sbi->extent_list); - if (en3) { - if (list_empty(&en3->list)) - list_add_tail(&en3->list, &sbi->extent_list); - else - list_move_tail(&en3->list, &sbi->extent_list); - } - if (den && !list_empty(&den->list)) - list_del(&den->list); - spin_unlock(&sbi->extent_lock); + spin_lock(&sbi->extent_lock); + if (en3) { + if (list_empty(&en3->list)) + list_add_tail(&en3->list, &sbi->extent_list); + else + list_move_tail(&en3->list, &sbi->extent_list); + } + if (den && !list_empty(&den->list)) + list_del(&den->list); + spin_unlock(&sbi->extent_lock); - /* 5. release extent node */ - if (en) - kmem_cache_free(extent_node_slab, en); - if (den) - kmem_cache_free(extent_node_slab, den); + if (den) + kmem_cache_free(extent_node_slab, den); + } if (is_inode_flag_set(F2FS_I(inode), FI_NO_EXTENT)) __free_extent_tree(sbi, et, true); @@ -645,10 +732,22 @@ void f2fs_update_extent_cache(struct dnode_of_data *dn) f2fs_bug_on(F2FS_I_SB(dn->inode), dn->data_blkaddr == NEW_ADDR); + fofs = start_bidx_of_node(ofs_of_node(dn->node_page), fi) + dn->ofs_in_node; - if (f2fs_update_extent_tree(dn->inode, fofs, dn->data_blkaddr)) + if (f2fs_update_extent_tree_range(dn->inode, fofs, dn->data_blkaddr, 1)) + sync_inode_page(dn); +} + +void f2fs_update_extent_cache_range(struct dnode_of_data *dn, + pgoff_t fofs, block_t blkaddr, unsigned int len) + +{ + if (!f2fs_may_extent_tree(dn->inode)) + return; + + if (f2fs_update_extent_tree_range(dn->inode, fofs, blkaddr, len)) sync_inode_page(dn); } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 69827ee8a0ee..f1a90ffd7cad 
100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2017,6 +2017,8 @@ unsigned int f2fs_destroy_extent_node(struct inode *); void f2fs_destroy_extent_tree(struct inode *); bool f2fs_lookup_extent_cache(struct inode *, pgoff_t, struct extent_info *); void f2fs_update_extent_cache(struct dnode_of_data *); +void f2fs_update_extent_cache_range(struct dnode_of_data *dn, + pgoff_t, block_t, unsigned int); void init_extent_cache_info(struct f2fs_sb_info *); int __init create_extent_cache(void); void destroy_extent_cache(void); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 86a5c76eb106..8120f8685141 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -445,9 +445,9 @@ static int f2fs_file_open(struct inode *inode, struct file *filp) int truncate_data_blocks_range(struct dnode_of_data *dn, int count) { - int nr_free = 0, ofs = dn->ofs_in_node; struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); struct f2fs_node *raw_node; + int nr_free = 0, ofs = dn->ofs_in_node, len = count; __le32 *addr; raw_node = F2FS_NODE(dn->node_page); @@ -460,14 +460,22 @@ int truncate_data_blocks_range(struct dnode_of_data *dn, int count) dn->data_blkaddr = NULL_ADDR; set_data_blkaddr(dn); - f2fs_update_extent_cache(dn); invalidate_blocks(sbi, blkaddr); if (dn->ofs_in_node == 0 && IS_INODE(dn->node_page)) clear_inode_flag(F2FS_I(dn->inode), FI_FIRST_BLOCK_WRITTEN); nr_free++; } + if (nr_free) { + pgoff_t fofs; + /* + * once we invalidate valid blkaddr in range [ofs, ofs + count], + * we will invalidate all blkaddr in the whole range. 
+ */ + fofs = start_bidx_of_node(ofs_of_node(dn->node_page), + F2FS_I(dn->inode)) + ofs; + f2fs_update_extent_cache_range(dn, fofs, 0, len); dec_valid_block_count(sbi, dn->inode, nr_free); set_page_dirty(dn->node_page); sync_inode_page(dn); -- cgit v1.2.3 From 54d71856428961124be26301b7997f2ad23be520 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 28 Aug 2015 18:18:57 +0800 Subject: f2fs: avoid accessing NULL pointer in f2fs_drop_largest_extent If the extent cache is disabled, we will encounter an oops when triggering direct IO as below: BUG: unable to handle kernel NULL pointer dereference at 0000000c IP: [] f2fs_drop_largest_extent+0xe/0x30 [f2fs] *pdpt = 000000002bb9a001 *pde = 0000000000000000 Oops: 0000 [#1] SMP Modules linked in: f2fs(O) fuse bnep rfcomm bluetooth nfsd dm_crypt nfs_acl auth_rpcgss oid_registry nfs binfmt_misc fscache lockd sunrpc grace snd_intel8x0 snd_ac97_codec ac97_bus snd_pcm snd_seq_midi snd_rawmidi snd_seq_midi_event snd_seq snd_timer snd_seq_device snd soundcore joydev psmouse hid_generic i2c_piix4 serio_raw ppdev mac_hid parport_pc lp parport ext4 jbd2 mbcache usbhid hid e1000 CPU: 3 PID: 3608 Comm: dd Tainted: G O 4.2.0-rc4 #12 Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006 task: ef161600 ti: ebd5e000 task.ti: ebd5e000 EIP: 0060:[] EFLAGS: 00010202 CPU: 3 EIP is at f2fs_drop_largest_extent+0xe/0x30 [f2fs] EAX: 00000000 EBX: ddebc000 ECX: 00000000 EDX: 00000000 ESI: ebd5fdf8 EDI: 00000000 EBP: ebd5fd58 ESP: ebd5fd58 DS: 007b ES: 007b FS: 00d8 GS: 0033 SS: 0068 CR0: 80050033 CR2: 0000000c CR3: 2c24ee40 CR4: 000006f0 Stack: ebd5fda4 f0b8c005 00000000 00000001 00000000 f0b8c430 c816cd68 ddebc000 ddebc088 00001000 00000555 00000555 ffffffff c160bb00 00055501 00000000 00000000 00000100 00000000 ebd5fe20 f0b8c430 00000046 ef161600 00001000 Call Trace: [] __allocate_data_block+0x1a5/0x260 [f2fs] [] ? f2fs_direct_IO+0x370/0x440 [f2fs] [] ?
down_read+0x30/0x50 [] f2fs_direct_IO+0x370/0x440 [f2fs] [] generic_file_direct_write+0xa5/0x260 [] ? current_fs_time+0x18/0x50 [] __generic_file_write_iter+0xbb/0x210 [] ? generic_file_write_iter+0x2f/0x320 [] generic_file_write_iter+0x15c/0x320 [] f2fs_file_write_iter+0x39/0x80 [f2fs] [] __vfs_write+0xa9/0xe0 [] vfs_write+0x97/0x180 [] SyS_write+0x5b/0xd0 [] sysenter_do_call+0x12/0x12 Code: 10 8b 50 1c 89 53 14 eb ca 8d 74 26 00 85 f6 74 86 eb a6 0f 0b 90 8d b4 26 00 00 00 00 55 89 e5 3e 8d 74 26 00 8b 80 d4 02 00 00 <8b> 48 0c 39 d1 77 0e 03 48 14 39 ca 73 07 c7 40 14 00 00 00 00 EIP: [] f2fs_drop_largest_extent+0xe/0x30 [f2fs] SS:ESP 0068:ebd5fd58 CR2: 000000000000000c ---[ end trace a38c07026a1afffd ]--- This is because when the extent cache is disabled, the extent_tree pointer in struct f2fs_inode_info is NULL, but f2fs_drop_largest_extent dereferences this NULL pointer directly without checking the state of the extent cache, so the oops occurs. Let's fix it by checking the state of the extent cache before accessing the pointer.
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/extent_cache.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index e6b245718ef0..997ac86f2a1d 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -155,7 +155,7 @@ static unsigned int __free_extent_tree(struct f2fs_sb_info *sbi, return count - et->count; } -void f2fs_drop_largest_extent(struct inode *inode, pgoff_t fofs) +static void __drop_largest_extent(struct inode *inode, pgoff_t fofs) { struct extent_info *largest = &F2FS_I(inode)->extent_tree->largest; @@ -163,6 +163,14 @@ void f2fs_drop_largest_extent(struct inode *inode, pgoff_t fofs) largest->len = 0; } +void f2fs_drop_largest_extent(struct inode *inode, pgoff_t fofs) +{ + if (!f2fs_may_extent_tree(inode)) + return; + + __drop_largest_extent(inode, fofs); +} + void f2fs_init_extent_tree(struct inode *inode, struct f2fs_extent *i_ext) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); @@ -412,7 +420,7 @@ unsigned int f2fs_update_extent_tree_range(struct inode *inode, dei.len = 0; /* we do not guarantee that the largest extent is cached all the time */ - f2fs_drop_largest_extent(inode, fofs); + __drop_largest_extent(inode, fofs); /* 1. 
lookup first extent node in range [fofs, fofs + len - 1] */ en = __lookup_extent_tree_ret(et, fofs, &prev_en, &next_en, -- cgit v1.2.3 From 01a5ad827a36e36f45e1fdb96903ea115f759865 Mon Sep 17 00:00:00 2001 From: Yunlei He Date: Mon, 31 Aug 2015 17:15:10 +0800 Subject: f2fs: upset segment_info repair upset segment_info like this: 276000|161 0|0 4|70 3|0 3|0 0|0 0|91 4|0 4|232 4|39 276104|0 4|0 4|1 4|0 4|0 4|280 4|0 4|42 4|262 4|38 276204|179 4|89 4|39 4|24 4|0 4|96 4|3 4|428 4|0 4|118 276304|112 4|97 4|0 4|0 4|0 4|68 4|0 4|0 4|86 4|138 276404|0 4|0 0|166 5|39 4|101 0|111 Signed-off-by: Yunlei He Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index cfe3f9579934..f79478115d37 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -693,7 +693,7 @@ static int segment_info_seq_show(struct seq_file *seq, void *offset) struct seg_entry *se = get_seg_entry(sbi, i); if ((i % 10) == 0) - seq_printf(seq, "%-5d", i); + seq_printf(seq, "%-10d", i); seq_printf(seq, "%d|%-3u", se->type, get_valid_blocks(sbi, i, 1)); if ((i % 10) == 9 || i == (total_segs - 1)) -- cgit v1.2.3