summaryrefslogtreecommitdiff
path: root/fs/ext4
diff options
context:
space:
mode:
Diffstat (limited to 'fs/ext4')
-rw-r--r--fs/ext4/balloc.c46
-rw-r--r--fs/ext4/ext4.h134
-rw-r--r--fs/ext4/extents.c49
-rw-r--r--fs/ext4/extents_status.c237
-rw-r--r--fs/ext4/extents_status.h14
-rw-r--r--fs/ext4/file.c110
-rw-r--r--fs/ext4/fsync.c40
-rw-r--r--fs/ext4/hash.c6
-rw-r--r--fs/ext4/ialloc.c12
-rw-r--r--fs/ext4/indirect.c8
-rw-r--r--fs/ext4/inline.c56
-rw-r--r--fs/ext4/inode.c172
-rw-r--r--fs/ext4/ioctl.c29
-rw-r--r--fs/ext4/mballoc.c730
-rw-r--r--fs/ext4/mballoc.h16
-rw-r--r--fs/ext4/migrate.c11
-rw-r--r--fs/ext4/mmp.c30
-rw-r--r--fs/ext4/namei.c70
-rw-r--r--fs/ext4/readpage.c2
-rw-r--r--fs/ext4/super.c168
-rw-r--r--fs/ext4/sysfs.c2
-rw-r--r--fs/ext4/xattr.c52
22 files changed, 1246 insertions, 748 deletions
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 094269488183..1f72f977c6db 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -305,6 +305,36 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb,
return desc;
}
+static ext4_fsblk_t ext4_valid_block_bitmap_padding(struct super_block *sb,
+ ext4_group_t block_group,
+ struct buffer_head *bh)
+{
+ ext4_grpblk_t next_zero_bit;
+ unsigned long bitmap_size = sb->s_blocksize * 8;
+ unsigned int offset = num_clusters_in_group(sb, block_group);
+
+ if (bitmap_size <= offset)
+ return 0;
+
+ next_zero_bit = ext4_find_next_zero_bit(bh->b_data, bitmap_size, offset);
+
+ return (next_zero_bit < bitmap_size ? next_zero_bit : 0);
+}
+
+struct ext4_group_info *ext4_get_group_info(struct super_block *sb,
+ ext4_group_t group)
+{
+ struct ext4_group_info **grp_info;
+ long indexv, indexh;
+
+ if (unlikely(group >= EXT4_SB(sb)->s_groups_count))
+ return NULL;
+ indexv = group >> (EXT4_DESC_PER_BLOCK_BITS(sb));
+ indexh = group & ((EXT4_DESC_PER_BLOCK(sb)) - 1);
+ grp_info = sbi_array_rcu_deref(EXT4_SB(sb), s_group_info, indexv);
+ return grp_info[indexh];
+}
+
/*
* Return the block number which was discovered to be invalid, or 0 if
* the block bitmap is valid.
@@ -379,7 +409,7 @@ static int ext4_validate_block_bitmap(struct super_block *sb,
if (buffer_verified(bh))
return 0;
- if (EXT4_MB_GRP_BBITMAP_CORRUPT(grp))
+ if (!grp || EXT4_MB_GRP_BBITMAP_CORRUPT(grp))
return -EFSCORRUPTED;
ext4_lock_group(sb, block_group);
@@ -402,6 +432,15 @@ static int ext4_validate_block_bitmap(struct super_block *sb,
EXT4_GROUP_INFO_BBITMAP_CORRUPT);
return -EFSCORRUPTED;
}
+ blk = ext4_valid_block_bitmap_padding(sb, block_group, bh);
+ if (unlikely(blk != 0)) {
+ ext4_unlock_group(sb, block_group);
+ ext4_error(sb, "bg %u: block %llu: padding at end of block bitmap is not set",
+ block_group, blk);
+ ext4_mark_group_bitmap_corrupted(sb, block_group,
+ EXT4_GROUP_INFO_BBITMAP_CORRUPT);
+ return -EFSCORRUPTED;
+ }
set_buffer_verified(bh);
verified:
ext4_unlock_group(sb, block_group);
@@ -845,7 +884,10 @@ static unsigned long ext4_bg_num_gdb_nometa(struct super_block *sb,
if (!ext4_bg_has_super(sb, group))
return 0;
- return EXT4_SB(sb)->s_gdb_count;
+ if (ext4_has_feature_meta_bg(sb))
+ return le32_to_cpu(EXT4_SB(sb)->s_es->s_first_meta_bg);
+ else
+ return EXT4_SB(sb)->s_gdb_count;
}
/**
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 18cb2680dc39..0a2d55faa095 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -128,6 +128,58 @@ enum SHIFT_DIRECTION {
};
/*
+ * For each criteria, mballoc has slightly different way of finding
+ * the required blocks nad usually, higher the criteria the slower the
+ * allocation. We start at lower criterias and keep falling back to
+ * higher ones if we are not able to find any blocks. Lower (earlier)
+ * criteria are faster.
+ */
+enum criteria {
+ /*
+ * Used when number of blocks needed is a power of 2. This
+ * doesn't trigger any disk IO except prefetch and is the
+ * fastest criteria.
+ */
+ CR_POWER2_ALIGNED,
+
+ /*
+ * Tries to lookup in-memory data structures to find the most
+ * suitable group that satisfies goal request. No disk IO
+ * except block prefetch.
+ */
+ CR_GOAL_LEN_FAST,
+
+ /*
+ * Same as CR_GOAL_LEN_FAST but is allowed to reduce the goal
+ * length to the best available length for faster allocation.
+ */
+ CR_BEST_AVAIL_LEN,
+
+ /*
+ * Reads each block group sequentially, performing disk IO if
+ * necessary, to find find_suitable block group. Tries to
+ * allocate goal length but might trim the request if nothing
+ * is found after enough tries.
+ */
+ CR_GOAL_LEN_SLOW,
+
+ /*
+ * Finds the first free set of blocks and allocates
+ * those. This is only used in rare cases when
+ * CR_GOAL_LEN_SLOW also fails to allocate anything.
+ */
+ CR_ANY_FREE,
+
+ /*
+ * Number of criterias defined.
+ */
+ EXT4_MB_NUM_CRS
+};
+
+/* criteria below which we use fast block scanning and avoid unnecessary IO */
+#define CR_FAST CR_GOAL_LEN_SLOW
+
+/*
* Flags used in mballoc's allocation_context flags field.
*
* Also used to show what's going on for debugging purposes when the
@@ -165,9 +217,12 @@ enum SHIFT_DIRECTION {
/* Do strict check for free blocks while retrying block allocation */
#define EXT4_MB_STRICT_CHECK 0x4000
/* Large fragment size list lookup succeeded at least once for cr = 0 */
-#define EXT4_MB_CR0_OPTIMIZED 0x8000
+#define EXT4_MB_CR_POWER2_ALIGNED_OPTIMIZED 0x8000
/* Avg fragment size rb tree lookup succeeded at least once for cr = 1 */
-#define EXT4_MB_CR1_OPTIMIZED 0x00010000
+#define EXT4_MB_CR_GOAL_LEN_FAST_OPTIMIZED 0x00010000
+/* Avg fragment size rb tree lookup succeeded at least once for cr = 1.5 */
+#define EXT4_MB_CR_BEST_AVAIL_LEN_OPTIMIZED 0x00020000
+
struct ext4_allocation_request {
/* target inode for block we're allocating */
struct inode *inode;
@@ -918,11 +973,13 @@ do { \
* where the second inode has larger inode number
* than the first
* I_DATA_SEM_QUOTA - Used for quota inodes only
+ * I_DATA_SEM_EA - Used for ea_inodes only
*/
enum {
I_DATA_SEM_NORMAL = 0,
I_DATA_SEM_OTHER,
I_DATA_SEM_QUOTA,
+ I_DATA_SEM_EA
};
@@ -1530,21 +1587,25 @@ struct ext4_sb_info {
unsigned long s_mb_last_start;
unsigned int s_mb_prefetch;
unsigned int s_mb_prefetch_limit;
+ unsigned int s_mb_best_avail_max_trim_order;
/* stats for buddy allocator */
atomic_t s_bal_reqs; /* number of reqs with len > 1 */
atomic_t s_bal_success; /* we found long enough chunks */
atomic_t s_bal_allocated; /* in blocks */
atomic_t s_bal_ex_scanned; /* total extents scanned */
+ atomic_t s_bal_cX_ex_scanned[EXT4_MB_NUM_CRS]; /* total extents scanned */
atomic_t s_bal_groups_scanned; /* number of groups scanned */
atomic_t s_bal_goals; /* goal hits */
+ atomic_t s_bal_len_goals; /* len goal hits */
atomic_t s_bal_breaks; /* too long searches */
atomic_t s_bal_2orders; /* 2^order hits */
- atomic_t s_bal_cr0_bad_suggestions;
- atomic_t s_bal_cr1_bad_suggestions;
- atomic64_t s_bal_cX_groups_considered[4];
- atomic64_t s_bal_cX_hits[4];
- atomic64_t s_bal_cX_failed[4]; /* cX loop didn't find blocks */
+ atomic_t s_bal_p2_aligned_bad_suggestions;
+ atomic_t s_bal_goal_fast_bad_suggestions;
+ atomic_t s_bal_best_avail_bad_suggestions;
+ atomic64_t s_bal_cX_groups_considered[EXT4_MB_NUM_CRS];
+ atomic64_t s_bal_cX_hits[EXT4_MB_NUM_CRS];
+ atomic64_t s_bal_cX_failed[EXT4_MB_NUM_CRS]; /* cX loop didn't find blocks */
atomic_t s_mb_buddies_generated; /* number of buddies generated */
atomic64_t s_mb_generation_time;
atomic_t s_mb_lost_chunks;
@@ -1684,6 +1745,30 @@ static inline struct ext4_inode_info *EXT4_I(struct inode *inode)
return container_of(inode, struct ext4_inode_info, vfs_inode);
}
+static inline int ext4_writepages_down_read(struct super_block *sb)
+{
+ percpu_down_read(&EXT4_SB(sb)->s_writepages_rwsem);
+ return memalloc_nofs_save();
+}
+
+static inline void ext4_writepages_up_read(struct super_block *sb, int ctx)
+{
+ memalloc_nofs_restore(ctx);
+ percpu_up_read(&EXT4_SB(sb)->s_writepages_rwsem);
+}
+
+static inline int ext4_writepages_down_write(struct super_block *sb)
+{
+ percpu_down_write(&EXT4_SB(sb)->s_writepages_rwsem);
+ return memalloc_nofs_save();
+}
+
+static inline void ext4_writepages_up_write(struct super_block *sb, int ctx)
+{
+ memalloc_nofs_restore(ctx);
+ percpu_up_write(&EXT4_SB(sb)->s_writepages_rwsem);
+}
+
static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
{
return ino == EXT4_ROOT_INO ||
@@ -2606,10 +2691,6 @@ extern void ext4_get_group_no_and_offset(struct super_block *sb,
extern ext4_group_t ext4_get_group_number(struct super_block *sb,
ext4_fsblk_t block);
-extern unsigned int ext4_block_group(struct super_block *sb,
- ext4_fsblk_t blocknr);
-extern ext4_grpblk_t ext4_block_group_offset(struct super_block *sb,
- ext4_fsblk_t blocknr);
extern int ext4_bg_has_super(struct super_block *sb, ext4_group_t group);
extern unsigned long ext4_bg_num_gdb(struct super_block *sb,
ext4_group_t group);
@@ -2625,6 +2706,8 @@ extern void ext4_check_blocks_bitmap(struct super_block *);
extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb,
ext4_group_t block_group,
struct buffer_head ** bh);
+extern struct ext4_group_info *ext4_get_group_info(struct super_block *sb,
+ ext4_group_t group);
extern int ext4_should_retry_alloc(struct super_block *sb, int *retries);
extern struct buffer_head *ext4_read_block_bitmap_nowait(struct super_block *sb,
@@ -2813,8 +2896,6 @@ int ext4_fc_record_regions(struct super_block *sb, int ino,
/* mballoc.c */
extern const struct seq_operations ext4_mb_seq_groups_ops;
extern const struct seq_operations ext4_mb_seq_structs_summary_ops;
-extern long ext4_mb_stats;
-extern long ext4_mb_max_to_scan;
extern int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset);
extern int ext4_mb_init(struct super_block *);
extern int ext4_mb_release(struct super_block *);
@@ -2875,7 +2956,8 @@ typedef enum {
EXT4_IGET_NORMAL = 0,
EXT4_IGET_SPECIAL = 0x0001, /* OK to iget a system inode */
EXT4_IGET_HANDLE = 0x0002, /* Inode # is from a handle */
- EXT4_IGET_BAD = 0x0004 /* Allow to iget a bad inode */
+ EXT4_IGET_BAD = 0x0004, /* Allow to iget a bad inode */
+ EXT4_IGET_EA_INODE = 0x0008 /* Inode should contain an EA value */
} ext4_iget_flags;
extern struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
@@ -2939,6 +3021,7 @@ int ext4_fileattr_set(struct mnt_idmap *idmap,
int ext4_fileattr_get(struct dentry *dentry, struct fileattr *fa);
extern void ext4_reset_inode_seed(struct inode *inode);
int ext4_update_overhead(struct super_block *sb, bool force);
+int ext4_force_shutdown(struct super_block *sb, u32 flags);
/* migrate.c */
extern int ext4_ext_migrate(struct inode *);
@@ -3232,19 +3315,6 @@ static inline void ext4_isize_set(struct ext4_inode *raw_inode, loff_t i_size)
raw_inode->i_size_high = cpu_to_le32(i_size >> 32);
}
-static inline
-struct ext4_group_info *ext4_get_group_info(struct super_block *sb,
- ext4_group_t group)
-{
- struct ext4_group_info **grp_info;
- long indexv, indexh;
- BUG_ON(group >= EXT4_SB(sb)->s_groups_count);
- indexv = group >> (EXT4_DESC_PER_BLOCK_BITS(sb));
- indexh = group & ((EXT4_DESC_PER_BLOCK(sb)) - 1);
- grp_info = sbi_array_rcu_deref(EXT4_SB(sb), s_group_info, indexv);
- return grp_info[indexh];
-}
-
/*
* Reading s_groups_count requires using smp_rmb() afterwards. See
* the locking protocol documented in the comments of ext4_group_add()
@@ -3464,14 +3534,8 @@ extern int ext4_try_to_write_inline_data(struct address_space *mapping,
struct inode *inode,
loff_t pos, unsigned len,
struct page **pagep);
-extern int ext4_write_inline_data_end(struct inode *inode,
- loff_t pos, unsigned len,
- unsigned copied,
- struct page *page);
-extern struct buffer_head *
-ext4_journalled_write_inline_data(struct inode *inode,
- unsigned len,
- struct page *page);
+int ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len,
+ unsigned copied, struct folio *folio);
extern int ext4_da_write_inline_data_begin(struct address_space *mapping,
struct inode *inode,
loff_t pos, unsigned len,
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 35703dce23a3..e4115d338f10 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -3123,7 +3123,7 @@ void ext4_ext_release(struct super_block *sb)
#endif
}
-static int ext4_zeroout_es(struct inode *inode, struct ext4_extent *ex)
+static void ext4_zeroout_es(struct inode *inode, struct ext4_extent *ex)
{
ext4_lblk_t ee_block;
ext4_fsblk_t ee_pblock;
@@ -3134,10 +3134,10 @@ static int ext4_zeroout_es(struct inode *inode, struct ext4_extent *ex)
ee_pblock = ext4_ext_pblock(ex);
if (ee_len == 0)
- return 0;
+ return;
- return ext4_es_insert_extent(inode, ee_block, ee_len, ee_pblock,
- EXTENT_STATUS_WRITTEN);
+ ext4_es_insert_extent(inode, ee_block, ee_len, ee_pblock,
+ EXTENT_STATUS_WRITTEN);
}
/* FIXME!! we need to try to merge to left or right after zero-out */
@@ -3287,7 +3287,7 @@ static int ext4_split_extent_at(handle_t *handle,
err = ext4_ext_dirty(handle, inode, path + path->p_depth);
if (!err)
/* update extent status tree */
- err = ext4_zeroout_es(inode, &zero_ex);
+ ext4_zeroout_es(inode, &zero_ex);
/* If we failed at this point, we don't know in which
* state the extent tree exactly is so don't try to fix
* length of the original extent as it may do even more
@@ -3640,9 +3640,8 @@ fallback:
out:
/* If we have gotten a failure, don't zero out status tree */
if (!err) {
- err = ext4_zeroout_es(inode, &zero_ex1);
- if (!err)
- err = ext4_zeroout_es(inode, &zero_ex2);
+ ext4_zeroout_es(inode, &zero_ex1);
+ ext4_zeroout_es(inode, &zero_ex2);
}
return err ? err : allocated;
}
@@ -4403,15 +4402,8 @@ int ext4_ext_truncate(handle_t *handle, struct inode *inode)
last_block = (inode->i_size + sb->s_blocksize - 1)
>> EXT4_BLOCK_SIZE_BITS(sb);
-retry:
- err = ext4_es_remove_extent(inode, last_block,
- EXT_MAX_BLOCKS - last_block);
- if (err == -ENOMEM) {
- memalloc_retry_wait(GFP_ATOMIC);
- goto retry;
- }
- if (err)
- return err;
+ ext4_es_remove_extent(inode, last_block, EXT_MAX_BLOCKS - last_block);
+
retry_remove_space:
err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1);
if (err == -ENOMEM) {
@@ -5363,13 +5355,7 @@ static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len)
down_write(&EXT4_I(inode)->i_data_sem);
ext4_discard_preallocations(inode, 0);
-
- ret = ext4_es_remove_extent(inode, punch_start,
- EXT_MAX_BLOCKS - punch_start);
- if (ret) {
- up_write(&EXT4_I(inode)->i_data_sem);
- goto out_stop;
- }
+ ext4_es_remove_extent(inode, punch_start, EXT_MAX_BLOCKS - punch_start);
ret = ext4_ext_remove_space(inode, punch_start, punch_stop - 1);
if (ret) {
@@ -5547,12 +5533,7 @@ static int ext4_insert_range(struct file *file, loff_t offset, loff_t len)
ext4_free_ext_path(path);
}
- ret = ext4_es_remove_extent(inode, offset_lblk,
- EXT_MAX_BLOCKS - offset_lblk);
- if (ret) {
- up_write(&EXT4_I(inode)->i_data_sem);
- goto out_stop;
- }
+ ext4_es_remove_extent(inode, offset_lblk, EXT_MAX_BLOCKS - offset_lblk);
/*
* if offset_lblk lies in a hole which is at start of file, use
@@ -5610,12 +5591,8 @@ ext4_swap_extents(handle_t *handle, struct inode *inode1,
BUG_ON(!inode_is_locked(inode1));
BUG_ON(!inode_is_locked(inode2));
- *erp = ext4_es_remove_extent(inode1, lblk1, count);
- if (unlikely(*erp))
- return 0;
- *erp = ext4_es_remove_extent(inode2, lblk2, count);
- if (unlikely(*erp))
- return 0;
+ ext4_es_remove_extent(inode1, lblk1, count);
+ ext4_es_remove_extent(inode2, lblk2, count);
while (count) {
struct ext4_extent *ex1, *ex2, tmp_ex;
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index 7bc221038c6c..9b5b8951afb4 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -144,9 +144,11 @@
static struct kmem_cache *ext4_es_cachep;
static struct kmem_cache *ext4_pending_cachep;
-static int __es_insert_extent(struct inode *inode, struct extent_status *newes);
+static int __es_insert_extent(struct inode *inode, struct extent_status *newes,
+ struct extent_status *prealloc);
static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
- ext4_lblk_t end, int *reserved);
+ ext4_lblk_t end, int *reserved,
+ struct extent_status *prealloc);
static int es_reclaim_extents(struct ext4_inode_info *ei, int *nr_to_scan);
static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
struct ext4_inode_info *locked_ei);
@@ -267,14 +269,12 @@ static void __es_find_extent_range(struct inode *inode,
/* see if the extent has been cached */
es->es_lblk = es->es_len = es->es_pblk = 0;
- if (tree->cache_es) {
- es1 = tree->cache_es;
- if (in_range(lblk, es1->es_lblk, es1->es_len)) {
- es_debug("%u cached by [%u/%u) %llu %x\n",
- lblk, es1->es_lblk, es1->es_len,
- ext4_es_pblock(es1), ext4_es_status(es1));
- goto out;
- }
+ es1 = READ_ONCE(tree->cache_es);
+ if (es1 && in_range(lblk, es1->es_lblk, es1->es_len)) {
+ es_debug("%u cached by [%u/%u) %llu %x\n",
+ lblk, es1->es_lblk, es1->es_len,
+ ext4_es_pblock(es1), ext4_es_status(es1));
+ goto out;
}
es1 = __es_tree_search(&tree->root, lblk);
@@ -293,7 +293,7 @@ out:
}
if (es1 && matching_fn(es1)) {
- tree->cache_es = es1;
+ WRITE_ONCE(tree->cache_es, es1);
es->es_lblk = es1->es_lblk;
es->es_len = es1->es_len;
es->es_pblk = es1->es_pblk;
@@ -448,22 +448,36 @@ static void ext4_es_list_del(struct inode *inode)
spin_unlock(&sbi->s_es_lock);
}
-static struct extent_status *
-ext4_es_alloc_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len,
- ext4_fsblk_t pblk)
+/*
+ * Returns true if we cannot fail to allocate memory for this extent_status
+ * entry and cannot reclaim it until its status changes.
+ */
+static inline bool ext4_es_must_keep(struct extent_status *es)
+{
+ /* fiemap, bigalloc, and seek_data/hole need to use it. */
+ if (ext4_es_is_delayed(es))
+ return true;
+
+ return false;
+}
+
+static inline struct extent_status *__es_alloc_extent(bool nofail)
+{
+ if (!nofail)
+ return kmem_cache_alloc(ext4_es_cachep, GFP_ATOMIC);
+
+ return kmem_cache_zalloc(ext4_es_cachep, GFP_KERNEL | __GFP_NOFAIL);
+}
+
+static void ext4_es_init_extent(struct inode *inode, struct extent_status *es,
+ ext4_lblk_t lblk, ext4_lblk_t len, ext4_fsblk_t pblk)
{
- struct extent_status *es;
- es = kmem_cache_alloc(ext4_es_cachep, GFP_ATOMIC);
- if (es == NULL)
- return NULL;
es->es_lblk = lblk;
es->es_len = len;
es->es_pblk = pblk;
- /*
- * We don't count delayed extent because we never try to reclaim them
- */
- if (!ext4_es_is_delayed(es)) {
+ /* We never try to reclaim a must kept extent, so we don't count it. */
+ if (!ext4_es_must_keep(es)) {
if (!EXT4_I(inode)->i_es_shk_nr++)
ext4_es_list_add(inode);
percpu_counter_inc(&EXT4_SB(inode->i_sb)->
@@ -472,8 +486,11 @@ ext4_es_alloc_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len,
EXT4_I(inode)->i_es_all_nr++;
percpu_counter_inc(&EXT4_SB(inode->i_sb)->s_es_stats.es_stats_all_cnt);
+}
- return es;
+static inline void __es_free_extent(struct extent_status *es)
+{
+ kmem_cache_free(ext4_es_cachep, es);
}
static void ext4_es_free_extent(struct inode *inode, struct extent_status *es)
@@ -481,8 +498,8 @@ static void ext4_es_free_extent(struct inode *inode, struct extent_status *es)
EXT4_I(inode)->i_es_all_nr--;
percpu_counter_dec(&EXT4_SB(inode->i_sb)->s_es_stats.es_stats_all_cnt);
- /* Decrease the shrink counter when this es is not delayed */
- if (!ext4_es_is_delayed(es)) {
+ /* Decrease the shrink counter when we can reclaim the extent. */
+ if (!ext4_es_must_keep(es)) {
BUG_ON(EXT4_I(inode)->i_es_shk_nr == 0);
if (!--EXT4_I(inode)->i_es_shk_nr)
ext4_es_list_del(inode);
@@ -490,7 +507,7 @@ static void ext4_es_free_extent(struct inode *inode, struct extent_status *es)
s_es_stats.es_stats_shk_cnt);
}
- kmem_cache_free(ext4_es_cachep, es);
+ __es_free_extent(es);
}
/*
@@ -751,7 +768,8 @@ static inline void ext4_es_insert_extent_check(struct inode *inode,
}
#endif
-static int __es_insert_extent(struct inode *inode, struct extent_status *newes)
+static int __es_insert_extent(struct inode *inode, struct extent_status *newes,
+ struct extent_status *prealloc)
{
struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
struct rb_node **p = &tree->root.rb_node;
@@ -791,10 +809,15 @@ static int __es_insert_extent(struct inode *inode, struct extent_status *newes)
}
}
- es = ext4_es_alloc_extent(inode, newes->es_lblk, newes->es_len,
- newes->es_pblk);
+ if (prealloc)
+ es = prealloc;
+ else
+ es = __es_alloc_extent(false);
if (!es)
return -ENOMEM;
+ ext4_es_init_extent(inode, es, newes->es_lblk, newes->es_len,
+ newes->es_pblk);
+
rb_link_node(&es->rb_node, parent, p);
rb_insert_color(&es->rb_node, &tree->root);
@@ -806,26 +829,27 @@ out:
/*
* ext4_es_insert_extent() adds information to an inode's extent
* status tree.
- *
- * Return 0 on success, error code on failure.
*/
-int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
- ext4_lblk_t len, ext4_fsblk_t pblk,
- unsigned int status)
+void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
+ ext4_lblk_t len, ext4_fsblk_t pblk,
+ unsigned int status)
{
struct extent_status newes;
ext4_lblk_t end = lblk + len - 1;
- int err = 0;
+ int err1 = 0;
+ int err2 = 0;
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ struct extent_status *es1 = NULL;
+ struct extent_status *es2 = NULL;
if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
- return 0;
+ return;
es_debug("add [%u/%u) %llu %x to extent status tree of inode %lu\n",
lblk, len, pblk, status, inode->i_ino);
if (!len)
- return 0;
+ return;
BUG_ON(end < lblk);
@@ -844,29 +868,40 @@ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
ext4_es_insert_extent_check(inode, &newes);
+retry:
+ if (err1 && !es1)
+ es1 = __es_alloc_extent(true);
+ if ((err1 || err2) && !es2)
+ es2 = __es_alloc_extent(true);
write_lock(&EXT4_I(inode)->i_es_lock);
- err = __es_remove_extent(inode, lblk, end, NULL);
- if (err != 0)
+
+ err1 = __es_remove_extent(inode, lblk, end, NULL, es1);
+ if (err1 != 0)
+ goto error;
+
+ err2 = __es_insert_extent(inode, &newes, es2);
+ if (err2 == -ENOMEM && !ext4_es_must_keep(&newes))
+ err2 = 0;
+ if (err2 != 0)
goto error;
-retry:
- err = __es_insert_extent(inode, &newes);
- if (err == -ENOMEM && __es_shrink(EXT4_SB(inode->i_sb),
- 128, EXT4_I(inode)))
- goto retry;
- if (err == -ENOMEM && !ext4_es_is_delayed(&newes))
- err = 0;
if (sbi->s_cluster_ratio > 1 && test_opt(inode->i_sb, DELALLOC) &&
(status & EXTENT_STATUS_WRITTEN ||
status & EXTENT_STATUS_UNWRITTEN))
__revise_pending(inode, lblk, len);
+ /* es is pre-allocated but not used, free it. */
+ if (es1 && !es1->es_len)
+ __es_free_extent(es1);
+ if (es2 && !es2->es_len)
+ __es_free_extent(es2);
error:
write_unlock(&EXT4_I(inode)->i_es_lock);
+ if (err1 || err2)
+ goto retry;
ext4_es_print_tree(inode);
-
- return err;
+ return;
}
/*
@@ -899,7 +934,7 @@ void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk,
es = __es_tree_search(&EXT4_I(inode)->i_es_tree.root, lblk);
if (!es || es->es_lblk > end)
- __es_insert_extent(inode, &newes);
+ __es_insert_extent(inode, &newes, NULL);
write_unlock(&EXT4_I(inode)->i_es_lock);
}
@@ -931,14 +966,12 @@ int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk,
/* find extent in cache firstly */
es->es_lblk = es->es_len = es->es_pblk = 0;
- if (tree->cache_es) {
- es1 = tree->cache_es;
- if (in_range(lblk, es1->es_lblk, es1->es_len)) {
- es_debug("%u cached by [%u/%u)\n",
- lblk, es1->es_lblk, es1->es_len);
- found = 1;
- goto out;
- }
+ es1 = READ_ONCE(tree->cache_es);
+ if (es1 && in_range(lblk, es1->es_lblk, es1->es_len)) {
+ es_debug("%u cached by [%u/%u)\n",
+ lblk, es1->es_lblk, es1->es_len);
+ found = 1;
+ goto out;
}
node = tree->root.rb_node;
@@ -1291,6 +1324,7 @@ static unsigned int get_rsvd(struct inode *inode, ext4_lblk_t end,
* @lblk - first block in range
* @end - last block in range
* @reserved - number of cluster reservations released
+ * @prealloc - pre-allocated es to avoid memory allocation failures
*
* If @reserved is not NULL and delayed allocation is enabled, counts
* block/cluster reservations freed by removing range and if bigalloc
@@ -1298,7 +1332,8 @@ static unsigned int get_rsvd(struct inode *inode, ext4_lblk_t end,
* error code on failure.
*/
static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
- ext4_lblk_t end, int *reserved)
+ ext4_lblk_t end, int *reserved,
+ struct extent_status *prealloc)
{
struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
struct rb_node *node;
@@ -1306,14 +1341,12 @@ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
struct extent_status orig_es;
ext4_lblk_t len1, len2;
ext4_fsblk_t block;
- int err;
+ int err = 0;
bool count_reserved = true;
struct rsvd_count rc;
if (reserved == NULL || !test_opt(inode->i_sb, DELALLOC))
count_reserved = false;
-retry:
- err = 0;
es = __es_tree_search(&tree->root, lblk);
if (!es)
@@ -1347,14 +1380,13 @@ retry:
orig_es.es_len - len2;
ext4_es_store_pblock_status(&newes, block,
ext4_es_status(&orig_es));
- err = __es_insert_extent(inode, &newes);
+ err = __es_insert_extent(inode, &newes, prealloc);
if (err) {
+ if (!ext4_es_must_keep(&newes))
+ return 0;
+
es->es_lblk = orig_es.es_lblk;
es->es_len = orig_es.es_len;
- if ((err == -ENOMEM) &&
- __es_shrink(EXT4_SB(inode->i_sb),
- 128, EXT4_I(inode)))
- goto retry;
goto out;
}
} else {
@@ -1426,39 +1458,48 @@ out:
* @len - number of blocks to remove
*
* Reduces block/cluster reservation count and for bigalloc cancels pending
- * reservations as needed. Returns 0 on success, error code on failure.
+ * reservations as needed.
*/
-int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
- ext4_lblk_t len)
+void ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
+ ext4_lblk_t len)
{
ext4_lblk_t end;
int err = 0;
int reserved = 0;
+ struct extent_status *es = NULL;
if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
- return 0;
+ return;
trace_ext4_es_remove_extent(inode, lblk, len);
es_debug("remove [%u/%u) from extent status tree of inode %lu\n",
lblk, len, inode->i_ino);
if (!len)
- return err;
+ return;
end = lblk + len - 1;
BUG_ON(end < lblk);
+retry:
+ if (err && !es)
+ es = __es_alloc_extent(true);
/*
* ext4_clear_inode() depends on us taking i_es_lock unconditionally
* so that we are sure __es_shrink() is done with the inode before it
* is reclaimed.
*/
write_lock(&EXT4_I(inode)->i_es_lock);
- err = __es_remove_extent(inode, lblk, end, &reserved);
+ err = __es_remove_extent(inode, lblk, end, &reserved, es);
+ if (es && !es->es_len)
+ __es_free_extent(es);
write_unlock(&EXT4_I(inode)->i_es_lock);
+ if (err)
+ goto retry;
+
ext4_es_print_tree(inode);
ext4_da_release_space(inode, reserved);
- return err;
+ return;
}
static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
@@ -1706,11 +1747,8 @@ static int es_do_reclaim_extents(struct ext4_inode_info *ei, ext4_lblk_t end,
(*nr_to_scan)--;
node = rb_next(&es->rb_node);
- /*
- * We can't reclaim delayed extent from status tree because
- * fiemap, bigallic, and seek_data/hole need to use it.
- */
- if (ext4_es_is_delayed(es))
+
+ if (ext4_es_must_keep(es))
goto next;
if (ext4_es_is_referenced(es)) {
ext4_es_clear_referenced(es);
@@ -1774,7 +1812,7 @@ void ext4_clear_inode_es(struct inode *inode)
while (node) {
es = rb_entry(node, struct extent_status, rb_node);
node = rb_next(node);
- if (!ext4_es_is_delayed(es)) {
+ if (!ext4_es_must_keep(es)) {
rb_erase(&es->rb_node, &tree->root);
ext4_es_free_extent(inode, es);
}
@@ -1976,17 +2014,18 @@ bool ext4_is_pending(struct inode *inode, ext4_lblk_t lblk)
* @lblk - logical block to be added
* @allocated - indicates whether a physical cluster has been allocated for
* the logical cluster that contains the block
- *
- * Returns 0 on success, negative error code on failure.
*/
-int ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk,
- bool allocated)
+void ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk,
+ bool allocated)
{
struct extent_status newes;
- int err = 0;
+ int err1 = 0;
+ int err2 = 0;
+ struct extent_status *es1 = NULL;
+ struct extent_status *es2 = NULL;
if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
- return 0;
+ return;
es_debug("add [%u/1) delayed to extent status tree of inode %lu\n",
lblk, inode->i_ino);
@@ -1998,29 +2037,37 @@ int ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk,
ext4_es_insert_extent_check(inode, &newes);
+retry:
+ if (err1 && !es1)
+ es1 = __es_alloc_extent(true);
+ if ((err1 || err2) && !es2)
+ es2 = __es_alloc_extent(true);
write_lock(&EXT4_I(inode)->i_es_lock);
- err = __es_remove_extent(inode, lblk, lblk, NULL);
- if (err != 0)
+ err1 = __es_remove_extent(inode, lblk, lblk, NULL, es1);
+ if (err1 != 0)
goto error;
-retry:
- err = __es_insert_extent(inode, &newes);
- if (err == -ENOMEM && __es_shrink(EXT4_SB(inode->i_sb),
- 128, EXT4_I(inode)))
- goto retry;
- if (err != 0)
+
+ err2 = __es_insert_extent(inode, &newes, es2);
+ if (err2 != 0)
goto error;
if (allocated)
__insert_pending(inode, lblk);
+ /* es is pre-allocated but not used, free it. */
+ if (es1 && !es1->es_len)
+ __es_free_extent(es1);
+ if (es2 && !es2->es_len)
+ __es_free_extent(es2);
error:
write_unlock(&EXT4_I(inode)->i_es_lock);
+ if (err1 || err2)
+ goto retry;
ext4_es_print_tree(inode);
ext4_print_pending_tree(inode);
-
- return err;
+ return;
}
/*
diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h
index 4ec30a798260..d9847a4a25db 100644
--- a/fs/ext4/extents_status.h
+++ b/fs/ext4/extents_status.h
@@ -127,14 +127,14 @@ extern int __init ext4_init_es(void);
extern void ext4_exit_es(void);
extern void ext4_es_init_tree(struct ext4_es_tree *tree);
-extern int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
- ext4_lblk_t len, ext4_fsblk_t pblk,
- unsigned int status);
+extern void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
+ ext4_lblk_t len, ext4_fsblk_t pblk,
+ unsigned int status);
extern void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk,
ext4_lblk_t len, ext4_fsblk_t pblk,
unsigned int status);
-extern int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
- ext4_lblk_t len);
+extern void ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
+ ext4_lblk_t len);
extern void ext4_es_find_extent_range(struct inode *inode,
int (*match_fn)(struct extent_status *es),
ext4_lblk_t lblk, ext4_lblk_t end,
@@ -249,8 +249,8 @@ extern void ext4_exit_pending(void);
extern void ext4_init_pending_tree(struct ext4_pending_tree *tree);
extern void ext4_remove_pending(struct inode *inode, ext4_lblk_t lblk);
extern bool ext4_is_pending(struct inode *inode, ext4_lblk_t lblk);
-extern int ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk,
- bool allocated);
+extern void ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk,
+ bool allocated);
extern unsigned int ext4_es_delayed_clu(struct inode *inode, ext4_lblk_t lblk,
ext4_lblk_t len);
extern void ext4_clear_inode_es(struct inode *inode);
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index d101b3b0c7da..c457c8517f0f 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -147,6 +147,17 @@ static ssize_t ext4_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
return generic_file_read_iter(iocb, to);
}
+static ssize_t ext4_file_splice_read(struct file *in, loff_t *ppos,
+ struct pipe_inode_info *pipe,
+ size_t len, unsigned int flags)
+{
+ struct inode *inode = file_inode(in);
+
+ if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
+ return -EIO;
+ return filemap_splice_read(in, ppos, pipe, len, flags);
+}
+
/*
* Called when an inode is released. Note that this is different
* from ext4_file_open: open gets called at every open, but release
@@ -285,18 +296,13 @@ static ssize_t ext4_buffered_write_iter(struct kiocb *iocb,
if (ret <= 0)
goto out;
- current->backing_dev_info = inode_to_bdi(inode);
ret = generic_perform_write(iocb, from);
- current->backing_dev_info = NULL;
out:
inode_unlock(inode);
- if (likely(ret > 0)) {
- iocb->ki_pos += ret;
- ret = generic_write_sync(iocb, ret);
- }
-
- return ret;
+ if (unlikely(ret <= 0))
+ return ret;
+ return generic_write_sync(iocb, ret);
}
static ssize_t ext4_handle_inode_extension(struct inode *inode, loff_t offset,
@@ -444,13 +450,14 @@ static const struct iomap_dio_ops ext4_dio_write_ops = {
*/
static ssize_t ext4_dio_write_checks(struct kiocb *iocb, struct iov_iter *from,
bool *ilock_shared, bool *extend,
- bool *unwritten)
+ bool *unwritten, int *dio_flags)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file_inode(file);
loff_t offset;
size_t count;
ssize_t ret;
+ bool overwrite, unaligned_io;
restart:
ret = ext4_generic_write_checks(iocb, from);
@@ -459,16 +466,20 @@ restart:
offset = iocb->ki_pos;
count = ret;
- if (ext4_extending_io(inode, offset, count))
- *extend = true;
+
+ unaligned_io = ext4_unaligned_io(inode, from, offset);
+ *extend = ext4_extending_io(inode, offset, count);
+ overwrite = ext4_overwrite_io(inode, offset, count, unwritten);
+
/*
- * Determine whether the IO operation will overwrite allocated
- * and initialized blocks.
- * We need exclusive i_rwsem for changing security info
- * in file_modified().
+ * Determine whether we need to upgrade to an exclusive lock. This is
+ * required to change security info in file_modified(), for extending
+ * I/O, any form of non-overwrite I/O, and unaligned I/O to unwritten
+ * extents (as partial block zeroing may be required).
*/
- if (*ilock_shared && (!IS_NOSEC(inode) || *extend ||
- !ext4_overwrite_io(inode, offset, count, unwritten))) {
+ if (*ilock_shared &&
+ ((!IS_NOSEC(inode) || *extend || !overwrite ||
+ (unaligned_io && *unwritten)))) {
if (iocb->ki_flags & IOCB_NOWAIT) {
ret = -EAGAIN;
goto out;
@@ -479,6 +490,32 @@ restart:
goto restart;
}
+ /*
+ * Now that locking is settled, determine dio flags and exclusivity
+ * requirements. Unaligned writes are allowed under shared lock so long
+ * as they are pure overwrites. Set the iomap overwrite only flag as an
+ * added precaution in this case. Even though this is unnecessary, we
+ * can detect and warn on unexpected -EAGAIN if an unsafe unaligned
+ * write is ever submitted.
+ *
+ * Otherwise, concurrent unaligned writes risk data corruption due to
+ * partial block zeroing in the dio layer, and so the I/O must occur
+ * exclusively. The inode lock is already held exclusive if the write is
+ * non-overwrite or extending, so drain all outstanding dio and set the
+ * force wait dio flag.
+ */
+ if (*ilock_shared && unaligned_io) {
+ *dio_flags = IOMAP_DIO_OVERWRITE_ONLY;
+ } else if (!*ilock_shared && (unaligned_io || *extend)) {
+ if (iocb->ki_flags & IOCB_NOWAIT) {
+ ret = -EAGAIN;
+ goto out;
+ }
+ if (unaligned_io && (!overwrite || *unwritten))
+ inode_dio_wait(inode);
+ *dio_flags = IOMAP_DIO_FORCE_WAIT;
+ }
+
ret = file_modified(file);
if (ret < 0)
goto out;
@@ -500,18 +537,11 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
loff_t offset = iocb->ki_pos;
size_t count = iov_iter_count(from);
const struct iomap_ops *iomap_ops = &ext4_iomap_ops;
- bool extend = false, unaligned_io = false, unwritten = false;
+ bool extend = false, unwritten = false;
bool ilock_shared = true;
+ int dio_flags = 0;
/*
- * We initially start with shared inode lock unless it is
- * unaligned IO which needs exclusive lock anyways.
- */
- if (ext4_unaligned_io(inode, from, offset)) {
- unaligned_io = true;
- ilock_shared = false;
- }
- /*
* Quick check here without any i_rwsem lock to see if it is extending
* IO. A more reliable check is done in ext4_dio_write_checks() with
* proper locking in place.
@@ -543,16 +573,11 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
return ext4_buffered_write_iter(iocb, from);
}
- ret = ext4_dio_write_checks(iocb, from,
- &ilock_shared, &extend, &unwritten);
+ ret = ext4_dio_write_checks(iocb, from, &ilock_shared, &extend,
+ &unwritten, &dio_flags);
if (ret <= 0)
return ret;
- /* if we're going to block and IOCB_NOWAIT is set, return -EAGAIN */
- if ((iocb->ki_flags & IOCB_NOWAIT) && (unaligned_io || extend)) {
- ret = -EAGAIN;
- goto out;
- }
/*
* Make sure inline data cannot be created anymore since we are going
* to allocate blocks for DIO. We know the inode does not have any
@@ -563,19 +588,6 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
offset = iocb->ki_pos;
count = ret;
- /*
- * Unaligned direct IO must be serialized among each other as zeroing
- * of partial blocks of two competing unaligned IOs can result in data
- * corruption.
- *
- * So we make sure we don't allow any unaligned IO in flight.
- * For IOs where we need not wait (like unaligned non-AIO DIO),
- * below inode_dio_wait() may anyway become a no-op, since we start
- * with exclusive lock.
- */
- if (unaligned_io)
- inode_dio_wait(inode);
-
if (extend) {
handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
if (IS_ERR(handle)) {
@@ -595,8 +607,8 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
if (ilock_shared && !unwritten)
iomap_ops = &ext4_iomap_overwrite_ops;
ret = iomap_dio_rw(iocb, from, iomap_ops, &ext4_dio_write_ops,
- (unaligned_io || extend) ? IOMAP_DIO_FORCE_WAIT : 0,
- NULL, 0);
+ dio_flags, NULL, 0);
+ WARN_ON_ONCE(ret == -EAGAIN && !(iocb->ki_flags & IOCB_NOWAIT));
if (ret == -ENOTBLK)
ret = 0;
@@ -957,7 +969,7 @@ const struct file_operations ext4_file_operations = {
.release = ext4_release_file,
.fsync = ext4_sync_file,
.get_unmapped_area = thp_get_unmapped_area,
- .splice_read = generic_file_splice_read,
+ .splice_read = ext4_file_splice_read,
.splice_write = iter_file_splice_write,
.fallocate = ext4_fallocate,
};
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index f65fdb27ce14..0c56f3a011a1 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -28,6 +28,7 @@
#include <linux/sched.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
+#include <linux/buffer_head.h>
#include "ext4.h"
#include "ext4_jbd2.h"
@@ -78,21 +79,13 @@ static int ext4_sync_parent(struct inode *inode)
return ret;
}
-static int ext4_fsync_nojournal(struct inode *inode, bool datasync,
- bool *needs_barrier)
+static int ext4_fsync_nojournal(struct file *file, loff_t start, loff_t end,
+ int datasync, bool *needs_barrier)
{
- int ret, err;
-
- ret = sync_mapping_buffers(inode->i_mapping);
- if (!(inode->i_state & I_DIRTY_ALL))
- return ret;
- if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
- return ret;
-
- err = sync_inode_metadata(inode, 1);
- if (!ret)
- ret = err;
+ struct inode *inode = file->f_inode;
+ int ret;
+ ret = generic_buffers_fsync_noflush(file, start, end, datasync);
if (!ret)
ret = ext4_sync_parent(inode);
if (test_opt(inode->i_sb, BARRIER))
@@ -108,6 +101,13 @@ static int ext4_fsync_journal(struct inode *inode, bool datasync,
journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
tid_t commit_tid = datasync ? ei->i_datasync_tid : ei->i_sync_tid;
+ /*
+ * Fastcommit does not really support fsync on directories or other
+ * special files. Force a full commit.
+ */
+ if (!S_ISREG(inode->i_mode))
+ return ext4_force_commit(inode->i_sb);
+
if (journal->j_flags & JBD2_BARRIER &&
!jbd2_trans_will_send_data_barrier(journal, commit_tid))
*needs_barrier = true;
@@ -148,6 +148,14 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
goto out;
}
+ if (!sbi->s_journal) {
+ ret = ext4_fsync_nojournal(file, start, end, datasync,
+ &needs_barrier);
+ if (needs_barrier)
+ goto issue_flush;
+ goto out;
+ }
+
ret = file_write_and_wait_range(file, start, end);
if (ret)
goto out;
@@ -157,11 +165,9 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
* Metadata is in the journal, we wait for proper transaction to
* commit here.
*/
- if (!sbi->s_journal)
- ret = ext4_fsync_nojournal(inode, datasync, &needs_barrier);
- else
- ret = ext4_fsync_journal(inode, datasync, &needs_barrier);
+ ret = ext4_fsync_journal(inode, datasync, &needs_barrier);
+issue_flush:
if (needs_barrier) {
err = blkdev_issue_flush(inode->i_sb->s_bdev);
if (!ret)
diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c
index 147b5241dd94..46c3423ddfa1 100644
--- a/fs/ext4/hash.c
+++ b/fs/ext4/hash.c
@@ -277,7 +277,11 @@ static int __ext4fs_dirhash(const struct inode *dir, const char *name, int len,
}
default:
hinfo->hash = 0;
- return -1;
+ hinfo->minor_hash = 0;
+ ext4_warning(dir->i_sb,
+ "invalid/unsupported hash tree version %u",
+ hinfo->hash_version);
+ return -EINVAL;
}
hash = hash & ~1;
if (hash == (EXT4_HTREE_EOF_32BIT << 1))
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 787ab89c2c26..754f961cd9fd 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -91,7 +91,7 @@ static int ext4_validate_inode_bitmap(struct super_block *sb,
if (buffer_verified(bh))
return 0;
- if (EXT4_MB_GRP_IBITMAP_CORRUPT(grp))
+ if (!grp || EXT4_MB_GRP_IBITMAP_CORRUPT(grp))
return -EFSCORRUPTED;
ext4_lock_group(sb, block_group);
@@ -293,7 +293,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
}
if (!(sbi->s_mount_state & EXT4_FC_REPLAY)) {
grp = ext4_get_group_info(sb, block_group);
- if (unlikely(EXT4_MB_GRP_IBITMAP_CORRUPT(grp))) {
+ if (!grp || unlikely(EXT4_MB_GRP_IBITMAP_CORRUPT(grp))) {
fatal = -EFSCORRUPTED;
goto error_return;
}
@@ -1046,7 +1046,7 @@ got_group:
* Skip groups with already-known suspicious inode
* tables
*/
- if (EXT4_MB_GRP_IBITMAP_CORRUPT(grp))
+ if (!grp || EXT4_MB_GRP_IBITMAP_CORRUPT(grp))
goto next_group;
}
@@ -1183,6 +1183,10 @@ got:
if (!(sbi->s_mount_state & EXT4_FC_REPLAY)) {
grp = ext4_get_group_info(sb, group);
+ if (!grp) {
+ err = -EFSCORRUPTED;
+ goto out;
+ }
down_read(&grp->alloc_sem); /*
* protect vs itable
* lazyinit
@@ -1526,7 +1530,7 @@ int ext4_init_inode_table(struct super_block *sb, ext4_group_t group,
}
gdp = ext4_get_group_desc(sb, group, &group_desc_bh);
- if (!gdp)
+ if (!gdp || !grp)
goto out;
/*
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index c68bebe7ff4b..a9f3716119d3 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -651,6 +651,14 @@ int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
ext4_update_inode_fsync_trans(handle, inode, 1);
count = ar.len;
+
+ /*
+ * Update reserved blocks/metadata blocks after successful block
+ * allocation which had been deferred till now.
+ */
+ if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
+ ext4_da_update_reserve_space(inode, count, 1);
+
got_it:
map->m_flags |= EXT4_MAP_MAPPED;
map->m_pblk = le32_to_cpu(chain[depth-1].key);
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index 859bc4e2c9b0..a4b7e4bc32d4 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -34,6 +34,7 @@ static int get_max_inline_xattr_value_size(struct inode *inode,
struct ext4_xattr_ibody_header *header;
struct ext4_xattr_entry *entry;
struct ext4_inode *raw_inode;
+ void *end;
int free, min_offs;
if (!EXT4_INODE_HAS_XATTR_SPACE(inode))
@@ -57,14 +58,23 @@ static int get_max_inline_xattr_value_size(struct inode *inode,
raw_inode = ext4_raw_inode(iloc);
header = IHDR(inode, raw_inode);
entry = IFIRST(header);
+ end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size;
/* Compute min_offs. */
- for (; !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) {
+ while (!IS_LAST_ENTRY(entry)) {
+ void *next = EXT4_XATTR_NEXT(entry);
+
+ if (next >= end) {
+ EXT4_ERROR_INODE(inode,
+ "corrupt xattr in inline inode");
+ return 0;
+ }
if (!entry->e_value_inum && entry->e_value_size) {
size_t offs = le16_to_cpu(entry->e_value_offs);
if (offs < min_offs)
min_offs = offs;
}
+ entry = next;
}
free = min_offs -
((void *)entry - (void *)IFIRST(header)) - sizeof(__u32);
@@ -350,7 +360,7 @@ static int ext4_update_inline_data(handle_t *handle, struct inode *inode,
error = ext4_xattr_ibody_get(inode, i.name_index, i.name,
value, len);
- if (error == -ENODATA)
+ if (error < 0)
goto out;
BUFFER_TRACE(is.iloc.bh, "get_write_access");
@@ -731,9 +741,8 @@ convert:
}
int ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len,
- unsigned copied, struct page *page)
+ unsigned copied, struct folio *folio)
{
- struct folio *folio = page_folio(page);
handle_t *handle = ext4_journal_current_handle();
int no_expand;
void *kaddr;
@@ -813,30 +822,6 @@ out:
return ret ? ret : copied;
}
-struct buffer_head *
-ext4_journalled_write_inline_data(struct inode *inode,
- unsigned len,
- struct page *page)
-{
- int ret, no_expand;
- void *kaddr;
- struct ext4_iloc iloc;
-
- ret = ext4_get_inode_loc(inode, &iloc);
- if (ret) {
- ext4_std_error(inode->i_sb, ret);
- return NULL;
- }
-
- ext4_write_lock_xattr(inode, &no_expand);
- kaddr = kmap_atomic(page);
- ext4_write_inline_data(inode, &iloc, kaddr, 0, len);
- kunmap_atomic(kaddr);
- ext4_write_unlock_xattr(inode, &no_expand);
-
- return iloc.bh;
-}
-
/*
* Try to make the page cache and handle ready for the inline data case.
* We can call this function in 2 cases:
@@ -1175,6 +1160,7 @@ static int ext4_finish_convert_inline_dir(handle_t *handle,
ext4_initialize_dirent_tail(dir_block,
inode->i_sb->s_blocksize);
set_buffer_uptodate(dir_block);
+ unlock_buffer(dir_block);
err = ext4_handle_dirty_dirblock(handle, inode, dir_block);
if (err)
return err;
@@ -1249,6 +1235,7 @@ static int ext4_convert_inline_data_nolock(handle_t *handle,
if (!S_ISDIR(inode->i_mode)) {
memcpy(data_bh->b_data, buf, inline_size);
set_buffer_uptodate(data_bh);
+ unlock_buffer(data_bh);
error = ext4_handle_dirty_metadata(handle,
inode, data_bh);
} else {
@@ -1256,7 +1243,6 @@ static int ext4_convert_inline_data_nolock(handle_t *handle,
buf, inline_size);
}
- unlock_buffer(data_bh);
out_restore:
if (error)
ext4_restore_inline_data(handle, inode, iloc, buf, inline_size);
@@ -1953,16 +1939,8 @@ int ext4_inline_data_truncate(struct inode *inode, int *has_inline)
* the extent status cache must be cleared to avoid leaving
* behind stale delayed allocated extent entries
*/
- if (!ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) {
-retry:
- err = ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS);
- if (err == -ENOMEM) {
- memalloc_retry_wait(GFP_ATOMIC);
- goto retry;
- }
- if (err)
- goto out_error;
- }
+ if (!ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA))
+ ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS);
/* Clear the content in the xattr space. */
if (inline_size > EXT4_MIN_INLINE_DATA_SIZE) {
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 0d5ba922e411..43775a6ca505 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -567,10 +567,8 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
ext4_es_scan_range(inode, &ext4_es_is_delayed, map->m_lblk,
map->m_lblk + map->m_len - 1))
status |= EXTENT_STATUS_DELAYED;
- ret = ext4_es_insert_extent(inode, map->m_lblk,
- map->m_len, map->m_pblk, status);
- if (ret < 0)
- retval = ret;
+ ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
+ map->m_pblk, status);
}
up_read((&EXT4_I(inode)->i_data_sem));
@@ -632,16 +630,6 @@ found:
*/
ext4_clear_inode_state(inode, EXT4_STATE_EXT_MIGRATE);
}
-
- /*
- * Update reserved blocks/metadata blocks after successful
- * block allocation which had been deferred till now. We don't
- * support fallocate for non extent files. So we can update
- * reserve space here.
- */
- if ((retval > 0) &&
- (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE))
- ext4_da_update_reserve_space(inode, retval, 1);
}
if (retval > 0) {
@@ -689,12 +677,8 @@ found:
ext4_es_scan_range(inode, &ext4_es_is_delayed, map->m_lblk,
map->m_lblk + map->m_len - 1))
status |= EXTENT_STATUS_DELAYED;
- ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
- map->m_pblk, status);
- if (ret < 0) {
- retval = ret;
- goto out_sem;
- }
+ ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
+ map->m_pblk, status);
}
out_sem:
@@ -1093,7 +1077,7 @@ static int ext4_block_write_begin(struct folio *folio, loff_t pos, unsigned len,
err = -EIO;
}
if (unlikely(err)) {
- page_zero_new_buffers(&folio->page, from, to);
+ folio_zero_new_buffers(folio, from, to);
} else if (fscrypt_inode_uses_fs_layer_crypto(inode)) {
for (i = 0; i < nr_wait; i++) {
int err2;
@@ -1287,7 +1271,8 @@ static int ext4_write_end(struct file *file,
if (ext4_has_inline_data(inode) &&
ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA))
- return ext4_write_inline_data_end(inode, pos, len, copied, page);
+ return ext4_write_inline_data_end(inode, pos, len, copied,
+ folio);
copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
/*
@@ -1339,7 +1324,7 @@ static int ext4_write_end(struct file *file,
}
/*
- * This is a private version of page_zero_new_buffers() which doesn't
+ * This is a private version of folio_zero_new_buffers() which doesn't
* set the buffer to be dirty, since in data=journalled mode we need
* to call ext4_dirty_journalled_data() instead.
*/
@@ -1395,7 +1380,8 @@ static int ext4_journalled_write_end(struct file *file,
BUG_ON(!ext4_handle_valid(handle));
if (ext4_has_inline_data(inode))
- return ext4_write_inline_data_end(inode, pos, len, copied, page);
+ return ext4_write_inline_data_end(inode, pos, len, copied,
+ folio);
if (unlikely(copied < len) && !folio_test_uptodate(folio)) {
copied = 0;
@@ -1638,7 +1624,6 @@ static int ext4_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk)
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
int ret;
bool allocated = false;
- bool reserved = false;
/*
* If the cluster containing lblk is shared with a delayed,
@@ -1654,8 +1639,7 @@ static int ext4_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk)
if (sbi->s_cluster_ratio == 1) {
ret = ext4_da_reserve_space(inode);
if (ret != 0) /* ENOSPC */
- goto errout;
- reserved = true;
+ return ret;
} else { /* bigalloc */
if (!ext4_es_scan_clu(inode, &ext4_es_is_delonly, lblk)) {
if (!ext4_es_scan_clu(inode,
@@ -1663,12 +1647,11 @@ static int ext4_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk)
ret = ext4_clu_mapped(inode,
EXT4_B2C(sbi, lblk));
if (ret < 0)
- goto errout;
+ return ret;
if (ret == 0) {
ret = ext4_da_reserve_space(inode);
if (ret != 0) /* ENOSPC */
- goto errout;
- reserved = true;
+ return ret;
} else {
allocated = true;
}
@@ -1678,12 +1661,8 @@ static int ext4_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk)
}
}
- ret = ext4_es_insert_delayed_block(inode, lblk, allocated);
- if (ret && reserved)
- ext4_da_release_space(inode, 1);
-
-errout:
- return ret;
+ ext4_es_insert_delayed_block(inode, lblk, allocated);
+ return 0;
}
/*
@@ -1780,7 +1759,6 @@ add_delayed:
set_buffer_new(bh);
set_buffer_delay(bh);
} else if (retval > 0) {
- int ret;
unsigned int status;
if (unlikely(retval != map->m_len)) {
@@ -1793,10 +1771,8 @@ add_delayed:
status = map->m_flags & EXT4_MAP_UNWRITTEN ?
EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
- ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
- map->m_pblk, status);
- if (ret != 0)
- retval = ret;
+ ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
+ map->m_pblk, status);
}
out_unlock:
@@ -2321,11 +2297,11 @@ static int ext4_da_writepages_trans_blocks(struct inode *inode)
MAX_WRITEPAGES_EXTENT_LEN + bpp - 1, bpp);
}
-static int ext4_journal_page_buffers(handle_t *handle, struct page *page,
- int len)
+static int ext4_journal_folio_buffers(handle_t *handle, struct folio *folio,
+ size_t len)
{
- struct buffer_head *page_bufs = page_buffers(page);
- struct inode *inode = page->mapping->host;
+ struct buffer_head *page_bufs = folio_buffers(folio);
+ struct inode *inode = folio->mapping->host;
int ret, err;
ret = ext4_walk_page_buffers(handle, inode, page_bufs, 0, len,
@@ -2334,7 +2310,7 @@ static int ext4_journal_page_buffers(handle_t *handle, struct page *page,
NULL, write_end_fn);
if (ret == 0)
ret = err;
- err = ext4_jbd2_inode_add_write(handle, inode, page_offset(page), len);
+ err = ext4_jbd2_inode_add_write(handle, inode, folio_pos(folio), len);
if (ret == 0)
ret = err;
EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid;
@@ -2344,22 +2320,20 @@ static int ext4_journal_page_buffers(handle_t *handle, struct page *page,
static int mpage_journal_page_buffers(handle_t *handle,
struct mpage_da_data *mpd,
- struct page *page)
+ struct folio *folio)
{
struct inode *inode = mpd->inode;
loff_t size = i_size_read(inode);
- int len;
+ size_t len = folio_size(folio);
- ClearPageChecked(page);
+ folio_clear_checked(folio);
mpd->wbc->nr_to_write--;
- if (page->index == size >> PAGE_SHIFT &&
+ if (folio_pos(folio) + len > size &&
!ext4_verity_in_progress(inode))
- len = size & ~PAGE_MASK;
- else
- len = PAGE_SIZE;
+ len = size - folio_pos(folio);
- return ext4_journal_page_buffers(handle, page, len);
+ return ext4_journal_folio_buffers(handle, folio, len);
}
/*
@@ -2499,7 +2473,7 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)
/* Pending dirtying of journalled data? */
if (folio_test_checked(folio)) {
err = mpage_journal_page_buffers(handle,
- mpd, &folio->page);
+ mpd, folio);
if (err < 0)
goto out;
mpd->journalled_more_data = 1;
@@ -2783,11 +2757,12 @@ static int ext4_writepages(struct address_space *mapping,
.can_map = 1,
};
int ret;
+ int alloc_ctx;
if (unlikely(ext4_forced_shutdown(EXT4_SB(sb))))
return -EIO;
- percpu_down_read(&EXT4_SB(sb)->s_writepages_rwsem);
+ alloc_ctx = ext4_writepages_down_read(sb);
ret = ext4_do_writepages(&mpd);
/*
* For data=journal writeback we could have come across pages marked
@@ -2796,7 +2771,7 @@ static int ext4_writepages(struct address_space *mapping,
*/
if (!ret && mpd.journalled_more_data)
ret = ext4_do_writepages(&mpd);
- percpu_up_read(&EXT4_SB(sb)->s_writepages_rwsem);
+ ext4_writepages_up_read(sb, alloc_ctx);
return ret;
}
@@ -2824,17 +2799,18 @@ static int ext4_dax_writepages(struct address_space *mapping,
long nr_to_write = wbc->nr_to_write;
struct inode *inode = mapping->host;
struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
+ int alloc_ctx;
if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
return -EIO;
- percpu_down_read(&sbi->s_writepages_rwsem);
+ alloc_ctx = ext4_writepages_down_read(inode->i_sb);
trace_ext4_writepages(inode, wbc);
ret = dax_writeback_mapping_range(mapping, sbi->s_daxdev, wbc);
trace_ext4_writepages_result(inode, wbc, ret,
nr_to_write - wbc->nr_to_write);
- percpu_up_read(&sbi->s_writepages_rwsem);
+ ext4_writepages_up_read(inode->i_sb, alloc_ctx);
return ret;
}
@@ -2942,15 +2918,15 @@ retry:
* Check if we should update i_disksize
* when write to the end of file but not require block allocation
*/
-static int ext4_da_should_update_i_disksize(struct page *page,
+static int ext4_da_should_update_i_disksize(struct folio *folio,
unsigned long offset)
{
struct buffer_head *bh;
- struct inode *inode = page->mapping->host;
+ struct inode *inode = folio->mapping->host;
unsigned int idx;
int i;
- bh = page_buffers(page);
+ bh = folio_buffers(folio);
idx = offset >> inode->i_blkbits;
for (i = 0; i < idx; i++)
@@ -2970,17 +2946,19 @@ static int ext4_da_write_end(struct file *file,
loff_t new_i_size;
unsigned long start, end;
int write_mode = (int)(unsigned long)fsdata;
+ struct folio *folio = page_folio(page);
if (write_mode == FALL_BACK_TO_NONDELALLOC)
return ext4_write_end(file, mapping, pos,
- len, copied, page, fsdata);
+ len, copied, &folio->page, fsdata);
trace_ext4_da_write_end(inode, pos, len, copied);
if (write_mode != CONVERT_INLINE_DATA &&
ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA) &&
ext4_has_inline_data(inode))
- return ext4_write_inline_data_end(inode, pos, len, copied, page);
+ return ext4_write_inline_data_end(inode, pos, len, copied,
+ folio);
if (unlikely(copied < len) && !PageUptodate(page))
copied = 0;
@@ -3004,10 +2982,11 @@ static int ext4_da_write_end(struct file *file,
*/
new_i_size = pos + copied;
if (copied && new_i_size > inode->i_size &&
- ext4_da_should_update_i_disksize(page, end))
+ ext4_da_should_update_i_disksize(folio, end))
ext4_update_i_disksize(inode, new_i_size);
- return generic_write_end(file, mapping, pos, len, copied, page, fsdata);
+ return generic_write_end(file, mapping, pos, len, copied, &folio->page,
+ fsdata);
}
/*
@@ -3103,7 +3082,7 @@ static int ext4_read_folio(struct file *file, struct folio *folio)
int ret = -EAGAIN;
struct inode *inode = folio->mapping->host;
- trace_ext4_readpage(&folio->page);
+ trace_ext4_read_folio(inode, folio);
if (ext4_has_inline_data(inode))
ret = ext4_readpage_inline(inode, folio);
@@ -3162,9 +3141,10 @@ static void ext4_journalled_invalidate_folio(struct folio *folio,
static bool ext4_release_folio(struct folio *folio, gfp_t wait)
{
- journal_t *journal = EXT4_JOURNAL(folio->mapping->host);
+ struct inode *inode = folio->mapping->host;
+ journal_t *journal = EXT4_JOURNAL(inode);
- trace_ext4_releasepage(&folio->page);
+ trace_ext4_release_folio(inode, folio);
/* Page has dirty journalled data -> cannot release */
if (folio_test_checked(folio))
@@ -3375,7 +3355,7 @@ static int ext4_iomap_overwrite_begin(struct inode *inode, loff_t offset,
*/
flags &= ~IOMAP_WRITE;
ret = ext4_iomap_begin(inode, offset, length, flags, iomap, srcmap);
- WARN_ON_ONCE(iomap->type != IOMAP_MAPPED);
+ WARN_ON_ONCE(!ret && iomap->type != IOMAP_MAPPED);
return ret;
}
@@ -3990,12 +3970,8 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
down_write(&EXT4_I(inode)->i_data_sem);
ext4_discard_preallocations(inode, 0);
- ret = ext4_es_remove_extent(inode, first_block,
- stop_block - first_block);
- if (ret) {
- up_write(&EXT4_I(inode)->i_data_sem);
- goto out_stop;
- }
+ ext4_es_remove_extent(inode, first_block,
+ stop_block - first_block);
if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
ret = ext4_ext_remove_space(inode, first_block,
@@ -4639,6 +4615,24 @@ static inline void ext4_inode_set_iversion_queried(struct inode *inode, u64 val)
inode_set_iversion_queried(inode, val);
}
+static const char *check_igot_inode(struct inode *inode, ext4_iget_flags flags)
+
+{
+ if (flags & EXT4_IGET_EA_INODE) {
+ if (!(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL))
+ return "missing EA_INODE flag";
+ if (ext4_test_inode_state(inode, EXT4_STATE_XATTR) ||
+ EXT4_I(inode)->i_file_acl)
+ return "ea_inode with extended attributes";
+ } else {
+ if ((EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL))
+ return "unexpected EA_INODE flag";
+ }
+ if (is_bad_inode(inode) && !(flags & EXT4_IGET_BAD))
+ return "unexpected bad inode w/o EXT4_IGET_BAD";
+ return NULL;
+}
+
struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
ext4_iget_flags flags, const char *function,
unsigned int line)
@@ -4648,6 +4642,7 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
struct ext4_inode_info *ei;
struct ext4_super_block *es = EXT4_SB(sb)->s_es;
struct inode *inode;
+ const char *err_str;
journal_t *journal = EXT4_SB(sb)->s_journal;
long ret;
loff_t size;
@@ -4675,8 +4670,14 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
inode = iget_locked(sb, ino);
if (!inode)
return ERR_PTR(-ENOMEM);
- if (!(inode->i_state & I_NEW))
+ if (!(inode->i_state & I_NEW)) {
+ if ((err_str = check_igot_inode(inode, flags)) != NULL) {
+ ext4_error_inode(inode, function, line, 0, err_str);
+ iput(inode);
+ return ERR_PTR(-EFSCORRUPTED);
+ }
return inode;
+ }
ei = EXT4_I(inode);
iloc.bh = NULL;
@@ -4942,10 +4943,9 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
if (IS_CASEFOLDED(inode) && !ext4_has_feature_casefold(inode->i_sb))
ext4_error_inode(inode, function, line, 0,
"casefold flag without casefold feature");
- if (is_bad_inode(inode) && !(flags & EXT4_IGET_BAD)) {
- ext4_error_inode(inode, function, line, 0,
- "bad inode without EXT4_IGET_BAD flag");
- ret = -EUCLEAN;
+ if ((err_str = check_igot_inode(inode, flags)) != NULL) {
+ ext4_error_inode(inode, function, line, 0, err_str);
+ ret = -EFSCORRUPTED;
goto bad_inode;
}
@@ -5928,7 +5928,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
journal_t *journal;
handle_t *handle;
int err;
- struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ int alloc_ctx;
/*
* We have to be very careful here: changing a data block's
@@ -5966,7 +5966,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
}
}
- percpu_down_write(&sbi->s_writepages_rwsem);
+ alloc_ctx = ext4_writepages_down_write(inode->i_sb);
jbd2_journal_lock_updates(journal);
/*
@@ -5983,7 +5983,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
err = jbd2_journal_flush(journal, 0);
if (err < 0) {
jbd2_journal_unlock_updates(journal);
- percpu_up_write(&sbi->s_writepages_rwsem);
+ ext4_writepages_up_write(inode->i_sb, alloc_ctx);
return err;
}
ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
@@ -5991,7 +5991,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
ext4_set_aops(inode);
jbd2_journal_unlock_updates(journal);
- percpu_up_write(&sbi->s_writepages_rwsem);
+ ext4_writepages_up_write(inode->i_sb, alloc_ctx);
if (val)
filemap_invalidate_unlock(inode->i_mapping);
@@ -6130,7 +6130,7 @@ retry_alloc:
err = __block_write_begin(&folio->page, 0, len, ext4_get_block);
if (!err) {
ret = VM_FAULT_SIGBUS;
- if (ext4_journal_page_buffers(handle, &folio->page, len))
+ if (ext4_journal_folio_buffers(handle, folio, len))
goto out_error;
} else {
folio_unlock(folio);
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index f9a430152063..331859511f80 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -793,16 +793,10 @@ static int ext4_ioctl_setproject(struct inode *inode, __u32 projid)
}
#endif
-static int ext4_shutdown(struct super_block *sb, unsigned long arg)
+int ext4_force_shutdown(struct super_block *sb, u32 flags)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
- __u32 flags;
-
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- if (get_user(flags, (__u32 __user *)arg))
- return -EFAULT;
+ int ret;
if (flags > EXT4_GOING_FLAGS_NOLOGFLUSH)
return -EINVAL;
@@ -815,7 +809,9 @@ static int ext4_shutdown(struct super_block *sb, unsigned long arg)
switch (flags) {
case EXT4_GOING_FLAGS_DEFAULT:
- freeze_bdev(sb->s_bdev);
+ ret = freeze_bdev(sb->s_bdev);
+ if (ret)
+ return ret;
set_bit(EXT4_FLAGS_SHUTDOWN, &sbi->s_ext4_flags);
thaw_bdev(sb->s_bdev);
break;
@@ -838,6 +834,19 @@ static int ext4_shutdown(struct super_block *sb, unsigned long arg)
return 0;
}
+static int ext4_ioctl_shutdown(struct super_block *sb, unsigned long arg)
+{
+ u32 flags;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (get_user(flags, (__u32 __user *)arg))
+ return -EFAULT;
+
+ return ext4_force_shutdown(sb, flags);
+}
+
struct getfsmap_info {
struct super_block *gi_sb;
struct fsmap_head __user *gi_data;
@@ -1566,7 +1575,7 @@ resizefs_out:
return ext4_ioctl_get_es_cache(filp, arg);
case EXT4_IOC_SHUTDOWN:
- return ext4_shutdown(sb, arg);
+ return ext4_ioctl_shutdown(sb, arg);
case FS_IOC_ENABLE_VERITY:
if (!ext4_has_feature_verity(sb))
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 78259bddbc4d..a2475b8c9fb5 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -154,19 +154,31 @@
* structures to decide the order in which groups are to be traversed for
* fulfilling an allocation request.
*
- * At CR = 0, we look for groups which have the largest_free_order >= the order
- * of the request. We directly look at the largest free order list in the data
- * structure (1) above where largest_free_order = order of the request. If that
- * list is empty, we look at remaining list in the increasing order of
- * largest_free_order. This allows us to perform CR = 0 lookup in O(1) time.
+ * At CR_POWER2_ALIGNED , we look for groups which have the largest_free_order
+ * >= the order of the request. We directly look at the largest free order list
+ * in the data structure (1) above where largest_free_order = order of the
+ * request. If that list is empty, we look at remaining list in the increasing
+ * order of largest_free_order. This allows us to perform CR_POWER2_ALIGNED
+ * lookup in O(1) time.
*
- * At CR = 1, we only consider groups where average fragment size > request
- * size. So, we lookup a group which has average fragment size just above or
- * equal to request size using our average fragment size group lists (data
- * structure 2) in O(1) time.
+ * At CR_GOAL_LEN_FAST, we only consider groups where
+ * average fragment size > request size. So, we lookup a group which has average
+ * fragment size just above or equal to request size using our average fragment
+ * size group lists (data structure 2) in O(1) time.
+ *
+ * At CR_BEST_AVAIL_LEN, we aim to optimize allocations which can't be satisfied
+ * in CR_GOAL_LEN_FAST. The fact that we couldn't find a group in
+ * CR_GOAL_LEN_FAST suggests that there is no BG that has avg
+ * fragment size > goal length. So before falling to the slower
+ * CR_GOAL_LEN_SLOW, in CR_BEST_AVAIL_LEN we proactively trim goal length and
+ * then use the same fragment lists as CR_GOAL_LEN_FAST to find a BG with a big
+ * enough average fragment size. This increases the chances of finding a
+ * suitable block group in O(1) time and results in faster allocation at the
+ * cost of reduced size of allocation.
*
* If "mb_optimize_scan" mount option is not set, mballoc traverses groups in
- * linear order which requires O(N) search time for each CR 0 and CR 1 phase.
+ * linear order which requires O(N) search time for each CR_POWER2_ALIGNED and
+ * CR_GOAL_LEN_FAST phase.
*
* The regular allocator (using the buddy cache) supports a few tunables.
*
@@ -351,8 +363,8 @@
* - bitlock on a group (group)
* - object (inode/locality) (object)
* - per-pa lock (pa)
- * - cr0 lists lock (cr0)
- * - cr1 tree lock (cr1)
+ * - cr_power2_aligned lists lock (cr_power2_aligned)
+ * - cr_goal_len_fast lists lock (cr_goal_len_fast)
*
* Paths:
* - new pa
@@ -384,7 +396,7 @@
*
* - allocation path (ext4_mb_regular_allocator)
* group
- * cr0/cr1
+ * cr_power2_aligned/cr_goal_len_fast
*/
static struct kmem_cache *ext4_pspace_cachep;
static struct kmem_cache *ext4_ac_cachep;
@@ -409,7 +421,7 @@ static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac);
static bool ext4_mb_good_group(struct ext4_allocation_context *ac,
- ext4_group_t group, int cr);
+ ext4_group_t group, enum criteria cr);
static int ext4_try_to_trim_range(struct super_block *sb,
struct ext4_buddy *e4b, ext4_grpblk_t start,
@@ -745,6 +757,8 @@ static int __mb_check_buddy(struct ext4_buddy *e4b, char *file,
MB_CHECK_ASSERT(e4b->bd_info->bb_fragments == fragments);
grp = ext4_get_group_info(sb, e4b->bd_group);
+ if (!grp)
+ return NULL;
list_for_each(cur, &grp->bb_prealloc_list) {
ext4_group_t groupnr;
struct ext4_prealloc_space *pa;
@@ -856,8 +870,8 @@ mb_update_avg_fragment_size(struct super_block *sb, struct ext4_group_info *grp)
* Choose next group by traversing largest_free_order lists. Updates *new_cr if
* cr level needs an update.
*/
-static void ext4_mb_choose_next_group_cr0(struct ext4_allocation_context *ac,
- int *new_cr, ext4_group_t *group, ext4_group_t ngroups)
+static void ext4_mb_choose_next_group_p2_aligned(struct ext4_allocation_context *ac,
+ enum criteria *new_cr, ext4_group_t *group, ext4_group_t ngroups)
{
struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
struct ext4_group_info *iter, *grp;
@@ -866,8 +880,8 @@ static void ext4_mb_choose_next_group_cr0(struct ext4_allocation_context *ac,
if (ac->ac_status == AC_STATUS_FOUND)
return;
- if (unlikely(sbi->s_mb_stats && ac->ac_flags & EXT4_MB_CR0_OPTIMIZED))
- atomic_inc(&sbi->s_bal_cr0_bad_suggestions);
+ if (unlikely(sbi->s_mb_stats && ac->ac_flags & EXT4_MB_CR_POWER2_ALIGNED_OPTIMIZED))
+ atomic_inc(&sbi->s_bal_p2_aligned_bad_suggestions);
grp = NULL;
for (i = ac->ac_2order; i < MB_NUM_ORDERS(ac->ac_sb); i++) {
@@ -882,8 +896,8 @@ static void ext4_mb_choose_next_group_cr0(struct ext4_allocation_context *ac,
list_for_each_entry(iter, &sbi->s_mb_largest_free_orders[i],
bb_largest_free_order_node) {
if (sbi->s_mb_stats)
- atomic64_inc(&sbi->s_bal_cX_groups_considered[0]);
- if (likely(ext4_mb_good_group(ac, iter->bb_group, 0))) {
+ atomic64_inc(&sbi->s_bal_cX_groups_considered[CR_POWER2_ALIGNED]);
+ if (likely(ext4_mb_good_group(ac, iter->bb_group, CR_POWER2_ALIGNED))) {
grp = iter;
break;
}
@@ -895,57 +909,155 @@ static void ext4_mb_choose_next_group_cr0(struct ext4_allocation_context *ac,
if (!grp) {
/* Increment cr and search again */
- *new_cr = 1;
+ *new_cr = CR_GOAL_LEN_FAST;
} else {
*group = grp->bb_group;
- ac->ac_flags |= EXT4_MB_CR0_OPTIMIZED;
+ ac->ac_flags |= EXT4_MB_CR_POWER2_ALIGNED_OPTIMIZED;
+ }
+}
+
+/*
+ * Find a suitable group of given order from the average fragments list.
+ */
+static struct ext4_group_info *
+ext4_mb_find_good_group_avg_frag_lists(struct ext4_allocation_context *ac, int order)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
+ struct list_head *frag_list = &sbi->s_mb_avg_fragment_size[order];
+ rwlock_t *frag_list_lock = &sbi->s_mb_avg_fragment_size_locks[order];
+ struct ext4_group_info *grp = NULL, *iter;
+ enum criteria cr = ac->ac_criteria;
+
+ if (list_empty(frag_list))
+ return NULL;
+ read_lock(frag_list_lock);
+ if (list_empty(frag_list)) {
+ read_unlock(frag_list_lock);
+ return NULL;
+ }
+ list_for_each_entry(iter, frag_list, bb_avg_fragment_size_node) {
+ if (sbi->s_mb_stats)
+ atomic64_inc(&sbi->s_bal_cX_groups_considered[cr]);
+ if (likely(ext4_mb_good_group(ac, iter->bb_group, cr))) {
+ grp = iter;
+ break;
+ }
}
+ read_unlock(frag_list_lock);
+ return grp;
}
/*
* Choose next group by traversing average fragment size list of suitable
* order. Updates *new_cr if cr level needs an update.
*/
-static void ext4_mb_choose_next_group_cr1(struct ext4_allocation_context *ac,
- int *new_cr, ext4_group_t *group, ext4_group_t ngroups)
+static void ext4_mb_choose_next_group_goal_fast(struct ext4_allocation_context *ac,
+ enum criteria *new_cr, ext4_group_t *group, ext4_group_t ngroups)
{
struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
- struct ext4_group_info *grp = NULL, *iter;
+ struct ext4_group_info *grp = NULL;
int i;
- if (unlikely(ac->ac_flags & EXT4_MB_CR1_OPTIMIZED)) {
+ if (unlikely(ac->ac_flags & EXT4_MB_CR_GOAL_LEN_FAST_OPTIMIZED)) {
if (sbi->s_mb_stats)
- atomic_inc(&sbi->s_bal_cr1_bad_suggestions);
+ atomic_inc(&sbi->s_bal_goal_fast_bad_suggestions);
}
for (i = mb_avg_fragment_size_order(ac->ac_sb, ac->ac_g_ex.fe_len);
i < MB_NUM_ORDERS(ac->ac_sb); i++) {
- if (list_empty(&sbi->s_mb_avg_fragment_size[i]))
- continue;
- read_lock(&sbi->s_mb_avg_fragment_size_locks[i]);
- if (list_empty(&sbi->s_mb_avg_fragment_size[i])) {
- read_unlock(&sbi->s_mb_avg_fragment_size_locks[i]);
- continue;
- }
- list_for_each_entry(iter, &sbi->s_mb_avg_fragment_size[i],
- bb_avg_fragment_size_node) {
- if (sbi->s_mb_stats)
- atomic64_inc(&sbi->s_bal_cX_groups_considered[1]);
- if (likely(ext4_mb_good_group(ac, iter->bb_group, 1))) {
- grp = iter;
- break;
- }
+ grp = ext4_mb_find_good_group_avg_frag_lists(ac, i);
+ if (grp)
+ break;
+ }
+
+ if (grp) {
+ *group = grp->bb_group;
+ ac->ac_flags |= EXT4_MB_CR_GOAL_LEN_FAST_OPTIMIZED;
+ } else {
+ *new_cr = CR_BEST_AVAIL_LEN;
+ }
+}
+
+/*
+ * We couldn't find a group in CR_GOAL_LEN_FAST so try to find the highest free fragment
+ * order we have and proactively trim the goal request length to that order to
+ * find a suitable group faster.
+ *
+ * This optimizes allocation speed at the cost of slightly reduced
+ * preallocations. However, we make sure that we don't trim the request too
+ * much and fall to CR_GOAL_LEN_SLOW in that case.
+ */
+static void ext4_mb_choose_next_group_best_avail(struct ext4_allocation_context *ac,
+ enum criteria *new_cr, ext4_group_t *group, ext4_group_t ngroups)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
+ struct ext4_group_info *grp = NULL;
+ int i, order, min_order;
+ unsigned long num_stripe_clusters = 0;
+
+ if (unlikely(ac->ac_flags & EXT4_MB_CR_BEST_AVAIL_LEN_OPTIMIZED)) {
+ if (sbi->s_mb_stats)
+ atomic_inc(&sbi->s_bal_best_avail_bad_suggestions);
+ }
+
+ /*
+ * mb_avg_fragment_size_order() returns order in a way that makes
+ * retrieving back the length using (1 << order) inaccurate. Hence, use
+ * fls() instead since we need to know the actual length while modifying
+ * goal length.
+ */
+ order = fls(ac->ac_g_ex.fe_len);
+ min_order = order - sbi->s_mb_best_avail_max_trim_order;
+ if (min_order < 0)
+ min_order = 0;
+
+ if (1 << min_order < ac->ac_o_ex.fe_len)
+ min_order = fls(ac->ac_o_ex.fe_len) + 1;
+
+ if (sbi->s_stripe > 0) {
+ /*
+ * We are assuming that stripe size is always a multiple of
+ * cluster ratio otherwise __ext4_fill_super exists early.
+ */
+ num_stripe_clusters = EXT4_NUM_B2C(sbi, sbi->s_stripe);
+ if (1 << min_order < num_stripe_clusters)
+ min_order = fls(num_stripe_clusters);
+ }
+
+ for (i = order; i >= min_order; i--) {
+ int frag_order;
+ /*
+ * Scale down goal len to make sure we find something
+ * in the free fragments list. Basically, reduce
+ * preallocations.
+ */
+ ac->ac_g_ex.fe_len = 1 << i;
+
+ if (num_stripe_clusters > 0) {
+ /*
+ * Try to round up the adjusted goal length to
+ * stripe size (in cluster units) multiple for
+ * efficiency.
+ */
+ ac->ac_g_ex.fe_len = roundup(ac->ac_g_ex.fe_len,
+ num_stripe_clusters);
}
- read_unlock(&sbi->s_mb_avg_fragment_size_locks[i]);
+
+ frag_order = mb_avg_fragment_size_order(ac->ac_sb,
+ ac->ac_g_ex.fe_len);
+
+ grp = ext4_mb_find_good_group_avg_frag_lists(ac, frag_order);
if (grp)
break;
}
if (grp) {
*group = grp->bb_group;
- ac->ac_flags |= EXT4_MB_CR1_OPTIMIZED;
+ ac->ac_flags |= EXT4_MB_CR_BEST_AVAIL_LEN_OPTIMIZED;
} else {
- *new_cr = 2;
+ /* Reset goal length to original goal length before falling into CR_GOAL_LEN_SLOW */
+ ac->ac_g_ex.fe_len = ac->ac_orig_goal_len;
+ *new_cr = CR_GOAL_LEN_SLOW;
}
}
@@ -953,7 +1065,7 @@ static inline int should_optimize_scan(struct ext4_allocation_context *ac)
{
if (unlikely(!test_opt2(ac->ac_sb, MB_OPTIMIZE_SCAN)))
return 0;
- if (ac->ac_criteria >= 2)
+ if (ac->ac_criteria >= CR_GOAL_LEN_SLOW)
return 0;
if (!ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS))
return 0;
@@ -998,7 +1110,7 @@ inc_and_return:
* @ngroups Total number of groups
*/
static void ext4_mb_choose_next_group(struct ext4_allocation_context *ac,
- int *new_cr, ext4_group_t *group, ext4_group_t ngroups)
+ enum criteria *new_cr, ext4_group_t *group, ext4_group_t ngroups)
{
*new_cr = ac->ac_criteria;
@@ -1007,10 +1119,12 @@ static void ext4_mb_choose_next_group(struct ext4_allocation_context *ac,
return;
}
- if (*new_cr == 0) {
- ext4_mb_choose_next_group_cr0(ac, new_cr, group, ngroups);
- } else if (*new_cr == 1) {
- ext4_mb_choose_next_group_cr1(ac, new_cr, group, ngroups);
+ if (*new_cr == CR_POWER2_ALIGNED) {
+ ext4_mb_choose_next_group_p2_aligned(ac, new_cr, group, ngroups);
+ } else if (*new_cr == CR_GOAL_LEN_FAST) {
+ ext4_mb_choose_next_group_goal_fast(ac, new_cr, group, ngroups);
+ } else if (*new_cr == CR_BEST_AVAIL_LEN) {
+ ext4_mb_choose_next_group_best_avail(ac, new_cr, group, ngroups);
} else {
/*
* TODO: For CR=2, we can arrange groups in an rb tree sorted by
@@ -1060,9 +1174,9 @@ mb_set_largest_free_order(struct super_block *sb, struct ext4_group_info *grp)
static noinline_for_stack
void ext4_mb_generate_buddy(struct super_block *sb,
- void *buddy, void *bitmap, ext4_group_t group)
+ void *buddy, void *bitmap, ext4_group_t group,
+ struct ext4_group_info *grp)
{
- struct ext4_group_info *grp = ext4_get_group_info(sb, group);
struct ext4_sb_info *sbi = EXT4_SB(sb);
ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb);
ext4_grpblk_t i = 0;
@@ -1181,6 +1295,8 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp)
break;
grinfo = ext4_get_group_info(sb, group);
+ if (!grinfo)
+ continue;
/*
* If page is uptodate then we came here after online resize
* which added some new uninitialized group info structs, so
@@ -1246,6 +1362,10 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp)
group, page->index, i * blocksize);
trace_ext4_mb_buddy_bitmap_load(sb, group);
grinfo = ext4_get_group_info(sb, group);
+ if (!grinfo) {
+ err = -EFSCORRUPTED;
+ goto out;
+ }
grinfo->bb_fragments = 0;
memset(grinfo->bb_counters, 0,
sizeof(*grinfo->bb_counters) *
@@ -1256,7 +1376,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp)
ext4_lock_group(sb, group);
/* init the buddy */
memset(data, 0xff, blocksize);
- ext4_mb_generate_buddy(sb, data, incore, group);
+ ext4_mb_generate_buddy(sb, data, incore, group, grinfo);
ext4_unlock_group(sb, group);
incore = NULL;
} else {
@@ -1370,6 +1490,9 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group, gfp_t gfp)
might_sleep();
mb_debug(sb, "init group %u\n", group);
this_grp = ext4_get_group_info(sb, group);
+ if (!this_grp)
+ return -EFSCORRUPTED;
+
/*
* This ensures that we don't reinit the buddy cache
* page which map to the group from which we are already
@@ -1444,6 +1567,8 @@ ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group,
blocks_per_page = PAGE_SIZE / sb->s_blocksize;
grp = ext4_get_group_info(sb, group);
+ if (!grp)
+ return -EFSCORRUPTED;
e4b->bd_blkbits = sb->s_blocksize_bits;
e4b->bd_info = grp;
@@ -2049,7 +2174,7 @@ static void ext4_mb_check_limits(struct ext4_allocation_context *ac,
if (bex->fe_len < gex->fe_len)
return;
- if (finish_group)
+ if (finish_group || ac->ac_found > sbi->s_mb_min_to_scan)
ext4_mb_use_best_found(ac, e4b);
}
@@ -2061,6 +2186,20 @@ static void ext4_mb_check_limits(struct ext4_allocation_context *ac,
* in the context. Later, the best found extent will be used, if
* mballoc can't find good enough extent.
*
+ * The algorithm used is roughly as follows:
+ *
+ * * If free extent found is exactly as big as goal, then
+ * stop the scan and use it immediately
+ *
+ * * If free extent found is smaller than goal, then keep retrying
+ * upto a max of sbi->s_mb_max_to_scan times (default 200). After
+ * that stop scanning and use whatever we have.
+ *
+ * * If free extent found is bigger than goal, then keep retrying
+ * upto a max of sbi->s_mb_min_to_scan times (default 10) before
+ * stopping the scan and using the extent.
+ *
+ *
* FIXME: real allocation policy is to be designed yet!
*/
static void ext4_mb_measure_extent(struct ext4_allocation_context *ac,
@@ -2076,6 +2215,7 @@ static void ext4_mb_measure_extent(struct ext4_allocation_context *ac,
BUG_ON(ac->ac_status != AC_STATUS_CONTINUE);
ac->ac_found++;
+ ac->ac_cX_found[ac->ac_criteria]++;
/*
* The special case - take what you catch first
@@ -2159,6 +2299,8 @@ int ext4_mb_find_by_goal(struct ext4_allocation_context *ac,
struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
struct ext4_free_extent ex;
+ if (!grp)
+ return -EFSCORRUPTED;
if (!(ac->ac_flags & (EXT4_MB_HINT_TRY_GOAL | EXT4_MB_HINT_GOAL_ONLY)))
return 0;
if (grp->bb_free == 0)
@@ -2178,11 +2320,11 @@ int ext4_mb_find_by_goal(struct ext4_allocation_context *ac,
ac->ac_g_ex.fe_len, &ex);
ex.fe_logical = 0xDEADFA11; /* debug value */
- if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == sbi->s_stripe) {
+ if (max >= ac->ac_g_ex.fe_len &&
+ ac->ac_g_ex.fe_len == EXT4_B2C(sbi, sbi->s_stripe)) {
ext4_fsblk_t start;
- start = ext4_group_first_block_no(ac->ac_sb, e4b->bd_group) +
- ex.fe_start;
+ start = ext4_grp_offs_to_block(ac->ac_sb, &ex);
/* use do_div to get remainder (would be 64-bit modulo) */
if (do_div(start, sbi->s_stripe) == 0) {
ac->ac_found++;
@@ -2248,6 +2390,7 @@ void ext4_mb_simple_scan_group(struct ext4_allocation_context *ac,
break;
}
ac->ac_found++;
+ ac->ac_cX_found[ac->ac_criteria]++;
ac->ac_b_ex.fe_len = 1 << i;
ac->ac_b_ex.fe_start = k << i;
@@ -2276,7 +2419,7 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
struct super_block *sb = ac->ac_sb;
void *bitmap = e4b->bd_bitmap;
struct ext4_free_extent ex;
- int i;
+ int i, j, freelen;
int free;
free = e4b->bd_info->bb_free;
@@ -2303,6 +2446,24 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
break;
}
+ if (ac->ac_criteria < CR_FAST) {
+ /*
+ * In CR_GOAL_LEN_FAST and CR_BEST_AVAIL_LEN, we are
+ * sure that this group will have a large enough
+ * continuous free extent, so skip over the smaller free
+ * extents
+ */
+ j = mb_find_next_bit(bitmap,
+ EXT4_CLUSTERS_PER_GROUP(sb), i);
+ freelen = j - i;
+
+ if (freelen < ac->ac_g_ex.fe_len) {
+ i = j;
+ free -= freelen;
+ continue;
+ }
+ }
+
mb_find_extent(e4b, i, ac->ac_g_ex.fe_len, &ex);
if (WARN_ON(ex.fe_len <= 0))
break;
@@ -2344,7 +2505,7 @@ void ext4_mb_scan_aligned(struct ext4_allocation_context *ac,
struct ext4_free_extent ex;
ext4_fsblk_t first_group_block;
ext4_fsblk_t a;
- ext4_grpblk_t i;
+ ext4_grpblk_t i, stripe;
int max;
BUG_ON(sbi->s_stripe == 0);
@@ -2356,18 +2517,21 @@ void ext4_mb_scan_aligned(struct ext4_allocation_context *ac,
do_div(a, sbi->s_stripe);
i = (a * sbi->s_stripe) - first_group_block;
+ stripe = EXT4_B2C(sbi, sbi->s_stripe);
+ i = EXT4_B2C(sbi, i);
while (i < EXT4_CLUSTERS_PER_GROUP(sb)) {
if (!mb_test_bit(i, bitmap)) {
- max = mb_find_extent(e4b, i, sbi->s_stripe, &ex);
- if (max >= sbi->s_stripe) {
+ max = mb_find_extent(e4b, i, stripe, &ex);
+ if (max >= stripe) {
ac->ac_found++;
+ ac->ac_cX_found[ac->ac_criteria]++;
ex.fe_logical = 0xDEADF00D; /* debug value */
ac->ac_b_ex = ex;
ext4_mb_use_best_found(ac, e4b);
break;
}
}
- i += sbi->s_stripe;
+ i += stripe;
}
}
@@ -2377,15 +2541,15 @@ void ext4_mb_scan_aligned(struct ext4_allocation_context *ac,
* for the allocation or not.
*/
static bool ext4_mb_good_group(struct ext4_allocation_context *ac,
- ext4_group_t group, int cr)
+ ext4_group_t group, enum criteria cr)
{
ext4_grpblk_t free, fragments;
int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb));
struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
- BUG_ON(cr < 0 || cr >= 4);
+ BUG_ON(cr < CR_POWER2_ALIGNED || cr >= EXT4_MB_NUM_CRS);
- if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp)))
+ if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp) || !grp))
return false;
free = grp->bb_free;
@@ -2397,7 +2561,7 @@ static bool ext4_mb_good_group(struct ext4_allocation_context *ac,
return false;
switch (cr) {
- case 0:
+ case CR_POWER2_ALIGNED:
BUG_ON(ac->ac_2order == 0);
/* Avoid using the first bg of a flexgroup for data files */
@@ -2416,15 +2580,16 @@ static bool ext4_mb_good_group(struct ext4_allocation_context *ac,
return false;
return true;
- case 1:
+ case CR_GOAL_LEN_FAST:
+ case CR_BEST_AVAIL_LEN:
if ((free / fragments) >= ac->ac_g_ex.fe_len)
return true;
break;
- case 2:
+ case CR_GOAL_LEN_SLOW:
if (free >= ac->ac_g_ex.fe_len)
return true;
break;
- case 3:
+ case CR_ANY_FREE:
return true;
default:
BUG();
@@ -2445,7 +2610,7 @@ static bool ext4_mb_good_group(struct ext4_allocation_context *ac,
* out"!
*/
static int ext4_mb_good_group_nolock(struct ext4_allocation_context *ac,
- ext4_group_t group, int cr)
+ ext4_group_t group, enum criteria cr)
{
struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
struct super_block *sb = ac->ac_sb;
@@ -2454,6 +2619,8 @@ static int ext4_mb_good_group_nolock(struct ext4_allocation_context *ac,
ext4_grpblk_t free;
int ret = 0;
+ if (!grp)
+ return -EFSCORRUPTED;
if (sbi->s_mb_stats)
atomic64_inc(&sbi->s_bal_cX_groups_considered[ac->ac_criteria]);
if (should_lock) {
@@ -2463,7 +2630,7 @@ static int ext4_mb_good_group_nolock(struct ext4_allocation_context *ac,
free = grp->bb_free;
if (free == 0)
goto out;
- if (cr <= 2 && free < ac->ac_g_ex.fe_len)
+ if (cr <= CR_FAST && free < ac->ac_g_ex.fe_len)
goto out;
if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp)))
goto out;
@@ -2478,15 +2645,16 @@ static int ext4_mb_good_group_nolock(struct ext4_allocation_context *ac,
ext4_get_group_desc(sb, group, NULL);
int ret;
- /* cr=0/1 is a very optimistic search to find large
- * good chunks almost for free. If buddy data is not
- * ready, then this optimization makes no sense. But
- * we never skip the first block group in a flex_bg,
- * since this gets used for metadata block allocation,
- * and we want to make sure we locate metadata blocks
- * in the first block group in the flex_bg if possible.
+ /*
+ * cr=CR_POWER2_ALIGNED/CR_GOAL_LEN_FAST is a very optimistic
+ * search to find large good chunks almost for free. If buddy
+ * data is not ready, then this optimization makes no sense. But
+ * we never skip the first block group in a flex_bg, since this
+ * gets used for metadata block allocation, and we want to make
+ * sure we locate metadata blocks in the first block group in
+ * the flex_bg if possible.
*/
- if (cr < 2 &&
+ if (cr < CR_FAST &&
(!sbi->s_log_groups_per_flex ||
((group & ((1 << sbi->s_log_groups_per_flex) - 1)) != 0)) &&
!(ext4_has_group_desc_csum(sb) &&
@@ -2534,11 +2702,9 @@ ext4_group_t ext4_mb_prefetch(struct super_block *sb, ext4_group_t group,
* prefetch once, so we avoid getblk() call, which can
* be expensive.
*/
- if (!EXT4_MB_GRP_TEST_AND_SET_READ(grp) &&
+ if (gdp && grp && !EXT4_MB_GRP_TEST_AND_SET_READ(grp) &&
EXT4_MB_GRP_NEED_INIT(grp) &&
- ext4_free_group_clusters(sb, gdp) > 0 &&
- !(ext4_has_group_desc_csum(sb) &&
- (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)))) {
+ ext4_free_group_clusters(sb, gdp) > 0 ) {
bh = ext4_read_block_bitmap_nowait(sb, group, true);
if (bh && !IS_ERR(bh)) {
if (!buffer_uptodate(bh) && cnt)
@@ -2578,10 +2744,8 @@ void ext4_mb_prefetch_fini(struct super_block *sb, ext4_group_t group,
gdp = ext4_get_group_desc(sb, group, NULL);
grp = ext4_get_group_info(sb, group);
- if (EXT4_MB_GRP_NEED_INIT(grp) &&
- ext4_free_group_clusters(sb, gdp) > 0 &&
- !(ext4_has_group_desc_csum(sb) &&
- (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)))) {
+ if (grp && gdp && EXT4_MB_GRP_NEED_INIT(grp) &&
+ ext4_free_group_clusters(sb, gdp) > 0) {
if (ext4_mb_init_group(sb, group, GFP_NOFS))
break;
}
@@ -2592,7 +2756,7 @@ static noinline_for_stack int
ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
{
ext4_group_t prefetch_grp = 0, ngroups, group, i;
- int cr = -1, new_cr;
+ enum criteria new_cr, cr = CR_GOAL_LEN_FAST;
int err = 0, first_err = 0;
unsigned int nr = 0, prefetch_ios = 0;
struct ext4_sb_info *sbi;
@@ -2649,14 +2813,15 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
spin_unlock(&sbi->s_md_lock);
}
- /* Let's just scan groups to find more-less suitable blocks */
- cr = ac->ac_2order ? 0 : 1;
/*
- * cr == 0 try to get exact allocation,
- * cr == 3 try to get anything
+ * Let's just scan groups to find more-less suitable blocks We
+ * start with CR_GOAL_LEN_FAST, unless it is power of 2
+ * aligned, in which case let's do that faster approach first.
*/
+ if (ac->ac_2order)
+ cr = CR_POWER2_ALIGNED;
repeat:
- for (; cr < 4 && ac->ac_status == AC_STATUS_CONTINUE; cr++) {
+ for (; cr < EXT4_MB_NUM_CRS && ac->ac_status == AC_STATUS_CONTINUE; cr++) {
ac->ac_criteria = cr;
/*
* searching for the right group start
@@ -2683,10 +2848,8 @@ repeat:
* spend a lot of time loading imperfect groups
*/
if ((prefetch_grp == group) &&
- (cr > 1 ||
+ (cr >= CR_FAST ||
prefetch_ios < sbi->s_mb_prefetch_limit)) {
- unsigned int curr_ios = prefetch_ios;
-
nr = sbi->s_mb_prefetch;
if (ext4_has_feature_flex_bg(sb)) {
nr = 1 << sbi->s_log_groups_per_flex;
@@ -2695,8 +2858,6 @@ repeat:
}
prefetch_grp = ext4_mb_prefetch(sb, group,
nr, &prefetch_ios);
- if (prefetch_ios == curr_ios)
- nr = 0;
}
/* This now checks without needing the buddy page */
@@ -2725,10 +2886,13 @@ repeat:
}
ac->ac_groups_scanned++;
- if (cr == 0)
+ if (cr == CR_POWER2_ALIGNED)
ext4_mb_simple_scan_group(ac, &e4b);
- else if (cr == 1 && sbi->s_stripe &&
- !(ac->ac_g_ex.fe_len % sbi->s_stripe))
+ else if ((cr == CR_GOAL_LEN_FAST ||
+ cr == CR_BEST_AVAIL_LEN) &&
+ sbi->s_stripe &&
+ !(ac->ac_g_ex.fe_len %
+ EXT4_B2C(sbi, sbi->s_stripe)))
ext4_mb_scan_aligned(ac, &e4b);
else
ext4_mb_complex_scan_group(ac, &e4b);
@@ -2742,6 +2906,11 @@ repeat:
/* Processed all groups and haven't found blocks */
if (sbi->s_mb_stats && i == ngroups)
atomic64_inc(&sbi->s_bal_cX_failed[cr]);
+
+ if (i == ngroups && ac->ac_criteria == CR_BEST_AVAIL_LEN)
+ /* Reset goal length to original goal length before
+ * falling into CR_GOAL_LEN_SLOW */
+ ac->ac_g_ex.fe_len = ac->ac_orig_goal_len;
}
if (ac->ac_b_ex.fe_len > 0 && ac->ac_status != AC_STATUS_FOUND &&
@@ -2767,7 +2936,7 @@ repeat:
ac->ac_b_ex.fe_len = 0;
ac->ac_status = AC_STATUS_CONTINUE;
ac->ac_flags |= EXT4_MB_HINT_FIRST;
- cr = 3;
+ cr = CR_ANY_FREE;
goto repeat;
}
}
@@ -2837,6 +3006,8 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
sizeof(struct ext4_group_info);
grinfo = ext4_get_group_info(sb, group);
+ if (!grinfo)
+ return 0;
/* Load the group info in memory only if not already loaded. */
if (unlikely(EXT4_MB_GRP_NEED_INIT(grinfo))) {
err = ext4_mb_load_buddy(sb, group, &e4b);
@@ -2847,7 +3018,7 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
buddy_loaded = 1;
}
- memcpy(&sg, ext4_get_group_info(sb, group), i);
+ memcpy(&sg, grinfo, i);
if (buddy_loaded)
ext4_mb_unload_buddy(&e4b);
@@ -2881,51 +3052,94 @@ int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset)
seq_puts(seq, "mballoc:\n");
if (!sbi->s_mb_stats) {
seq_puts(seq, "\tmb stats collection turned off.\n");
- seq_puts(seq, "\tTo enable, please write \"1\" to sysfs file mb_stats.\n");
+ seq_puts(
+ seq,
+ "\tTo enable, please write \"1\" to sysfs file mb_stats.\n");
return 0;
}
seq_printf(seq, "\treqs: %u\n", atomic_read(&sbi->s_bal_reqs));
seq_printf(seq, "\tsuccess: %u\n", atomic_read(&sbi->s_bal_success));
- seq_printf(seq, "\tgroups_scanned: %u\n", atomic_read(&sbi->s_bal_groups_scanned));
-
- seq_puts(seq, "\tcr0_stats:\n");
- seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[0]));
- seq_printf(seq, "\t\tgroups_considered: %llu\n",
- atomic64_read(&sbi->s_bal_cX_groups_considered[0]));
+ seq_printf(seq, "\tgroups_scanned: %u\n",
+ atomic_read(&sbi->s_bal_groups_scanned));
+
+ /* CR_POWER2_ALIGNED stats */
+ seq_puts(seq, "\tcr_p2_aligned_stats:\n");
+ seq_printf(seq, "\t\thits: %llu\n",
+ atomic64_read(&sbi->s_bal_cX_hits[CR_POWER2_ALIGNED]));
+ seq_printf(
+ seq, "\t\tgroups_considered: %llu\n",
+ atomic64_read(
+ &sbi->s_bal_cX_groups_considered[CR_POWER2_ALIGNED]));
+ seq_printf(seq, "\t\textents_scanned: %u\n",
+ atomic_read(&sbi->s_bal_cX_ex_scanned[CR_POWER2_ALIGNED]));
seq_printf(seq, "\t\tuseless_loops: %llu\n",
- atomic64_read(&sbi->s_bal_cX_failed[0]));
+ atomic64_read(&sbi->s_bal_cX_failed[CR_POWER2_ALIGNED]));
seq_printf(seq, "\t\tbad_suggestions: %u\n",
- atomic_read(&sbi->s_bal_cr0_bad_suggestions));
+ atomic_read(&sbi->s_bal_p2_aligned_bad_suggestions));
- seq_puts(seq, "\tcr1_stats:\n");
- seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[1]));
+ /* CR_GOAL_LEN_FAST stats */
+ seq_puts(seq, "\tcr_goal_fast_stats:\n");
+ seq_printf(seq, "\t\thits: %llu\n",
+ atomic64_read(&sbi->s_bal_cX_hits[CR_GOAL_LEN_FAST]));
seq_printf(seq, "\t\tgroups_considered: %llu\n",
- atomic64_read(&sbi->s_bal_cX_groups_considered[1]));
+ atomic64_read(
+ &sbi->s_bal_cX_groups_considered[CR_GOAL_LEN_FAST]));
+ seq_printf(seq, "\t\textents_scanned: %u\n",
+ atomic_read(&sbi->s_bal_cX_ex_scanned[CR_GOAL_LEN_FAST]));
seq_printf(seq, "\t\tuseless_loops: %llu\n",
- atomic64_read(&sbi->s_bal_cX_failed[1]));
+ atomic64_read(&sbi->s_bal_cX_failed[CR_GOAL_LEN_FAST]));
seq_printf(seq, "\t\tbad_suggestions: %u\n",
- atomic_read(&sbi->s_bal_cr1_bad_suggestions));
-
- seq_puts(seq, "\tcr2_stats:\n");
- seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[2]));
- seq_printf(seq, "\t\tgroups_considered: %llu\n",
- atomic64_read(&sbi->s_bal_cX_groups_considered[2]));
+ atomic_read(&sbi->s_bal_goal_fast_bad_suggestions));
+
+ /* CR_BEST_AVAIL_LEN stats */
+ seq_puts(seq, "\tcr_best_avail_stats:\n");
+ seq_printf(seq, "\t\thits: %llu\n",
+ atomic64_read(&sbi->s_bal_cX_hits[CR_BEST_AVAIL_LEN]));
+ seq_printf(
+ seq, "\t\tgroups_considered: %llu\n",
+ atomic64_read(
+ &sbi->s_bal_cX_groups_considered[CR_BEST_AVAIL_LEN]));
+ seq_printf(seq, "\t\textents_scanned: %u\n",
+ atomic_read(&sbi->s_bal_cX_ex_scanned[CR_BEST_AVAIL_LEN]));
seq_printf(seq, "\t\tuseless_loops: %llu\n",
- atomic64_read(&sbi->s_bal_cX_failed[2]));
+ atomic64_read(&sbi->s_bal_cX_failed[CR_BEST_AVAIL_LEN]));
+ seq_printf(seq, "\t\tbad_suggestions: %u\n",
+ atomic_read(&sbi->s_bal_best_avail_bad_suggestions));
- seq_puts(seq, "\tcr3_stats:\n");
- seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[3]));
+ /* CR_GOAL_LEN_SLOW stats */
+ seq_puts(seq, "\tcr_goal_slow_stats:\n");
+ seq_printf(seq, "\t\thits: %llu\n",
+ atomic64_read(&sbi->s_bal_cX_hits[CR_GOAL_LEN_SLOW]));
seq_printf(seq, "\t\tgroups_considered: %llu\n",
- atomic64_read(&sbi->s_bal_cX_groups_considered[3]));
+ atomic64_read(
+ &sbi->s_bal_cX_groups_considered[CR_GOAL_LEN_SLOW]));
+ seq_printf(seq, "\t\textents_scanned: %u\n",
+ atomic_read(&sbi->s_bal_cX_ex_scanned[CR_GOAL_LEN_SLOW]));
seq_printf(seq, "\t\tuseless_loops: %llu\n",
- atomic64_read(&sbi->s_bal_cX_failed[3]));
- seq_printf(seq, "\textents_scanned: %u\n", atomic_read(&sbi->s_bal_ex_scanned));
+ atomic64_read(&sbi->s_bal_cX_failed[CR_GOAL_LEN_SLOW]));
+
+ /* CR_ANY_FREE stats */
+ seq_puts(seq, "\tcr_any_free_stats:\n");
+ seq_printf(seq, "\t\thits: %llu\n",
+ atomic64_read(&sbi->s_bal_cX_hits[CR_ANY_FREE]));
+ seq_printf(
+ seq, "\t\tgroups_considered: %llu\n",
+ atomic64_read(&sbi->s_bal_cX_groups_considered[CR_ANY_FREE]));
+ seq_printf(seq, "\t\textents_scanned: %u\n",
+ atomic_read(&sbi->s_bal_cX_ex_scanned[CR_ANY_FREE]));
+ seq_printf(seq, "\t\tuseless_loops: %llu\n",
+ atomic64_read(&sbi->s_bal_cX_failed[CR_ANY_FREE]));
+
+ /* Aggregates */
+ seq_printf(seq, "\textents_scanned: %u\n",
+ atomic_read(&sbi->s_bal_ex_scanned));
seq_printf(seq, "\t\tgoal_hits: %u\n", atomic_read(&sbi->s_bal_goals));
+ seq_printf(seq, "\t\tlen_goal_hits: %u\n",
+ atomic_read(&sbi->s_bal_len_goals));
seq_printf(seq, "\t\t2^n_hits: %u\n", atomic_read(&sbi->s_bal_2orders));
seq_printf(seq, "\t\tbreaks: %u\n", atomic_read(&sbi->s_bal_breaks));
seq_printf(seq, "\t\tlost: %u\n", atomic_read(&sbi->s_mb_lost_chunks));
-
seq_printf(seq, "\tbuddies_generated: %u/%u\n",
atomic_read(&sbi->s_mb_buddies_generated),
ext4_get_groups_count(sb));
@@ -2933,8 +3147,7 @@ int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset)
atomic64_read(&sbi->s_mb_generation_time));
seq_printf(seq, "\tpreallocated: %u\n",
atomic_read(&sbi->s_mb_preallocated));
- seq_printf(seq, "\tdiscarded: %u\n",
- atomic_read(&sbi->s_mb_discarded));
+ seq_printf(seq, "\tdiscarded: %u\n", atomic_read(&sbi->s_mb_discarded));
return 0;
}
@@ -3208,8 +3421,12 @@ static int ext4_mb_init_backend(struct super_block *sb)
err_freebuddy:
cachep = get_groupinfo_cache(sb->s_blocksize_bits);
- while (i-- > 0)
- kmem_cache_free(cachep, ext4_get_group_info(sb, i));
+ while (i-- > 0) {
+ struct ext4_group_info *grp = ext4_get_group_info(sb, i);
+
+ if (grp)
+ kmem_cache_free(cachep, grp);
+ }
i = sbi->s_group_info_size;
rcu_read_lock();
group_info = rcu_dereference(sbi->s_group_info);
@@ -3417,6 +3634,8 @@ int ext4_mb_init(struct super_block *sb)
sbi->s_mb_stats = MB_DEFAULT_STATS;
sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD;
sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS;
+ sbi->s_mb_best_avail_max_trim_order = MB_DEFAULT_BEST_AVAIL_TRIM_ORDER;
+
/*
* The default group preallocation is 512, which for 4k block
* sizes translates to 2 megabytes. However for bigalloc file
@@ -3441,7 +3660,7 @@ int ext4_mb_init(struct super_block *sb)
*/
if (sbi->s_stripe > 1) {
sbi->s_mb_group_prealloc = roundup(
- sbi->s_mb_group_prealloc, sbi->s_stripe);
+ sbi->s_mb_group_prealloc, EXT4_B2C(sbi, sbi->s_stripe));
}
sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group);
@@ -3522,6 +3741,8 @@ int ext4_mb_release(struct super_block *sb)
for (i = 0; i < ngroups; i++) {
cond_resched();
grinfo = ext4_get_group_info(sb, i);
+ if (!grinfo)
+ continue;
mb_group_bb_bitmap_free(grinfo);
ext4_lock_group(sb, i);
count = ext4_mb_cleanup_pa(grinfo);
@@ -4244,7 +4465,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
(22 - bsbits)) << 22;
size = 4 * 1024 * 1024;
- } else if (NRL_CHECK_SIZE(ac->ac_o_ex.fe_len,
+ } else if (NRL_CHECK_SIZE(EXT4_C2B(sbi, ac->ac_o_ex.fe_len),
(8<<20)>>bsbits, max, 8 * 1024)) {
start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
(23 - bsbits)) << 23;
@@ -4318,6 +4539,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
* placement or satisfy big request as is */
ac->ac_g_ex.fe_logical = start;
ac->ac_g_ex.fe_len = EXT4_NUM_B2C(sbi, size);
+ ac->ac_orig_goal_len = ac->ac_g_ex.fe_len;
/* define goal start in order to merge */
if (ar->pright && (ar->lright == (start + size)) &&
@@ -4351,11 +4573,20 @@ static void ext4_mb_collect_stats(struct ext4_allocation_context *ac)
atomic_add(ac->ac_b_ex.fe_len, &sbi->s_bal_allocated);
if (ac->ac_b_ex.fe_len >= ac->ac_o_ex.fe_len)
atomic_inc(&sbi->s_bal_success);
+
atomic_add(ac->ac_found, &sbi->s_bal_ex_scanned);
+ for (int i=0; i<EXT4_MB_NUM_CRS; i++) {
+ atomic_add(ac->ac_cX_found[i], &sbi->s_bal_cX_ex_scanned[i]);
+ }
+
atomic_add(ac->ac_groups_scanned, &sbi->s_bal_groups_scanned);
if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start &&
ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group)
atomic_inc(&sbi->s_bal_goals);
+ /* did we allocate as much as normalizer originally wanted? */
+ if (ac->ac_f_ex.fe_len == ac->ac_orig_goal_len)
+ atomic_inc(&sbi->s_bal_len_goals);
+
if (ac->ac_found > sbi->s_mb_max_to_scan)
atomic_inc(&sbi->s_bal_breaks);
}
@@ -4490,6 +4721,37 @@ ext4_mb_check_group_pa(ext4_fsblk_t goal_block,
}
/*
+ * check if found pa meets EXT4_MB_HINT_GOAL_ONLY
+ */
+static bool
+ext4_mb_pa_goal_check(struct ext4_allocation_context *ac,
+ struct ext4_prealloc_space *pa)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
+ ext4_fsblk_t start;
+
+ if (likely(!(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY)))
+ return true;
+
+ /*
+ * If EXT4_MB_HINT_GOAL_ONLY is set, ac_g_ex will not be adjusted
+ * in ext4_mb_normalize_request and will keep same with ac_o_ex
+ * from ext4_mb_initialize_context. Choose ac_g_ex here to keep
+ * consistent with ext4_mb_find_by_goal.
+ */
+ start = pa->pa_pstart +
+ (ac->ac_g_ex.fe_logical - pa->pa_lstart);
+ if (ext4_grp_offs_to_block(ac->ac_sb, &ac->ac_g_ex) != start)
+ return false;
+
+ if (ac->ac_g_ex.fe_len > pa->pa_len -
+ EXT4_B2C(sbi, ac->ac_g_ex.fe_logical - pa->pa_lstart))
+ return false;
+
+ return true;
+}
+
+/*
* search goal blocks in preallocated space
*/
static noinline_for_stack bool
@@ -4539,11 +4801,11 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
/* found preallocated blocks, use them */
spin_lock(&tmp_pa->pa_lock);
- if (tmp_pa->pa_deleted == 0 && tmp_pa->pa_free) {
+ if (tmp_pa->pa_deleted == 0 && tmp_pa->pa_free &&
+ likely(ext4_mb_pa_goal_check(ac, tmp_pa))) {
atomic_inc(&tmp_pa->pa_count);
ext4_mb_use_inode_pa(ac, tmp_pa);
spin_unlock(&tmp_pa->pa_lock);
- ac->ac_criteria = 10;
read_unlock(&ei->i_prealloc_lock);
return true;
}
@@ -4586,7 +4848,6 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
}
if (cpa) {
ext4_mb_use_group_pa(ac, cpa);
- ac->ac_criteria = 20;
return true;
}
return false;
@@ -4606,6 +4867,8 @@ static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
struct ext4_free_data *entry;
grp = ext4_get_group_info(sb, group);
+ if (!grp)
+ return;
n = rb_first(&(grp->bb_free_root));
while (n) {
@@ -4633,6 +4896,9 @@ void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
int preallocated = 0;
int len;
+ if (!grp)
+ return;
+
/* all form of preallocation discards first load group,
* so the only competing code is preallocation use.
* we don't need any locking here
@@ -4805,7 +5071,7 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
pa = ac->ac_pa;
- if (ac->ac_b_ex.fe_len < ac->ac_g_ex.fe_len) {
+ if (ac->ac_b_ex.fe_len < ac->ac_orig_goal_len) {
int new_bex_start;
int new_bex_end;
@@ -4820,14 +5086,14 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
* fragmentation in check while ensuring logical range of best
* extent doesn't overflow out of goal extent:
*
- * 1. Check if best ex can be kept at end of goal and still
- * cover original start
+ * 1. Check if best ex can be kept at end of goal (before
+ * cr_best_avail trimmed it) and still cover original start
* 2. Else, check if best ex can be kept at start of goal and
* still cover original start
* 3. Else, keep the best ex at start of original request.
*/
new_bex_end = ac->ac_g_ex.fe_logical +
- EXT4_C2B(sbi, ac->ac_g_ex.fe_len);
+ EXT4_C2B(sbi, ac->ac_orig_goal_len);
new_bex_start = new_bex_end - EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
if (ac->ac_o_ex.fe_logical >= new_bex_start)
goto adjust_bex;
@@ -4848,7 +5114,7 @@ adjust_bex:
BUG_ON(ac->ac_o_ex.fe_logical < ac->ac_b_ex.fe_logical);
BUG_ON(ac->ac_o_ex.fe_len > ac->ac_b_ex.fe_len);
BUG_ON(new_bex_end > (ac->ac_g_ex.fe_logical +
- EXT4_C2B(sbi, ac->ac_g_ex.fe_len)));
+ EXT4_C2B(sbi, ac->ac_orig_goal_len)));
}
pa->pa_lstart = ac->ac_b_ex.fe_logical;
@@ -4869,6 +5135,8 @@ adjust_bex:
ei = EXT4_I(ac->ac_inode);
grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group);
+ if (!grp)
+ return;
pa->pa_node_lock.inode_lock = &ei->i_prealloc_lock;
pa->pa_inode = ac->ac_inode;
@@ -4918,6 +5186,8 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac)
atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated);
grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group);
+ if (!grp)
+ return;
lg = ac->ac_lg;
BUG_ON(lg == NULL);
@@ -5013,7 +5283,11 @@ ext4_mb_release_group_pa(struct ext4_buddy *e4b,
trace_ext4_mb_release_group_pa(sb, pa);
BUG_ON(pa->pa_deleted == 0);
ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
- BUG_ON(group != e4b->bd_group && pa->pa_len != 0);
+ if (unlikely(group != e4b->bd_group && pa->pa_len != 0)) {
+ ext4_warning(sb, "bad group: expected %u, group %u, pa_start %llu",
+ e4b->bd_group, group, pa->pa_pstart);
+ return 0;
+ }
mb_free_blocks(pa->pa_inode, e4b, bit, pa->pa_len);
atomic_add(pa->pa_len, &EXT4_SB(sb)->s_mb_discarded);
trace_ext4_mballoc_discard(sb, NULL, group, bit, pa->pa_len);
@@ -5043,6 +5317,8 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
int err;
int free = 0;
+ if (!grp)
+ return 0;
mb_debug(sb, "discard preallocation for group %u\n", group);
if (list_empty(&grp->bb_prealloc_list))
goto out_dbg;
@@ -5297,6 +5573,9 @@ static inline void ext4_mb_show_pa(struct super_block *sb)
struct ext4_prealloc_space *pa;
ext4_grpblk_t start;
struct list_head *cur;
+
+ if (!grp)
+ continue;
ext4_lock_group(sb, i);
list_for_each(cur, &grp->bb_prealloc_list) {
pa = list_entry(cur, struct ext4_prealloc_space,
@@ -5342,6 +5621,10 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
(unsigned long)ac->ac_b_ex.fe_logical,
(int)ac->ac_criteria);
mb_debug(sb, "%u found", ac->ac_found);
+ mb_debug(sb, "used pa: %s, ", ac->ac_pa ? "yes" : "no");
+ if (ac->ac_pa)
+ mb_debug(sb, "pa_type %s\n", ac->ac_pa->pa_type == MB_GROUP_PA ?
+ "group pa" : "inode pa");
ext4_mb_show_pa(sb);
}
#else
@@ -5451,6 +5734,7 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac,
ac->ac_o_ex.fe_start = block;
ac->ac_o_ex.fe_len = len;
ac->ac_g_ex = ac->ac_o_ex;
+ ac->ac_orig_goal_len = ac->ac_g_ex.fe_len;
ac->ac_flags = ar->flags;
/* we have to define context: we'll work with a file or
@@ -5694,8 +5978,72 @@ out_dbg:
return ret;
}
-static ext4_fsblk_t ext4_mb_new_blocks_simple(handle_t *handle,
- struct ext4_allocation_request *ar, int *errp);
+/*
+ * Simple allocator for Ext4 fast commit replay path. It searches for blocks
+ * linearly starting at the goal block and also excludes the blocks which
+ * are going to be in use after fast commit replay.
+ */
+static ext4_fsblk_t
+ext4_mb_new_blocks_simple(struct ext4_allocation_request *ar, int *errp)
+{
+ struct buffer_head *bitmap_bh;
+ struct super_block *sb = ar->inode->i_sb;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ ext4_group_t group, nr;
+ ext4_grpblk_t blkoff;
+ ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb);
+ ext4_grpblk_t i = 0;
+ ext4_fsblk_t goal, block;
+ struct ext4_super_block *es = EXT4_SB(sb)->s_es;
+
+ goal = ar->goal;
+ if (goal < le32_to_cpu(es->s_first_data_block) ||
+ goal >= ext4_blocks_count(es))
+ goal = le32_to_cpu(es->s_first_data_block);
+
+ ar->len = 0;
+ ext4_get_group_no_and_offset(sb, goal, &group, &blkoff);
+ for (nr = ext4_get_groups_count(sb); nr > 0; nr--) {
+ bitmap_bh = ext4_read_block_bitmap(sb, group);
+ if (IS_ERR(bitmap_bh)) {
+ *errp = PTR_ERR(bitmap_bh);
+ pr_warn("Failed to read block bitmap\n");
+ return 0;
+ }
+
+ while (1) {
+ i = mb_find_next_zero_bit(bitmap_bh->b_data, max,
+ blkoff);
+ if (i >= max)
+ break;
+ if (ext4_fc_replay_check_excluded(sb,
+ ext4_group_first_block_no(sb, group) +
+ EXT4_C2B(sbi, i))) {
+ blkoff = i + 1;
+ } else
+ break;
+ }
+ brelse(bitmap_bh);
+ if (i < max)
+ break;
+
+ if (++group >= ext4_get_groups_count(sb))
+ group = 0;
+
+ blkoff = 0;
+ }
+
+ if (i >= max) {
+ *errp = -ENOSPC;
+ return 0;
+ }
+
+ block = ext4_group_first_block_no(sb, group) + EXT4_C2B(sbi, i);
+ ext4_mb_mark_bb(sb, block, 1, 1);
+ ar->len = 1;
+
+ return block;
+}
/*
* Main entry point into mballoc to allocate blocks
@@ -5720,7 +6068,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
trace_ext4_request_blocks(ar);
if (sbi->s_mount_state & EXT4_FC_REPLAY)
- return ext4_mb_new_blocks_simple(handle, ar, errp);
+ return ext4_mb_new_blocks_simple(ar, errp);
/* Allow to use superuser reservation for quota file */
if (ext4_is_quota_file(ar->inode))
@@ -5944,68 +6292,6 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
spin_unlock(&sbi->s_md_lock);
}
-/*
- * Simple allocator for Ext4 fast commit replay path. It searches for blocks
- * linearly starting at the goal block and also excludes the blocks which
- * are going to be in use after fast commit replay.
- */
-static ext4_fsblk_t ext4_mb_new_blocks_simple(handle_t *handle,
- struct ext4_allocation_request *ar, int *errp)
-{
- struct buffer_head *bitmap_bh;
- struct super_block *sb = ar->inode->i_sb;
- ext4_group_t group;
- ext4_grpblk_t blkoff;
- ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb);
- ext4_grpblk_t i = 0;
- ext4_fsblk_t goal, block;
- struct ext4_super_block *es = EXT4_SB(sb)->s_es;
-
- goal = ar->goal;
- if (goal < le32_to_cpu(es->s_first_data_block) ||
- goal >= ext4_blocks_count(es))
- goal = le32_to_cpu(es->s_first_data_block);
-
- ar->len = 0;
- ext4_get_group_no_and_offset(sb, goal, &group, &blkoff);
- for (; group < ext4_get_groups_count(sb); group++) {
- bitmap_bh = ext4_read_block_bitmap(sb, group);
- if (IS_ERR(bitmap_bh)) {
- *errp = PTR_ERR(bitmap_bh);
- pr_warn("Failed to read block bitmap\n");
- return 0;
- }
-
- while (1) {
- i = mb_find_next_zero_bit(bitmap_bh->b_data, max,
- blkoff);
- if (i >= max)
- break;
- if (ext4_fc_replay_check_excluded(sb,
- ext4_group_first_block_no(sb, group) + i)) {
- blkoff = i + 1;
- } else
- break;
- }
- brelse(bitmap_bh);
- if (i < max)
- break;
-
- blkoff = 0;
- }
-
- if (group >= ext4_get_groups_count(sb) || i >= max) {
- *errp = -ENOSPC;
- return 0;
- }
-
- block = ext4_group_first_block_no(sb, group) + i;
- ext4_mb_mark_bb(sb, block, 1, 1);
- ar->len = 1;
-
- return block;
-}
-
static void ext4_free_blocks_simple(struct inode *inode, ext4_fsblk_t block,
unsigned long count)
{
@@ -6064,6 +6350,7 @@ static void ext4_mb_clear_bb(handle_t *handle, struct inode *inode,
struct buffer_head *bitmap_bh = NULL;
struct super_block *sb = inode->i_sb;
struct ext4_group_desc *gdp;
+ struct ext4_group_info *grp;
unsigned int overflow;
ext4_grpblk_t bit;
struct buffer_head *gd_bh;
@@ -6089,8 +6376,8 @@ do_more:
overflow = 0;
ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
- if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(
- ext4_get_group_info(sb, block_group))))
+ grp = ext4_get_group_info(sb, block_group);
+ if (unlikely(!grp || EXT4_MB_GRP_BBITMAP_CORRUPT(grp)))
return;
/*
@@ -6185,8 +6472,8 @@ do_more:
* them with group lock_held
*/
if (test_opt(sb, DISCARD)) {
- err = ext4_issue_discard(sb, block_group, bit, count,
- NULL);
+ err = ext4_issue_discard(sb, block_group, bit,
+ count_clusters, NULL);
if (err && err != -EOPNOTSUPP)
ext4_msg(sb, KERN_WARNING, "discard request in"
" group:%u block:%d count:%lu failed"
@@ -6270,12 +6557,6 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
sbi = EXT4_SB(sb);
- if (sbi->s_mount_state & EXT4_FC_REPLAY) {
- ext4_free_blocks_simple(inode, block, count);
- return;
- }
-
- might_sleep();
if (bh) {
if (block)
BUG_ON(block != bh->b_blocknr);
@@ -6283,6 +6564,13 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
block = bh->b_blocknr;
}
+ if (sbi->s_mount_state & EXT4_FC_REPLAY) {
+ ext4_free_blocks_simple(inode, block, EXT4_NUM_B2C(sbi, count));
+ return;
+ }
+
+ might_sleep();
+
if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) &&
!ext4_inode_block_valid(inode, block, count)) {
ext4_error(sb, "Freeing blocks not in datazone - "
@@ -6692,6 +6980,8 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
for (group = first_group; group <= last_group; group++) {
grp = ext4_get_group_info(sb, group);
+ if (!grp)
+ continue;
/* We only do this if the grp has never been initialized */
if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
ret = ext4_mb_init_group(sb, group, GFP_NOFS);
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index 6d85ee8674a6..df6b5e7c2274 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -49,7 +49,7 @@
#define MB_DEFAULT_MIN_TO_SCAN 10
/*
- * with 'ext4_mb_stats' allocator will collect stats that will be
+ * with 's_mb_stats' allocator will collect stats that will be
* shown at umount. The collecting costs though!
*/
#define MB_DEFAULT_STATS 0
@@ -86,6 +86,13 @@
#define MB_DEFAULT_LINEAR_SCAN_THRESHOLD 16
/*
+ * The maximum order upto which CR_BEST_AVAIL_LEN can trim a particular
+ * allocation request. Example, if we have an order 7 request and max trim order
+ * of 3, we can trim this request upto order 4.
+ */
+#define MB_DEFAULT_BEST_AVAIL_TRIM_ORDER 3
+
+/*
* Number of valid buddy orders
*/
#define MB_NUM_ORDERS(sb) ((sb)->s_blocksize_bits + 2)
@@ -179,11 +186,18 @@ struct ext4_allocation_context {
/* copy of the best found extent taken before preallocation efforts */
struct ext4_free_extent ac_f_ex;
+ /*
+ * goal len can change in CR1.5, so save the original len. This is
+ * used while adjusting the PA window and for accounting.
+ */
+ ext4_grpblk_t ac_orig_goal_len;
+
__u32 ac_groups_considered;
__u32 ac_flags; /* allocation hints */
__u16 ac_groups_scanned;
__u16 ac_groups_linear_remaining;
__u16 ac_found;
+ __u16 ac_cX_found[EXT4_MB_NUM_CRS];
__u16 ac_tail;
__u16 ac_buddy;
__u8 ac_status;
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index a19a9661646e..d98ac2af8199 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -408,7 +408,6 @@ static int free_ext_block(handle_t *handle, struct inode *inode)
int ext4_ext_migrate(struct inode *inode)
{
- struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
handle_t *handle;
int retval = 0, i;
__le32 *i_data;
@@ -418,6 +417,7 @@ int ext4_ext_migrate(struct inode *inode)
unsigned long max_entries;
__u32 goal, tmp_csum_seed;
uid_t owner[2];
+ int alloc_ctx;
/*
* If the filesystem does not support extents, or the inode
@@ -434,7 +434,7 @@ int ext4_ext_migrate(struct inode *inode)
*/
return retval;
- percpu_down_write(&sbi->s_writepages_rwsem);
+ alloc_ctx = ext4_writepages_down_write(inode->i_sb);
/*
* Worst case we can touch the allocation bitmaps and a block
@@ -586,7 +586,7 @@ out_tmp_inode:
unlock_new_inode(tmp_inode);
iput(tmp_inode);
out_unlock:
- percpu_up_write(&sbi->s_writepages_rwsem);
+ ext4_writepages_up_write(inode->i_sb, alloc_ctx);
return retval;
}
@@ -605,6 +605,7 @@ int ext4_ind_migrate(struct inode *inode)
ext4_fsblk_t blk;
handle_t *handle;
int ret, ret2 = 0;
+ int alloc_ctx;
if (!ext4_has_feature_extents(inode->i_sb) ||
(!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
@@ -621,7 +622,7 @@ int ext4_ind_migrate(struct inode *inode)
if (test_opt(inode->i_sb, DELALLOC))
ext4_alloc_da_blocks(inode);
- percpu_down_write(&sbi->s_writepages_rwsem);
+ alloc_ctx = ext4_writepages_down_write(inode->i_sb);
handle = ext4_journal_start(inode, EXT4_HT_MIGRATE, 1);
if (IS_ERR(handle)) {
@@ -665,6 +666,6 @@ errout:
ext4_journal_stop(handle);
up_write(&EXT4_I(inode)->i_data_sem);
out_unlock:
- percpu_up_write(&sbi->s_writepages_rwsem);
+ ext4_writepages_up_write(inode->i_sb, alloc_ctx);
return ret;
}
diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c
index 4022bc713421..0aaf38ffcb6e 100644
--- a/fs/ext4/mmp.c
+++ b/fs/ext4/mmp.c
@@ -39,28 +39,36 @@ static void ext4_mmp_csum_set(struct super_block *sb, struct mmp_struct *mmp)
* Write the MMP block using REQ_SYNC to try to get the block on-disk
* faster.
*/
-static int write_mmp_block(struct super_block *sb, struct buffer_head *bh)
+static int write_mmp_block_thawed(struct super_block *sb,
+ struct buffer_head *bh)
{
struct mmp_struct *mmp = (struct mmp_struct *)(bh->b_data);
- /*
- * We protect against freezing so that we don't create dirty buffers
- * on frozen filesystem.
- */
- sb_start_write(sb);
ext4_mmp_csum_set(sb, mmp);
lock_buffer(bh);
bh->b_end_io = end_buffer_write_sync;
get_bh(bh);
submit_bh(REQ_OP_WRITE | REQ_SYNC | REQ_META | REQ_PRIO, bh);
wait_on_buffer(bh);
- sb_end_write(sb);
if (unlikely(!buffer_uptodate(bh)))
return -EIO;
-
return 0;
}
+static int write_mmp_block(struct super_block *sb, struct buffer_head *bh)
+{
+ int err;
+
+ /*
+ * We protect against freezing so that we don't create dirty buffers
+ * on frozen filesystem.
+ */
+ sb_start_write(sb);
+ err = write_mmp_block_thawed(sb, bh);
+ sb_end_write(sb);
+ return err;
+}
+
/*
* Read the MMP block. It _must_ be read from disk and hence we clear the
* uptodate flag on the buffer.
@@ -344,7 +352,11 @@ skip:
seq = mmp_new_seq();
mmp->mmp_seq = cpu_to_le32(seq);
- retval = write_mmp_block(sb, bh);
+ /*
+ * On mount / remount we are protected against fs freezing (by s_umount
+ * semaphore) and grabbing freeze protection upsets lockdep
+ */
+ retval = write_mmp_block_thawed(sb, bh);
if (retval)
goto failed;
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index a5010b5b8a8c..0caf6c730ce3 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -674,7 +674,7 @@ static struct stats dx_show_leaf(struct inode *dir,
len = de->name_len;
if (!IS_ENCRYPTED(dir)) {
/* Directory is not encrypted */
- ext4fs_dirhash(dir, de->name,
+ (void) ext4fs_dirhash(dir, de->name,
de->name_len, &h);
printk("%*.s:(U)%x.%u ", len,
name, h.hash,
@@ -709,8 +709,9 @@ static struct stats dx_show_leaf(struct inode *dir,
if (IS_CASEFOLDED(dir))
h.hash = EXT4_DIRENT_HASH(de);
else
- ext4fs_dirhash(dir, de->name,
- de->name_len, &h);
+ (void) ext4fs_dirhash(dir,
+ de->name,
+ de->name_len, &h);
printk("%*.s:(E)%x.%u ", len, name,
h.hash, (unsigned) ((char *) de
- base));
@@ -720,7 +721,8 @@ static struct stats dx_show_leaf(struct inode *dir,
#else
int len = de->name_len;
char *name = de->name;
- ext4fs_dirhash(dir, de->name, de->name_len, &h);
+ (void) ext4fs_dirhash(dir, de->name,
+ de->name_len, &h);
printk("%*.s:%x.%u ", len, name, h.hash,
(unsigned) ((char *) de - base));
#endif
@@ -849,8 +851,14 @@ dx_probe(struct ext4_filename *fname, struct inode *dir,
hinfo->seed = EXT4_SB(dir->i_sb)->s_hash_seed;
/* hash is already computed for encrypted casefolded directory */
if (fname && fname_name(fname) &&
- !(IS_ENCRYPTED(dir) && IS_CASEFOLDED(dir)))
- ext4fs_dirhash(dir, fname_name(fname), fname_len(fname), hinfo);
+ !(IS_ENCRYPTED(dir) && IS_CASEFOLDED(dir))) {
+ int ret = ext4fs_dirhash(dir, fname_name(fname),
+ fname_len(fname), hinfo);
+ if (ret < 0) {
+ ret_err = ERR_PTR(ret);
+ goto fail;
+ }
+ }
hash = hinfo->hash;
if (root->info.unused_flags & 1) {
@@ -1111,7 +1119,12 @@ static int htree_dirblock_to_tree(struct file *dir_file,
hinfo->minor_hash = 0;
}
} else {
- ext4fs_dirhash(dir, de->name, de->name_len, hinfo);
+ err = ext4fs_dirhash(dir, de->name,
+ de->name_len, hinfo);
+ if (err < 0) {
+ count = err;
+ goto errout;
+ }
}
if ((hinfo->hash < start_hash) ||
((hinfo->hash == start_hash) &&
@@ -1313,8 +1326,12 @@ static int dx_make_map(struct inode *dir, struct buffer_head *bh,
if (de->name_len && de->inode) {
if (ext4_hash_in_dirent(dir))
h.hash = EXT4_DIRENT_HASH(de);
- else
- ext4fs_dirhash(dir, de->name, de->name_len, &h);
+ else {
+ int err = ext4fs_dirhash(dir, de->name,
+ de->name_len, &h);
+ if (err < 0)
+ return err;
+ }
map_tail--;
map_tail->hash = h.hash;
map_tail->offs = ((char *) de - base)>>2;
@@ -1452,10 +1469,9 @@ int ext4_fname_setup_ci_filename(struct inode *dir, const struct qstr *iname,
hinfo->hash_version = DX_HASH_SIPHASH;
hinfo->seed = NULL;
if (cf_name->name)
- ext4fs_dirhash(dir, cf_name->name, cf_name->len, hinfo);
+ return ext4fs_dirhash(dir, cf_name->name, cf_name->len, hinfo);
else
- ext4fs_dirhash(dir, iname->name, iname->len, hinfo);
- return 0;
+ return ext4fs_dirhash(dir, iname->name, iname->len, hinfo);
}
#endif
@@ -2298,10 +2314,15 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname,
fname->hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed;
/* casefolded encrypted hashes are computed on fname setup */
- if (!ext4_hash_in_dirent(dir))
- ext4fs_dirhash(dir, fname_name(fname),
- fname_len(fname), &fname->hinfo);
-
+ if (!ext4_hash_in_dirent(dir)) {
+ int err = ext4fs_dirhash(dir, fname_name(fname),
+ fname_len(fname), &fname->hinfo);
+ if (err < 0) {
+ brelse(bh2);
+ brelse(bh);
+ return err;
+ }
+ }
memset(frames, 0, sizeof(frames));
frame = frames;
frame->entries = entries;
@@ -3813,19 +3834,10 @@ static int ext4_rename(struct mnt_idmap *idmap, struct inode *old_dir,
return retval;
}
- /*
- * We need to protect against old.inode directory getting converted
- * from inline directory format into a normal one.
- */
- if (S_ISDIR(old.inode->i_mode))
- inode_lock_nested(old.inode, I_MUTEX_NONDIR2);
-
old.bh = ext4_find_entry(old.dir, &old.dentry->d_name, &old.de,
&old.inlined);
- if (IS_ERR(old.bh)) {
- retval = PTR_ERR(old.bh);
- goto unlock_moved_dir;
- }
+ if (IS_ERR(old.bh))
+ return PTR_ERR(old.bh);
/*
* Check for inode number is _not_ due to possible IO errors.
@@ -4022,10 +4034,6 @@ release_bh:
brelse(old.bh);
brelse(new.bh);
-unlock_moved_dir:
- if (S_ISDIR(old.inode->i_mode))
- inode_unlock(old.inode);
-
return retval;
}
diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c
index 6f46823fba61..3e7d160f543f 100644
--- a/fs/ext4/readpage.c
+++ b/fs/ext4/readpage.c
@@ -334,7 +334,7 @@ int ext4_mpage_readpages(struct inode *inode,
folio_size(folio));
if (first_hole == 0) {
if (ext4_need_verity(inode, folio->index) &&
- !fsverity_verify_page(&folio->page))
+ !fsverity_verify_folio(folio))
goto set_error_page;
folio_mark_uptodate(folio);
folio_unlock(folio);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index d39f386e9baf..c94ebf704616 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1048,6 +1048,8 @@ void ext4_mark_group_bitmap_corrupted(struct super_block *sb,
struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL);
int ret;
+ if (!grp || !gdp)
+ return;
if (flags & EXT4_GROUP_INFO_BBITMAP_CORRUPT) {
ret = ext4_test_and_set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT,
&grp->bb_state);
@@ -1094,6 +1096,15 @@ void ext4_update_dynamic_rev(struct super_block *sb)
*/
}
+static void ext4_bdev_mark_dead(struct block_device *bdev)
+{
+ ext4_force_shutdown(bdev->bd_holder, EXT4_GOING_FLAGS_NOLOGFLUSH);
+}
+
+static const struct blk_holder_ops ext4_holder_ops = {
+ .mark_dead = ext4_bdev_mark_dead,
+};
+
/*
* Open the external journal device
*/
@@ -1101,7 +1112,8 @@ static struct block_device *ext4_blkdev_get(dev_t dev, struct super_block *sb)
{
struct block_device *bdev;
- bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, sb);
+ bdev = blkdev_get_by_dev(dev, BLK_OPEN_READ | BLK_OPEN_WRITE, sb,
+ &ext4_holder_ops);
if (IS_ERR(bdev))
goto fail;
return bdev;
@@ -1116,17 +1128,18 @@ fail:
/*
* Release the journal device
*/
-static void ext4_blkdev_put(struct block_device *bdev)
-{
- blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
-}
-
static void ext4_blkdev_remove(struct ext4_sb_info *sbi)
{
struct block_device *bdev;
bdev = sbi->s_journal_bdev;
if (bdev) {
- ext4_blkdev_put(bdev);
+ /*
+ * Invalidate the journal device's buffers. We don't want them
+ * floating about in memory - the physical journal device may
+ * hotswapped, and it breaks the `ro-after' testing code.
+ */
+ invalidate_bdev(bdev);
+ blkdev_put(bdev, sbi->s_sb);
sbi->s_journal_bdev = NULL;
}
}
@@ -1157,12 +1170,12 @@ static void dump_orphan_list(struct super_block *sb, struct ext4_sb_info *sbi)
#ifdef CONFIG_QUOTA
static int ext4_quota_off(struct super_block *sb, int type);
-static inline void ext4_quota_off_umount(struct super_block *sb)
+static inline void ext4_quotas_off(struct super_block *sb, int type)
{
- int type;
+ BUG_ON(type > EXT4_MAXQUOTAS);
/* Use our quota_off function to clear inode flags etc. */
- for (type = 0; type < EXT4_MAXQUOTAS; type++)
+ for (type--; type >= 0; type--)
ext4_quota_off(sb, type);
}
@@ -1178,7 +1191,7 @@ static inline char *get_qf_name(struct super_block *sb,
lockdep_is_held(&sb->s_umount));
}
#else
-static inline void ext4_quota_off_umount(struct super_block *sb)
+static inline void ext4_quotas_off(struct super_block *sb, int type)
{
}
#endif
@@ -1278,7 +1291,7 @@ static void ext4_put_super(struct super_block *sb)
&sb->s_uuid);
ext4_unregister_li_request(sb);
- ext4_quota_off_umount(sb);
+ ext4_quotas_off(sb, EXT4_MAXQUOTAS);
flush_work(&sbi->s_error_work);
destroy_workqueue(sbi->rsv_conversion_wq);
@@ -1325,14 +1338,8 @@ static void ext4_put_super(struct super_block *sb)
sync_blockdev(sb->s_bdev);
invalidate_bdev(sb->s_bdev);
- if (sbi->s_journal_bdev && sbi->s_journal_bdev != sb->s_bdev) {
- /*
- * Invalidate the journal device's buffers. We don't want them
- * floating about in memory - the physical journal device may
- * hotswapped, and it breaks the `ro-after' testing code.
- */
+ if (sbi->s_journal_bdev) {
sync_blockdev(sbi->s_journal_bdev);
- invalidate_bdev(sbi->s_journal_bdev);
ext4_blkdev_remove(sbi);
}
@@ -1447,6 +1454,11 @@ static void ext4_destroy_inode(struct inode *inode)
EXT4_I(inode)->i_reserved_data_blocks);
}
+static void ext4_shutdown(struct super_block *sb)
+{
+ ext4_force_shutdown(sb, EXT4_GOING_FLAGS_NOLOGFLUSH);
+}
+
static void init_once(void *foo)
{
struct ext4_inode_info *ei = foo;
@@ -1607,6 +1619,7 @@ static const struct super_operations ext4_sops = {
.unfreeze_fs = ext4_unfreeze,
.statfs = ext4_statfs,
.show_options = ext4_show_options,
+ .shutdown = ext4_shutdown,
#ifdef CONFIG_QUOTA
.quota_read = ext4_quota_read,
.quota_write = ext4_quota_write,
@@ -3238,11 +3251,9 @@ static __le16 ext4_group_desc_csum(struct super_block *sb, __u32 block_group,
crc = crc16(crc, (__u8 *)gdp, offset);
offset += sizeof(gdp->bg_checksum); /* skip checksum */
/* for checksum of struct ext4_group_desc do the rest...*/
- if (ext4_has_feature_64bit(sb) &&
- offset < le16_to_cpu(sbi->s_es->s_desc_size))
+ if (ext4_has_feature_64bit(sb) && offset < sbi->s_desc_size)
crc = crc16(crc, (__u8 *)gdp + offset,
- le16_to_cpu(sbi->s_es->s_desc_size) -
- offset);
+ sbi->s_desc_size - offset);
out:
return cpu_to_le16(crc);
@@ -3692,16 +3703,13 @@ static int ext4_run_li_request(struct ext4_li_request *elr)
ext4_group_t group = elr->lr_next_group;
unsigned int prefetch_ios = 0;
int ret = 0;
+ int nr = EXT4_SB(sb)->s_mb_prefetch;
u64 start_time;
if (elr->lr_mode == EXT4_LI_MODE_PREFETCH_BBITMAP) {
- elr->lr_next_group = ext4_mb_prefetch(sb, group,
- EXT4_SB(sb)->s_mb_prefetch, &prefetch_ios);
- if (prefetch_ios)
- ext4_mb_prefetch_fini(sb, elr->lr_next_group,
- prefetch_ios);
- trace_ext4_prefetch_bitmaps(sb, group, elr->lr_next_group,
- prefetch_ios);
+ elr->lr_next_group = ext4_mb_prefetch(sb, group, nr, &prefetch_ios);
+ ext4_mb_prefetch_fini(sb, elr->lr_next_group, nr);
+ trace_ext4_prefetch_bitmaps(sb, group, elr->lr_next_group, nr);
if (group >= elr->lr_next_group) {
ret = 1;
if (elr->lr_first_not_zeroed != ngroups &&
@@ -5297,6 +5305,19 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
goto failed_mount3;
sbi->s_stripe = ext4_get_stripe_size(sbi);
+ /*
+ * It's hard to get stripe aligned blocks if stripe is not aligned with
+ * cluster, just disable stripe and alert user to simpfy code and avoid
+ * stripe aligned allocation which will rarely successes.
+ */
+ if (sbi->s_stripe > 0 && sbi->s_cluster_ratio > 1 &&
+ sbi->s_stripe % sbi->s_cluster_ratio != 0) {
+ ext4_msg(sb, KERN_WARNING,
+ "stripe (%lu) is not aligned with cluster size (%u), "
+ "stripe is disabled",
+ sbi->s_stripe, sbi->s_cluster_ratio);
+ sbi->s_stripe = 0;
+ }
sbi->s_extent_max_zeroout_kb = 32;
/*
@@ -5567,7 +5588,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
ext4_msg(sb, KERN_INFO, "recovery complete");
err = ext4_mark_recovery_complete(sb, es);
if (err)
- goto failed_mount9;
+ goto failed_mount10;
}
if (test_opt(sb, DISCARD) && !bdev_max_discard_sectors(sb->s_bdev))
@@ -5586,7 +5607,9 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
return 0;
-failed_mount9:
+failed_mount10:
+ ext4_quotas_off(sb, EXT4_MAXQUOTAS);
+failed_mount9: __maybe_unused
ext4_release_orphan_info(sb);
failed_mount8:
ext4_unregister_sysfs(sb);
@@ -5645,6 +5668,7 @@ failed_mount:
brelse(sbi->s_sbh);
ext4_blkdev_remove(sbi);
out_fail:
+ invalidate_bdev(sb->s_bdev);
sb->s_fs_info = NULL;
return err;
}
@@ -5684,8 +5708,9 @@ static int ext4_fill_super(struct super_block *sb, struct fs_context *fc)
descr = "out journal";
if (___ratelimit(&ext4_mount_msg_ratelimit, "EXT4-fs mount"))
- ext4_msg(sb, KERN_INFO, "mounted filesystem %pU with%s. "
- "Quota mode: %s.", &sb->s_uuid, descr,
+ ext4_msg(sb, KERN_INFO, "mounted filesystem %pU %s with%s. "
+ "Quota mode: %s.", &sb->s_uuid,
+ sb_rdonly(sb) ? "ro" : "r/w", descr,
ext4_quota_mode(sb));
/* Update the s_overhead_clusters if necessary */
@@ -5726,6 +5751,11 @@ static void ext4_init_journal_params(struct super_block *sb, journal_t *journal)
journal->j_flags |= JBD2_ABORT_ON_SYNCDATA_ERR;
else
journal->j_flags &= ~JBD2_ABORT_ON_SYNCDATA_ERR;
+ /*
+ * Always enable journal cycle record option, letting the journal
+ * records log transactions continuously between each mount.
+ */
+ journal->j_flags |= JBD2_CYCLE_RECORD;
write_unlock(&journal->j_state_lock);
}
@@ -5898,7 +5928,7 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb,
out_journal:
jbd2_journal_destroy(journal);
out_bdev:
- ext4_blkdev_put(bdev);
+ blkdev_put(bdev, sb);
return NULL;
}
@@ -5978,19 +6008,27 @@ static int ext4_load_journal(struct super_block *sb,
err = jbd2_journal_wipe(journal, !really_read_only);
if (!err) {
char *save = kmalloc(EXT4_S_ERR_LEN, GFP_KERNEL);
+ __le16 orig_state;
+ bool changed = false;
if (save)
memcpy(save, ((char *) es) +
EXT4_S_ERR_START, EXT4_S_ERR_LEN);
err = jbd2_journal_load(journal);
- if (save)
+ if (save && memcmp(((char *) es) + EXT4_S_ERR_START,
+ save, EXT4_S_ERR_LEN)) {
memcpy(((char *) es) + EXT4_S_ERR_START,
save, EXT4_S_ERR_LEN);
+ changed = true;
+ }
kfree(save);
+ orig_state = es->s_state;
es->s_state |= cpu_to_le16(EXT4_SB(sb)->s_mount_state &
EXT4_ERROR_FS);
+ if (orig_state != es->s_state)
+ changed = true;
/* Write out restored error information to the superblock */
- if (!bdev_read_only(sb->s_bdev)) {
+ if (changed && !really_read_only) {
int err2;
err2 = ext4_commit_super(sb);
err = err ? : err2;
@@ -6587,18 +6625,6 @@ static int __ext4_remount(struct fs_context *fc, struct super_block *sb)
}
/*
- * Reinitialize lazy itable initialization thread based on
- * current settings
- */
- if (sb_rdonly(sb) || !test_opt(sb, INIT_INODE_TABLE))
- ext4_unregister_li_request(sb);
- else {
- ext4_group_t first_not_zeroed;
- first_not_zeroed = ext4_has_uninit_itable(sb);
- ext4_register_li_request(sb, first_not_zeroed);
- }
-
- /*
* Handle creation of system zone data early because it can fail.
* Releasing of existing data is done when we are sure remount will
* succeed.
@@ -6616,9 +6642,6 @@ static int __ext4_remount(struct fs_context *fc, struct super_block *sb)
}
#ifdef CONFIG_QUOTA
- /* Release old quota file names */
- for (i = 0; i < EXT4_MAXQUOTAS; i++)
- kfree(old_opts.s_qf_names[i]);
if (enable_quota) {
if (sb_any_quota_suspended(sb))
dquot_resume(sb, -1);
@@ -6628,16 +6651,38 @@ static int __ext4_remount(struct fs_context *fc, struct super_block *sb)
goto restore_opts;
}
}
+ /* Release old quota file names */
+ for (i = 0; i < EXT4_MAXQUOTAS; i++)
+ kfree(old_opts.s_qf_names[i]);
#endif
if (!test_opt(sb, BLOCK_VALIDITY) && sbi->s_system_blks)
ext4_release_system_zone(sb);
+ /*
+ * Reinitialize lazy itable initialization thread based on
+ * current settings
+ */
+ if (sb_rdonly(sb) || !test_opt(sb, INIT_INODE_TABLE))
+ ext4_unregister_li_request(sb);
+ else {
+ ext4_group_t first_not_zeroed;
+ first_not_zeroed = ext4_has_uninit_itable(sb);
+ ext4_register_li_request(sb, first_not_zeroed);
+ }
+
if (!ext4_has_feature_mmp(sb) || sb_rdonly(sb))
ext4_stop_mmpd(sbi);
return 0;
restore_opts:
+ /*
+ * If there was a failing r/w to ro transition, we may need to
+ * re-enable quota
+ */
+ if ((sb->s_flags & SB_RDONLY) && !(old_sb_flags & SB_RDONLY) &&
+ sb_any_quota_suspended(sb))
+ dquot_resume(sb, -1);
sb->s_flags = old_sb_flags;
sbi->s_mount_opt = old_opts.s_mount_opt;
sbi->s_mount_opt2 = old_opts.s_mount_opt2;
@@ -6678,8 +6723,9 @@ static int ext4_reconfigure(struct fs_context *fc)
if (ret < 0)
return ret;
- ext4_msg(sb, KERN_INFO, "re-mounted %pU. Quota mode: %s.",
- &sb->s_uuid, ext4_quota_mode(sb));
+ ext4_msg(sb, KERN_INFO, "re-mounted %pU %s. Quota mode: %s.",
+ &sb->s_uuid, sb_rdonly(sb) ? "ro" : "r/w",
+ ext4_quota_mode(sb));
return 0;
}
@@ -7017,20 +7063,8 @@ int ext4_enable_quotas(struct super_block *sb)
"(type=%d, err=%d, ino=%lu). "
"Please run e2fsck to fix.", type,
err, qf_inums[type]);
- for (type--; type >= 0; type--) {
- struct inode *inode;
-
- inode = sb_dqopt(sb)->files[type];
- if (inode)
- inode = igrab(inode);
- dquot_quota_off(sb, type);
- if (inode) {
- lockdep_set_quota_inode(inode,
- I_DATA_SEM_NORMAL);
- iput(inode);
- }
- }
+ ext4_quotas_off(sb, type);
return err;
}
}
diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c
index 3042bc605bbf..6d332dff79dd 100644
--- a/fs/ext4/sysfs.c
+++ b/fs/ext4/sysfs.c
@@ -223,6 +223,7 @@ EXT4_RW_ATTR_SBI_UI(warning_ratelimit_interval_ms, s_warning_ratelimit_state.int
EXT4_RW_ATTR_SBI_UI(warning_ratelimit_burst, s_warning_ratelimit_state.burst);
EXT4_RW_ATTR_SBI_UI(msg_ratelimit_interval_ms, s_msg_ratelimit_state.interval);
EXT4_RW_ATTR_SBI_UI(msg_ratelimit_burst, s_msg_ratelimit_state.burst);
+EXT4_RW_ATTR_SBI_UI(mb_best_avail_max_trim_order, s_mb_best_avail_max_trim_order);
#ifdef CONFIG_EXT4_DEBUG
EXT4_RW_ATTR_SBI_UL(simulate_fail, s_simulate_fail);
#endif
@@ -273,6 +274,7 @@ static struct attribute *ext4_attrs[] = {
ATTR_LIST(warning_ratelimit_burst),
ATTR_LIST(msg_ratelimit_interval_ms),
ATTR_LIST(msg_ratelimit_burst),
+ ATTR_LIST(mb_best_avail_max_trim_order),
ATTR_LIST(errors_count),
ATTR_LIST(warning_count),
ATTR_LIST(msg_count),
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index dadad29bd81b..321e3a888c20 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -121,7 +121,11 @@ ext4_expand_inode_array(struct ext4_xattr_inode_array **ea_inode_array,
#ifdef CONFIG_LOCKDEP
void ext4_xattr_inode_set_class(struct inode *ea_inode)
{
+ struct ext4_inode_info *ei = EXT4_I(ea_inode);
+
lockdep_set_subclass(&ea_inode->i_rwsem, 1);
+ (void) ei; /* shut up clang warning if !CONFIG_LOCKDEP */
+ lockdep_set_subclass(&ei->i_data_sem, I_DATA_SEM_EA);
}
#endif
@@ -433,7 +437,7 @@ static int ext4_xattr_inode_iget(struct inode *parent, unsigned long ea_ino,
return -EFSCORRUPTED;
}
- inode = ext4_iget(parent->i_sb, ea_ino, EXT4_IGET_NORMAL);
+ inode = ext4_iget(parent->i_sb, ea_ino, EXT4_IGET_EA_INODE);
if (IS_ERR(inode)) {
err = PTR_ERR(inode);
ext4_error(parent->i_sb,
@@ -441,23 +445,6 @@ static int ext4_xattr_inode_iget(struct inode *parent, unsigned long ea_ino,
err);
return err;
}
-
- if (is_bad_inode(inode)) {
- ext4_error(parent->i_sb,
- "error while reading EA inode %lu is_bad_inode",
- ea_ino);
- err = -EIO;
- goto error;
- }
-
- if (!(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) {
- ext4_error(parent->i_sb,
- "EA inode %lu does not have EXT4_EA_INODE_FL flag",
- ea_ino);
- err = -EINVAL;
- goto error;
- }
-
ext4_xattr_inode_set_class(inode);
/*
@@ -478,9 +465,6 @@ static int ext4_xattr_inode_iget(struct inode *parent, unsigned long ea_ino,
*ea_inode = inode;
return 0;
-error:
- iput(inode);
- return err;
}
/* Remove entry from mbcache when EA inode is getting evicted */
@@ -1556,11 +1540,11 @@ ext4_xattr_inode_cache_find(struct inode *inode, const void *value,
while (ce) {
ea_inode = ext4_iget(inode->i_sb, ce->e_value,
- EXT4_IGET_NORMAL);
- if (!IS_ERR(ea_inode) &&
- !is_bad_inode(ea_inode) &&
- (EXT4_I(ea_inode)->i_flags & EXT4_EA_INODE_FL) &&
- i_size_read(ea_inode) == value_len &&
+ EXT4_IGET_EA_INODE);
+ if (IS_ERR(ea_inode))
+ goto next_entry;
+ ext4_xattr_inode_set_class(ea_inode);
+ if (i_size_read(ea_inode) == value_len &&
!ext4_xattr_inode_read(ea_inode, ea_data, value_len) &&
!ext4_xattr_inode_verify_hashes(ea_inode, NULL, ea_data,
value_len) &&
@@ -1570,9 +1554,8 @@ ext4_xattr_inode_cache_find(struct inode *inode, const void *value,
kvfree(ea_data);
return ea_inode;
}
-
- if (!IS_ERR(ea_inode))
- iput(ea_inode);
+ iput(ea_inode);
+ next_entry:
ce = mb_cache_entry_find_next(ea_inode_cache, ce);
}
kvfree(ea_data);
@@ -2073,8 +2056,9 @@ inserted:
else {
u32 ref;
+#ifdef EXT4_XATTR_DEBUG
WARN_ON_ONCE(dquot_initialize_needed(inode));
-
+#endif
/* The old block is released after updating
the inode. */
error = dquot_alloc_block(inode,
@@ -2137,8 +2121,9 @@ inserted:
/* We need to allocate a new block */
ext4_fsblk_t goal, block;
+#ifdef EXT4_XATTR_DEBUG
WARN_ON_ONCE(dquot_initialize_needed(inode));
-
+#endif
goal = ext4_group_first_block_no(sb,
EXT4_I(inode)->i_block_group);
block = ext4_new_meta_blocks(handle, inode, goal, 0,
@@ -2614,6 +2599,7 @@ static int ext4_xattr_move_to_block(handle_t *handle, struct inode *inode,
.in_inode = !!entry->e_value_inum,
};
struct ext4_xattr_ibody_header *header = IHDR(inode, raw_inode);
+ int needs_kvfree = 0;
int error;
is = kzalloc(sizeof(struct ext4_xattr_ibody_find), GFP_NOFS);
@@ -2636,7 +2622,7 @@ static int ext4_xattr_move_to_block(handle_t *handle, struct inode *inode,
error = -ENOMEM;
goto out;
}
-
+ needs_kvfree = 1;
error = ext4_xattr_inode_get(inode, entry, buffer, value_size);
if (error)
goto out;
@@ -2675,7 +2661,7 @@ static int ext4_xattr_move_to_block(handle_t *handle, struct inode *inode,
out:
kfree(b_entry_name);
- if (entry->e_value_inum && buffer)
+ if (needs_kvfree && buffer)
kvfree(buffer);
if (is)
brelse(is->iloc.bh);