diff options
Diffstat (limited to 'fs/ext4')
-rw-r--r-- | fs/ext4/balloc.c | 167 | ||||
-rw-r--r-- | fs/ext4/bitmap.c | 13 | ||||
-rw-r--r-- | fs/ext4/ext4.h | 153 | ||||
-rw-r--r-- | fs/ext4/extents.c | 38 | ||||
-rw-r--r-- | fs/ext4/extents_status.c | 30 | ||||
-rw-r--r-- | fs/ext4/file.c | 3 | ||||
-rw-r--r-- | fs/ext4/fsync.c | 11 | ||||
-rw-r--r-- | fs/ext4/hash.c | 6 | ||||
-rw-r--r-- | fs/ext4/ialloc.c | 26 | ||||
-rw-r--r-- | fs/ext4/inline.c | 199 | ||||
-rw-r--r-- | fs/ext4/inode.c | 841 | ||||
-rw-r--r-- | fs/ext4/mballoc.c | 761 | ||||
-rw-r--r-- | fs/ext4/mballoc.h | 17 | ||||
-rw-r--r-- | fs/ext4/migrate.c | 11 | ||||
-rw-r--r-- | fs/ext4/mmp.c | 39 | ||||
-rw-r--r-- | fs/ext4/move_extent.c | 41 | ||||
-rw-r--r-- | fs/ext4/namei.c | 53 | ||||
-rw-r--r-- | fs/ext4/page-io.c | 116 | ||||
-rw-r--r-- | fs/ext4/readpage.c | 72 | ||||
-rw-r--r-- | fs/ext4/resize.c | 7 | ||||
-rw-r--r-- | fs/ext4/super.c | 521 | ||||
-rw-r--r-- | fs/ext4/sysfs.c | 2 | ||||
-rw-r--r-- | fs/ext4/verity.c | 32 | ||||
-rw-r--r-- | fs/ext4/xattr.c | 30 |
24 files changed, 1615 insertions, 1574 deletions
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 8ff4b9192a9f..c1edde817be8 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -80,32 +80,56 @@ static inline int ext4_block_in_group(struct super_block *sb, return (actual_group == block_group) ? 1 : 0; } -/* Return the number of clusters used for file system metadata; this +/* + * Return the number of clusters used for file system metadata; this * represents the overhead needed by the file system. */ static unsigned ext4_num_overhead_clusters(struct super_block *sb, ext4_group_t block_group, struct ext4_group_desc *gdp) { - unsigned num_clusters; - int block_cluster = -1, inode_cluster = -1, itbl_cluster = -1, i, c; + unsigned base_clusters, num_clusters; + int block_cluster = -1, inode_cluster; + int itbl_cluster_start = -1, itbl_cluster_end = -1; ext4_fsblk_t start = ext4_group_first_block_no(sb, block_group); - ext4_fsblk_t itbl_blk; + ext4_fsblk_t end = start + EXT4_BLOCKS_PER_GROUP(sb) - 1; + ext4_fsblk_t itbl_blk_start, itbl_blk_end; struct ext4_sb_info *sbi = EXT4_SB(sb); /* This is the number of clusters used by the superblock, * block group descriptors, and reserved block group * descriptor blocks */ - num_clusters = ext4_num_base_meta_clusters(sb, block_group); + base_clusters = ext4_num_base_meta_clusters(sb, block_group); + num_clusters = base_clusters; + + /* + * Account and record inode table clusters if any cluster + * is in the block group, or inode table cluster range is + * [-1, -1] and won't overlap with block/inode bitmap cluster + * accounted below. + */ + itbl_blk_start = ext4_inode_table(sb, gdp); + itbl_blk_end = itbl_blk_start + sbi->s_itb_per_group - 1; + if (itbl_blk_start <= end && itbl_blk_end >= start) { + itbl_blk_start = itbl_blk_start >= start ? + itbl_blk_start : start; + itbl_blk_end = itbl_blk_end <= end ? + itbl_blk_end : end; + + itbl_cluster_start = EXT4_B2C(sbi, itbl_blk_start - start); + itbl_cluster_end = EXT4_B2C(sbi, itbl_blk_end - start); + + num_clusters += itbl_cluster_end - itbl_cluster_start + 1; + /* check if border cluster is overlapped */ + if (itbl_cluster_start == base_clusters - 1) + num_clusters--; + } /* - * For the allocation bitmaps and inode table, we first need - * to check to see if the block is in the block group. If it - * is, then check to see if the cluster is already accounted - * for in the clusters used for the base metadata cluster, or - * if we can increment the base metadata cluster to include - * that block. Otherwise, we will have to track the cluster - * used for the allocation bitmap or inode table explicitly. + * For the allocation bitmaps, we first need to check to see + * if the block is in the block group. If it is, then check + * to see if the cluster is already accounted for in the clusters + * used for the base metadata cluster and inode tables cluster. * Normally all of these blocks are contiguous, so the special * case handling shouldn't be necessary except for *very* * unusual file system layouts. @@ -113,46 +137,26 @@ static unsigned ext4_num_overhead_clusters(struct super_block *sb, if (ext4_block_in_group(sb, ext4_block_bitmap(sb, gdp), block_group)) { block_cluster = EXT4_B2C(sbi, ext4_block_bitmap(sb, gdp) - start); - if (block_cluster < num_clusters) - block_cluster = -1; - else if (block_cluster == num_clusters) { + if (block_cluster >= base_clusters && + (block_cluster < itbl_cluster_start || + block_cluster > itbl_cluster_end)) num_clusters++; - block_cluster = -1; - } } if (ext4_block_in_group(sb, ext4_inode_bitmap(sb, gdp), block_group)) { inode_cluster = EXT4_B2C(sbi, ext4_inode_bitmap(sb, gdp) - start); - if (inode_cluster < num_clusters) - inode_cluster = -1; - else if (inode_cluster == num_clusters) { + /* + * Additional check if inode bitmap is in just accounted + * block_cluster + */ + if (inode_cluster != block_cluster && + inode_cluster >= base_clusters && + (inode_cluster < itbl_cluster_start || + inode_cluster > itbl_cluster_end)) num_clusters++; - inode_cluster = -1; - } } - itbl_blk = ext4_inode_table(sb, gdp); - for (i = 0; i < sbi->s_itb_per_group; i++) { - if (ext4_block_in_group(sb, itbl_blk + i, block_group)) { - c = EXT4_B2C(sbi, itbl_blk + i - start); - if ((c < num_clusters) || (c == inode_cluster) || - (c == block_cluster) || (c == itbl_cluster)) - continue; - if (c == num_clusters) { - num_clusters++; - continue; - } - num_clusters++; - itbl_cluster = c; - } - } - - if (block_cluster != -1) - num_clusters++; - if (inode_cluster != -1) - num_clusters++; - return num_clusters; } @@ -187,8 +191,6 @@ static int ext4_init_block_bitmap(struct super_block *sb, ASSERT(buffer_locked(bh)); - /* If checksum is bad mark all blocks used to prevent allocation - * essentially implementing a per-group read-only flag. */ if (!ext4_group_desc_csum_verify(sb, block_group, gdp)) { ext4_mark_group_bitmap_corrupted(sb, block_group, EXT4_GROUP_INFO_BBITMAP_CORRUPT | @@ -303,6 +305,38 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb, return desc; } +static ext4_fsblk_t ext4_valid_block_bitmap_padding(struct super_block *sb, + ext4_group_t block_group, + struct buffer_head *bh) +{ + ext4_grpblk_t next_zero_bit; + unsigned long bitmap_size = sb->s_blocksize * 8; + unsigned int offset = num_clusters_in_group(sb, block_group); + + if (bitmap_size <= offset) + return 0; + + next_zero_bit = ext4_find_next_zero_bit(bh->b_data, bitmap_size, offset); + + return (next_zero_bit < bitmap_size ? next_zero_bit : 0); +} + +struct ext4_group_info *ext4_get_group_info(struct super_block *sb, + ext4_group_t group) +{ + struct ext4_group_info **grp_info; + long indexv, indexh; + + if (unlikely(group >= EXT4_SB(sb)->s_groups_count)) { + ext4_error(sb, "invalid group %u", group); + return NULL; + } + indexv = group >> (EXT4_DESC_PER_BLOCK_BITS(sb)); + indexh = group & ((EXT4_DESC_PER_BLOCK(sb)) - 1); + grp_info = sbi_array_rcu_deref(EXT4_SB(sb), s_group_info, indexv); + return grp_info[indexh]; +} + /* * Return the block number which was discovered to be invalid, or 0 if * the block bitmap is valid. @@ -350,13 +384,13 @@ static ext4_fsblk_t ext4_valid_block_bitmap(struct super_block *sb, blk = ext4_inode_table(sb, desc); offset = blk - group_first_block; if (offset < 0 || EXT4_B2C(sbi, offset) >= max_bit || - EXT4_B2C(sbi, offset + sbi->s_itb_per_group) >= max_bit) + EXT4_B2C(sbi, offset + sbi->s_itb_per_group - 1) >= max_bit) return blk; next_zero_bit = ext4_find_next_zero_bit(bh->b_data, - EXT4_B2C(sbi, offset + sbi->s_itb_per_group), + EXT4_B2C(sbi, offset + sbi->s_itb_per_group - 1) + 1, EXT4_B2C(sbi, offset)); if (next_zero_bit < - EXT4_B2C(sbi, offset + sbi->s_itb_per_group)) + EXT4_B2C(sbi, offset + sbi->s_itb_per_group - 1) + 1) /* bad bitmap for inode tables */ return blk; return 0; @@ -377,14 +411,13 @@ static int ext4_validate_block_bitmap(struct super_block *sb, if (buffer_verified(bh)) return 0; - if (EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) + if (!grp || EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) return -EFSCORRUPTED; ext4_lock_group(sb, block_group); if (buffer_verified(bh)) goto verified; - if (unlikely(!ext4_block_bitmap_csum_verify(sb, block_group, - desc, bh) || + if (unlikely(!ext4_block_bitmap_csum_verify(sb, desc, bh) || ext4_simulate_fail(sb, EXT4_SIM_BBITMAP_CRC))) { ext4_unlock_group(sb, block_group); ext4_error(sb, "bg %u: bad block bitmap checksum", block_group); @@ -401,6 +434,15 @@ static int ext4_validate_block_bitmap(struct super_block *sb, EXT4_GROUP_INFO_BBITMAP_CORRUPT); return -EFSCORRUPTED; } + blk = ext4_valid_block_bitmap_padding(sb, block_group, bh); + if (unlikely(blk != 0)) { + ext4_unlock_group(sb, block_group); + ext4_error(sb, "bg %u: block %llu: padding at end of block bitmap is not set", + block_group, blk); + ext4_mark_group_bitmap_corrupted(sb, block_group, + EXT4_GROUP_INFO_BBITMAP_CORRUPT); + return -EFSCORRUPTED; + } set_buffer_verified(bh); verified: ext4_unlock_group(sb, block_group); @@ -474,17 +516,19 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group, goto out; } err = ext4_init_block_bitmap(sb, bh, block_group, desc); - set_bitmap_uptodate(bh); - set_buffer_uptodate(bh); - set_buffer_verified(bh); - ext4_unlock_group(sb, block_group); - unlock_buffer(bh); if (err) { + ext4_unlock_group(sb, block_group); + unlock_buffer(bh); ext4_error(sb, "Failed to init block bitmap for group " "%u: %d", block_group, err); goto out; } - goto verify; + set_bitmap_uptodate(bh); + set_buffer_uptodate(bh); + set_buffer_verified(bh); + ext4_unlock_group(sb, block_group); + unlock_buffer(bh); + return bh; } ext4_unlock_group(sb, block_group); if (buffer_uptodate(bh)) { @@ -842,10 +886,7 @@ static unsigned long ext4_bg_num_gdb_nometa(struct super_block *sb, if (!ext4_bg_has_super(sb, group)) return 0; - if (ext4_has_feature_meta_bg(sb)) - return le32_to_cpu(EXT4_SB(sb)->s_es->s_first_meta_bg); - else - return EXT4_SB(sb)->s_gdb_count; + return EXT4_SB(sb)->s_gdb_count; } /** @@ -887,11 +928,11 @@ static unsigned ext4_num_base_meta_clusters(struct super_block *sb, block_group < le32_to_cpu(sbi->s_es->s_first_meta_bg) * sbi->s_desc_per_block) { if (num) { - num += ext4_bg_num_gdb(sb, block_group); + num += ext4_bg_num_gdb_nometa(sb, block_group); num += le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks); } } else { /* For META_BG_BLOCK_GROUPS */ - num += ext4_bg_num_gdb(sb, block_group); + num += ext4_bg_num_gdb_meta(sb, block_group); } return EXT4_NUM_B2C(sbi, num); } diff --git a/fs/ext4/bitmap.c b/fs/ext4/bitmap.c index f63e028c638c..cd725bebe69e 100644 --- a/fs/ext4/bitmap.c +++ b/fs/ext4/bitmap.c @@ -16,7 +16,7 @@ unsigned int ext4_count_free(char *bitmap, unsigned int numchars) return numchars * BITS_PER_BYTE - memweight(bitmap, numchars); } -int ext4_inode_bitmap_csum_verify(struct super_block *sb, ext4_group_t group, +int ext4_inode_bitmap_csum_verify(struct super_block *sb, struct ext4_group_desc *gdp, struct buffer_head *bh, int sz) { @@ -38,7 +38,7 @@ int ext4_inode_bitmap_csum_verify(struct super_block *sb, ext4_group_t group, return provided == calculated; } -void ext4_inode_bitmap_csum_set(struct super_block *sb, ext4_group_t group, +void ext4_inode_bitmap_csum_set(struct super_block *sb, struct ext4_group_desc *gdp, struct buffer_head *bh, int sz) { @@ -54,7 +54,7 @@ void ext4_inode_bitmap_csum_set(struct super_block *sb, ext4_group_t group, gdp->bg_inode_bitmap_csum_hi = cpu_to_le16(csum >> 16); } -int ext4_block_bitmap_csum_verify(struct super_block *sb, ext4_group_t group, +int ext4_block_bitmap_csum_verify(struct super_block *sb, struct ext4_group_desc *gdp, struct buffer_head *bh) { @@ -74,13 +74,10 @@ int ext4_block_bitmap_csum_verify(struct super_block *sb, ext4_group_t group, } else calculated &= 0xFFFF; - if (provided == calculated) - return 1; - - return 0; + return provided == calculated; } -void ext4_block_bitmap_csum_set(struct super_block *sb, ext4_group_t group, +void ext4_block_bitmap_csum_set(struct super_block *sb, struct ext4_group_desc *gdp, struct buffer_head *bh) { diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 08b29c289da4..6948d673bba2 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -40,6 +40,7 @@ #ifdef __KERNEL__ #include <linux/compat.h> #endif +#include <uapi/linux/ext4.h> #include <linux/fscrypt.h> #include <linux/fsverity.h> @@ -591,17 +592,6 @@ static inline void ext4_check_flag_values(void) CHECK_FLAG_VALUE(RESERVED); } -/* Used to pass group descriptor data when online resize is done */ -struct ext4_new_group_input { - __u32 group; /* Group number for this data */ - __u64 block_bitmap; /* Absolute block number of block bitmap */ - __u64 inode_bitmap; /* Absolute block number of inode bitmap */ - __u64 inode_table; /* Absolute block number of inode table start */ - __u32 blocks_count; /* Total number of blocks in this group */ - __u16 reserved_blocks; /* Number of reserved blocks in this group */ - __u16 unused; -}; - #if defined(__KERNEL__) && defined(CONFIG_COMPAT) struct compat_ext4_new_group_input { u32 group; @@ -698,70 +688,6 @@ enum { #define EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER 0x0020 #define EXT4_FREE_BLOCKS_RERESERVE_CLUSTER 0x0040 -/* - * ioctl commands - */ -#define EXT4_IOC_GETVERSION _IOR('f', 3, long) -#define EXT4_IOC_SETVERSION _IOW('f', 4, long) -#define EXT4_IOC_GETVERSION_OLD FS_IOC_GETVERSION -#define EXT4_IOC_SETVERSION_OLD FS_IOC_SETVERSION -#define EXT4_IOC_GETRSVSZ _IOR('f', 5, long) -#define EXT4_IOC_SETRSVSZ _IOW('f', 6, long) -#define EXT4_IOC_GROUP_EXTEND _IOW('f', 7, unsigned long) -#define EXT4_IOC_GROUP_ADD _IOW('f', 8, struct ext4_new_group_input) -#define EXT4_IOC_MIGRATE _IO('f', 9) - /* note ioctl 10 reserved for an early version of the FIEMAP ioctl */ - /* note ioctl 11 reserved for filesystem-independent FIEMAP ioctl */ -#define EXT4_IOC_ALLOC_DA_BLKS _IO('f', 12) -#define EXT4_IOC_MOVE_EXT _IOWR('f', 15, struct move_extent) -#define EXT4_IOC_RESIZE_FS _IOW('f', 16, __u64) -#define EXT4_IOC_SWAP_BOOT _IO('f', 17) -#define EXT4_IOC_PRECACHE_EXTENTS _IO('f', 18) -/* ioctl codes 19--39 are reserved for fscrypt */ -#define EXT4_IOC_CLEAR_ES_CACHE _IO('f', 40) -#define EXT4_IOC_GETSTATE _IOW('f', 41, __u32) -#define EXT4_IOC_GET_ES_CACHE _IOWR('f', 42, struct fiemap) -#define EXT4_IOC_CHECKPOINT _IOW('f', 43, __u32) -#define EXT4_IOC_GETFSUUID _IOR('f', 44, struct fsuuid) -#define EXT4_IOC_SETFSUUID _IOW('f', 44, struct fsuuid) - -#define EXT4_IOC_SHUTDOWN _IOR ('X', 125, __u32) - -/* - * Flags for going down operation - */ -#define EXT4_GOING_FLAGS_DEFAULT 0x0 /* going down */ -#define EXT4_GOING_FLAGS_LOGFLUSH 0x1 /* flush log but not data */ -#define EXT4_GOING_FLAGS_NOLOGFLUSH 0x2 /* don't flush log nor data */ - -/* - * Flags returned by EXT4_IOC_GETSTATE - * - * We only expose to userspace a subset of the state flags in - * i_state_flags - */ -#define EXT4_STATE_FLAG_EXT_PRECACHED 0x00000001 -#define EXT4_STATE_FLAG_NEW 0x00000002 -#define EXT4_STATE_FLAG_NEWENTRY 0x00000004 -#define EXT4_STATE_FLAG_DA_ALLOC_CLOSE 0x00000008 - -/* flags for ioctl EXT4_IOC_CHECKPOINT */ -#define EXT4_IOC_CHECKPOINT_FLAG_DISCARD 0x1 -#define EXT4_IOC_CHECKPOINT_FLAG_ZEROOUT 0x2 -#define EXT4_IOC_CHECKPOINT_FLAG_DRY_RUN 0x4 -#define EXT4_IOC_CHECKPOINT_FLAG_VALID (EXT4_IOC_CHECKPOINT_FLAG_DISCARD | \ - EXT4_IOC_CHECKPOINT_FLAG_ZEROOUT | \ - EXT4_IOC_CHECKPOINT_FLAG_DRY_RUN) - -/* - * Structure for EXT4_IOC_GETFSUUID/EXT4_IOC_SETFSUUID - */ -struct fsuuid { - __u32 fsu_len; - __u32 fsu_flags; - __u8 fsu_uuid[]; -}; - #if defined(__KERNEL__) && defined(CONFIG_COMPAT) /* * ioctl commands in 32 bit emulation @@ -776,12 +702,6 @@ struct fsuuid { #define EXT4_IOC32_SETVERSION_OLD FS_IOC32_SETVERSION #endif -/* - * Returned by EXT4_IOC_GET_ES_CACHE as an additional possible flag. - * It indicates that the entry in extent status cache is for a hole. - */ -#define EXT4_FIEMAP_EXTENT_HOLE 0x08000000 - /* Max physical block we can address w/o extents */ #define EXT4_MAX_BLOCK_FILE_PHYS 0xFFFFFFFF @@ -852,15 +772,6 @@ struct ext4_inode { __le32 i_projid; /* Project ID */ }; -struct move_extent { - __u32 reserved; /* should be zero */ - __u32 donor_fd; /* donor file descriptor */ - __u64 orig_start; /* logical start offset in block for orig */ - __u64 donor_start; /* logical start offset in block for donor */ - __u64 len; /* block length to be moved */ - __u64 moved_len; /* moved block length */ -}; - #define EXT4_EPOCH_BITS 2 #define EXT4_EPOCH_MASK ((1 << EXT4_EPOCH_BITS) - 1) #define EXT4_NSEC_MASK (~0UL << EXT4_EPOCH_BITS) @@ -1120,8 +1031,8 @@ struct ext4_inode_info { /* mballoc */ atomic_t i_prealloc_active; - struct list_head i_prealloc_list; - spinlock_t i_prealloc_lock; + struct rb_root i_prealloc_node; + rwlock_t i_prealloc_lock; /* extents status tree */ struct ext4_es_tree i_es_tree; @@ -1613,7 +1524,6 @@ struct ext4_sb_info { unsigned int s_mb_stats; unsigned int s_mb_order2_reqs; unsigned int s_mb_group_prealloc; - unsigned int s_mb_max_inode_prealloc; unsigned int s_max_dir_size_kb; /* where last allocation was done - for stream allocation */ unsigned long s_mb_last_group; @@ -1774,6 +1684,30 @@ static inline struct ext4_inode_info *EXT4_I(struct inode *inode) return container_of(inode, struct ext4_inode_info, vfs_inode); } +static inline int ext4_writepages_down_read(struct super_block *sb) +{ + percpu_down_read(&EXT4_SB(sb)->s_writepages_rwsem); + return memalloc_nofs_save(); +} + +static inline void ext4_writepages_up_read(struct super_block *sb, int ctx) +{ + memalloc_nofs_restore(ctx); + percpu_up_read(&EXT4_SB(sb)->s_writepages_rwsem); +} + +static inline int ext4_writepages_down_write(struct super_block *sb) +{ + percpu_down_write(&EXT4_SB(sb)->s_writepages_rwsem); + return memalloc_nofs_save(); +} + +static inline void ext4_writepages_up_write(struct super_block *sb, int ctx) +{ + memalloc_nofs_restore(ctx); + percpu_up_write(&EXT4_SB(sb)->s_writepages_rwsem); +} + static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino) { return ino == EXT4_ROOT_INO || @@ -1887,7 +1821,6 @@ static inline void ext4_simulate_fail_bh(struct super_block *sb, * Inode dynamic state flags */ enum { - EXT4_STATE_JDATA, /* journaled data exists */ EXT4_STATE_NEW, /* inode is newly created */ EXT4_STATE_XATTR, /* has in-inode xattrs */ EXT4_STATE_NO_EXPAND, /* No space for expansion */ @@ -2676,16 +2609,16 @@ struct mmpd_data { /* bitmap.c */ extern unsigned int ext4_count_free(char *bitmap, unsigned numchars); -void ext4_inode_bitmap_csum_set(struct super_block *sb, ext4_group_t group, +void ext4_inode_bitmap_csum_set(struct super_block *sb, struct ext4_group_desc *gdp, struct buffer_head *bh, int sz); -int ext4_inode_bitmap_csum_verify(struct super_block *sb, ext4_group_t group, +int ext4_inode_bitmap_csum_verify(struct super_block *sb, struct ext4_group_desc *gdp, struct buffer_head *bh, int sz); -void ext4_block_bitmap_csum_set(struct super_block *sb, ext4_group_t group, +void ext4_block_bitmap_csum_set(struct super_block *sb, struct ext4_group_desc *gdp, struct buffer_head *bh); -int ext4_block_bitmap_csum_verify(struct super_block *sb, ext4_group_t group, +int ext4_block_bitmap_csum_verify(struct super_block *sb, struct ext4_group_desc *gdp, struct buffer_head *bh); @@ -2716,6 +2649,8 @@ extern void ext4_check_blocks_bitmap(struct super_block *); extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb, ext4_group_t block_group, struct buffer_head ** bh); +extern struct ext4_group_info *ext4_get_group_info(struct super_block *sb, + ext4_group_t group); extern int ext4_should_retry_alloc(struct super_block *sb, int *retries); extern struct buffer_head *ext4_read_block_bitmap_nowait(struct super_block *sb, @@ -3323,19 +3258,6 @@ static inline void ext4_isize_set(struct ext4_inode *raw_inode, loff_t i_size) raw_inode->i_size_high = cpu_to_le32(i_size >> 32); } -static inline -struct ext4_group_info *ext4_get_group_info(struct super_block *sb, - ext4_group_t group) -{ - struct ext4_group_info **grp_info; - long indexv, indexh; - BUG_ON(group >= EXT4_SB(sb)->s_groups_count); - indexv = group >> (EXT4_DESC_PER_BLOCK_BITS(sb)); - indexh = group & ((EXT4_DESC_PER_BLOCK(sb)) - 1); - grp_info = sbi_array_rcu_deref(EXT4_SB(sb), s_group_info, indexv); - return grp_info[indexh]; -} - /* * Reading s_groups_count requires using smp_rmb() afterwards. See * the locking protocol documented in the comments of ext4_group_add() @@ -3550,7 +3472,7 @@ extern int ext4_init_inline_data(handle_t *handle, struct inode *inode, unsigned int len); extern int ext4_destroy_inline_data(handle_t *handle, struct inode *inode); -extern int ext4_readpage_inline(struct inode *inode, struct page *page); +int ext4_readpage_inline(struct inode *inode, struct folio *folio); extern int ext4_try_to_write_inline_data(struct address_space *mapping, struct inode *inode, loff_t pos, unsigned len, @@ -3647,7 +3569,7 @@ static inline void ext4_set_de_type(struct super_block *sb, /* readpages.c */ extern int ext4_mpage_readpages(struct inode *inode, - struct readahead_control *rac, struct page *page); + struct readahead_control *rac, struct folio *folio); extern int __init ext4_init_post_read_processing(void); extern void ext4_exit_post_read_processing(void); @@ -3757,9 +3679,8 @@ extern void ext4_io_submit_init(struct ext4_io_submit *io, struct writeback_control *wbc); extern void ext4_end_io_rsv_work(struct work_struct *work); extern void ext4_io_submit(struct ext4_io_submit *io); -extern int ext4_bio_write_page(struct ext4_io_submit *io, - struct page *page, - int len); +int ext4_bio_write_folio(struct ext4_io_submit *io, struct folio *page, + size_t len); extern struct ext4_io_end_vec *ext4_alloc_io_end_vec(ext4_io_end_t *io_end); extern struct ext4_io_end_vec *ext4_last_io_end_vec(ext4_io_end_t *io_end); diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 3559ea6b0781..35703dce23a3 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4526,13 +4526,6 @@ static long ext4_zero_range(struct file *file, loff_t offset, trace_ext4_zero_range(inode, offset, len, mode); - /* Call ext4_force_commit to flush all data in case of data=journal. */ - if (ext4_should_journal_data(inode)) { - ret = ext4_force_commit(inode->i_sb); - if (ret) - return ret; - } - /* * Round up offset. This is not fallocate, we need to zero out * blocks, so convert interior block aligned part of the range to @@ -4616,6 +4609,20 @@ static long ext4_zero_range(struct file *file, loff_t offset, filemap_invalidate_unlock(mapping); goto out_mutex; } + + /* + * For journalled data we need to write (and checkpoint) pages + * before discarding page cache to avoid inconsitent data on + * disk in case of crash before zeroing trans is committed. + */ + if (ext4_should_journal_data(inode)) { + ret = filemap_write_and_wait_range(mapping, start, end); + if (ret) { + filemap_invalidate_unlock(mapping); + goto out_mutex; + } + } + /* Now release the pages and zero block aligned part of pages */ truncate_pagecache_range(inode, start, end - 1); inode->i_mtime = inode->i_ctime = current_time(inode); @@ -5290,13 +5297,6 @@ static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len) punch_start = offset >> EXT4_BLOCK_SIZE_BITS(sb); punch_stop = (offset + len) >> EXT4_BLOCK_SIZE_BITS(sb); - /* Call ext4_force_commit to flush all data in case of data=journal. */ - if (ext4_should_journal_data(inode)) { - ret = ext4_force_commit(inode->i_sb); - if (ret) - return ret; - } - inode_lock(inode); /* * There is no need to overlap collapse range with EOF, in which case @@ -5443,13 +5443,6 @@ static int ext4_insert_range(struct file *file, loff_t offset, loff_t len) offset_lblk = offset >> EXT4_BLOCK_SIZE_BITS(sb); len_lblk = len >> EXT4_BLOCK_SIZE_BITS(sb); - /* Call ext4_force_commit to flush all data in case of data=journal */ - if (ext4_should_journal_data(inode)) { - ret = ext4_force_commit(inode->i_sb); - if (ret) - return ret; - } - inode_lock(inode); /* Currently just for extent based files */ if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { @@ -5802,7 +5795,8 @@ int ext4_clu_mapped(struct inode *inode, ext4_lblk_t lclu) * mapped - no physical clusters have been allocated, and the * file has no extents */ - if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) + if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA) || + ext4_has_inline_data(inode)) return 0; /* search for the extent closest to the first block in the cluster */ diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index 7bc221038c6c..595abb9e7d74 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -267,14 +267,12 @@ static void __es_find_extent_range(struct inode *inode, /* see if the extent has been cached */ es->es_lblk = es->es_len = es->es_pblk = 0; - if (tree->cache_es) { - es1 = tree->cache_es; - if (in_range(lblk, es1->es_lblk, es1->es_len)) { - es_debug("%u cached by [%u/%u) %llu %x\n", - lblk, es1->es_lblk, es1->es_len, - ext4_es_pblock(es1), ext4_es_status(es1)); - goto out; - } + es1 = READ_ONCE(tree->cache_es); + if (es1 && in_range(lblk, es1->es_lblk, es1->es_len)) { + es_debug("%u cached by [%u/%u) %llu %x\n", + lblk, es1->es_lblk, es1->es_len, + ext4_es_pblock(es1), ext4_es_status(es1)); + goto out; } es1 = __es_tree_search(&tree->root, lblk); @@ -293,7 +291,7 @@ out: } if (es1 && matching_fn(es1)) { - tree->cache_es = es1; + WRITE_ONCE(tree->cache_es, es1); es->es_lblk = es1->es_lblk; es->es_len = es1->es_len; es->es_pblk = es1->es_pblk; @@ -931,14 +929,12 @@ int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk, /* find extent in cache firstly */ es->es_lblk = es->es_len = es->es_pblk = 0; - if (tree->cache_es) { - es1 = tree->cache_es; - if (in_range(lblk, es1->es_lblk, es1->es_len)) { - es_debug("%u cached by [%u/%u)\n", - lblk, es1->es_lblk, es1->es_len); - found = 1; - goto out; - } + es1 = READ_ONCE(tree->cache_es); + if (es1 && in_range(lblk, es1->es_lblk, es1->es_len)) { + es_debug("%u cached by [%u/%u)\n", + lblk, es1->es_lblk, es1->es_len); + found = 1; + goto out; } node = tree->root.rb_node; diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 0b8b4499e5ca..d101b3b0c7da 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -899,7 +899,8 @@ static int ext4_file_open(struct inode *inode, struct file *filp) return ret; } - filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC; + filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC | + FMODE_DIO_PARALLEL_WRITE; return dquot_file_open(inode, filp); } diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index 027a7d7037a0..f65fdb27ce14 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -153,23 +153,12 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync) goto out; /* - * data=writeback,ordered: * The caller's filemap_fdatawrite()/wait will sync the data. * Metadata is in the journal, we wait for proper transaction to * commit here. - * - * data=journal: - * filemap_fdatawrite won't do anything (the buffers are clean). - * ext4_force_commit will write the file data into the journal and - * will wait on that. - * filemap_fdatawait() will encounter a ton of newly-dirtied pages - * (they were dirtied by commit). But that's OK - the blocks are - * safe in-journal, which is all fsync() needs to ensure. */ if (!sbi->s_journal) ret = ext4_fsync_nojournal(inode, datasync, &needs_barrier); - else if (ext4_should_journal_data(inode)) - ret = ext4_force_commit(inode->i_sb); else ret = ext4_fsync_journal(inode, datasync, &needs_barrier); diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c index 147b5241dd94..46c3423ddfa1 100644 --- a/fs/ext4/hash.c +++ b/fs/ext4/hash.c @@ -277,7 +277,11 @@ static int __ext4fs_dirhash(const struct inode *dir, const char *name, int len, } default: hinfo->hash = 0; - return -1; + hinfo->minor_hash = 0; + ext4_warning(dir->i_sb, + "invalid/unsupported hash tree version %u", + hinfo->hash_version); + return -EINVAL; } hash = hash & ~1; if (hash == (EXT4_HTREE_EOF_32BIT << 1)) diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 157663031f8c..754f961cd9fd 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -91,14 +91,14 @@ static int ext4_validate_inode_bitmap(struct super_block *sb, if (buffer_verified(bh)) return 0; - if (EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) + if (!grp || EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) return -EFSCORRUPTED; ext4_lock_group(sb, block_group); if (buffer_verified(bh)) goto verified; blk = ext4_inode_bitmap(sb, desc); - if (!ext4_inode_bitmap_csum_verify(sb, block_group, desc, bh, + if (!ext4_inode_bitmap_csum_verify(sb, desc, bh, EXT4_INODES_PER_GROUP(sb) / 8) || ext4_simulate_fail(sb, EXT4_SIM_IBITMAP_CRC)) { ext4_unlock_group(sb, block_group); @@ -293,7 +293,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) } if (!(sbi->s_mount_state & EXT4_FC_REPLAY)) { grp = ext4_get_group_info(sb, block_group); - if (unlikely(EXT4_MB_GRP_IBITMAP_CORRUPT(grp))) { + if (!grp || unlikely(EXT4_MB_GRP_IBITMAP_CORRUPT(grp))) { fatal = -EFSCORRUPTED; goto error_return; } @@ -327,7 +327,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) if (percpu_counter_initialized(&sbi->s_dirs_counter)) percpu_counter_dec(&sbi->s_dirs_counter); } - ext4_inode_bitmap_csum_set(sb, block_group, gdp, bitmap_bh, + ext4_inode_bitmap_csum_set(sb, gdp, bitmap_bh, EXT4_INODES_PER_GROUP(sb) / 8); ext4_group_desc_csum_set(sb, block_group, gdp); ext4_unlock_group(sb, block_group); @@ -813,8 +813,7 @@ int ext4_mark_inode_used(struct super_block *sb, int ino) gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); ext4_free_group_clusters_set(sb, gdp, ext4_free_clusters_after_init(sb, group, gdp)); - ext4_block_bitmap_csum_set(sb, group, gdp, - block_bitmap_bh); + ext4_block_bitmap_csum_set(sb, gdp, block_bitmap_bh); ext4_group_desc_csum_set(sb, group, gdp); } ext4_unlock_group(sb, group); @@ -852,7 +851,7 @@ int ext4_mark_inode_used(struct super_block *sb, int ino) ext4_free_inodes_set(sb, gdp, ext4_free_inodes_count(sb, gdp) - 1); if (ext4_has_group_desc_csum(sb)) { - ext4_inode_bitmap_csum_set(sb, group, gdp, inode_bitmap_bh, + ext4_inode_bitmap_csum_set(sb, gdp, inode_bitmap_bh, EXT4_INODES_PER_GROUP(sb) / 8); ext4_group_desc_csum_set(sb, group, gdp); } @@ -1047,7 +1046,7 @@ got_group: * Skip groups with already-known suspicious inode * tables */ - if (EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) + if (!grp || EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) goto next_group; } @@ -1165,8 +1164,7 @@ got: gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); ext4_free_group_clusters_set(sb, gdp, ext4_free_clusters_after_init(sb, group, gdp)); - ext4_block_bitmap_csum_set(sb, group, gdp, - block_bitmap_bh); + ext4_block_bitmap_csum_set(sb, gdp, block_bitmap_bh); ext4_group_desc_csum_set(sb, group, gdp); } ext4_unlock_group(sb, group); @@ -1185,6 +1183,10 @@ got: if (!(sbi->s_mount_state & EXT4_FC_REPLAY)) { grp = ext4_get_group_info(sb, group); + if (!grp) { + err = -EFSCORRUPTED; + goto out; + } down_read(&grp->alloc_sem); /* * protect vs itable * lazyinit @@ -1222,7 +1224,7 @@ got: } } if (ext4_has_group_desc_csum(sb)) { - ext4_inode_bitmap_csum_set(sb, group, gdp, inode_bitmap_bh, + ext4_inode_bitmap_csum_set(sb, gdp, inode_bitmap_bh, EXT4_INODES_PER_GROUP(sb) / 8); ext4_group_desc_csum_set(sb, group, gdp); } @@ -1528,7 +1530,7 @@ int ext4_init_inode_table(struct super_block *sb, ext4_group_t group, } gdp = ext4_get_group_desc(sb, group, &group_desc_bh); - if (!gdp) + if (!gdp || !grp) goto out; /* diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index 1602d74b5eeb..5854bd5a3352 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -34,6 +34,7 @@ static int get_max_inline_xattr_value_size(struct inode *inode, struct ext4_xattr_ibody_header *header; struct ext4_xattr_entry *entry; struct ext4_inode *raw_inode; + void *end; int free, min_offs; if (!EXT4_INODE_HAS_XATTR_SPACE(inode)) @@ -57,14 +58,23 @@ static int get_max_inline_xattr_value_size(struct inode *inode, raw_inode = ext4_raw_inode(iloc); header = IHDR(inode, raw_inode); entry = IFIRST(header); + end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size; /* Compute min_offs. */ - for (; !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) { + while (!IS_LAST_ENTRY(entry)) { + void *next = EXT4_XATTR_NEXT(entry); + + if (next >= end) { + EXT4_ERROR_INODE(inode, + "corrupt xattr in inline inode"); + return 0; + } if (!entry->e_value_inum && entry->e_value_size) { size_t offs = le16_to_cpu(entry->e_value_offs); if (offs < min_offs) min_offs = offs; } + entry = next; } free = min_offs - ((void *)entry - (void *)IFIRST(header)) - sizeof(__u32); @@ -350,7 +360,7 @@ static int ext4_update_inline_data(handle_t *handle, struct inode *inode, error = ext4_xattr_ibody_get(inode, i.name_index, i.name, value, len); - if (error == -ENODATA) + if (error < 0) goto out; BUFFER_TRACE(is.iloc.bh, "get_write_access"); @@ -467,16 +477,16 @@ out: return error; } -static int ext4_read_inline_page(struct inode *inode, struct page *page) +static int ext4_read_inline_folio(struct inode *inode, struct folio *folio) { void *kaddr; int ret = 0; size_t len; struct ext4_iloc iloc; - BUG_ON(!PageLocked(page)); + BUG_ON(!folio_test_locked(folio)); BUG_ON(!ext4_has_inline_data(inode)); - BUG_ON(page->index); + BUG_ON(folio->index); if (!EXT4_I(inode)->i_inline_off) { ext4_warning(inode->i_sb, "inode %lu doesn't have inline data.", @@ -489,19 +499,20 @@ static int ext4_read_inline_page(struct inode *inode, struct page *page) goto out; len = min_t(size_t, ext4_get_inline_size(inode), i_size_read(inode)); - kaddr = kmap_atomic(page); + BUG_ON(len > PAGE_SIZE); + kaddr = kmap_local_folio(folio, 0); ret = ext4_read_inline_data(inode, kaddr, len, &iloc); - flush_dcache_page(page); - kunmap_atomic(kaddr); - zero_user_segment(page, len, PAGE_SIZE); - SetPageUptodate(page); + flush_dcache_folio(folio); + kunmap_local(kaddr); + folio_zero_segment(folio, len, folio_size(folio)); + folio_mark_uptodate(folio); brelse(iloc.bh); out: return ret; } -int ext4_readpage_inline(struct inode *inode, struct page *page) +int ext4_readpage_inline(struct inode *inode, struct folio *folio) { int ret = 0; @@ -515,16 +526,16 @@ int ext4_readpage_inline(struct inode *inode, struct page *page) * Current inline data can only exist in the 1st page, * So for all the other pages, just set them uptodate. */ - if (!page->index) - ret = ext4_read_inline_page(inode, page); - else if (!PageUptodate(page)) { - zero_user_segment(page, 0, PAGE_SIZE); - SetPageUptodate(page); + if (!folio->index) + ret = ext4_read_inline_folio(inode, folio); + else if (!folio_test_uptodate(folio)) { + folio_zero_segment(folio, 0, folio_size(folio)); + folio_mark_uptodate(folio); } up_read(&EXT4_I(inode)->xattr_sem); - unlock_page(page); + folio_unlock(folio); return ret >= 0 ? 0 : ret; } @@ -534,8 +545,7 @@ static int ext4_convert_inline_data_to_extent(struct address_space *mapping, int ret, needed_blocks, no_expand; handle_t *handle = NULL; int retries = 0, sem_held = 0; - struct page *page = NULL; - unsigned int flags; + struct folio *folio = NULL; unsigned from, to; struct ext4_iloc iloc; @@ -564,12 +574,11 @@ retry: /* We cannot recurse into the filesystem as the transaction is already * started */ - flags = memalloc_nofs_save(); - page = grab_cache_page_write_begin(mapping, 0); - memalloc_nofs_restore(flags); - if (!page) { - ret = -ENOMEM; - goto out; + folio = __filemap_get_folio(mapping, 0, FGP_WRITEBEGIN | FGP_NOFS, + mapping_gfp_mask(mapping)); + if (IS_ERR(folio)) { + ret = PTR_ERR(folio); + goto out_nofolio; } ext4_write_lock_xattr(inode, &no_expand); @@ -582,8 +591,8 @@ retry: from = 0; to = ext4_get_inline_size(inode); - if (!PageUptodate(page)) { - ret = ext4_read_inline_page(inode, page); + if (!folio_test_uptodate(folio)) { + ret = ext4_read_inline_folio(inode, folio); if (ret < 0) goto out; } @@ -593,21 +602,21 @@ retry: goto out; if (ext4_should_dioread_nolock(inode)) { - ret = __block_write_begin(page, from, to, + ret = __block_write_begin(&folio->page, from, to, ext4_get_block_unwritten); } else - ret = __block_write_begin(page, from, to, ext4_get_block); + ret = __block_write_begin(&folio->page, from, to, ext4_get_block); if (!ret && ext4_should_journal_data(inode)) { - ret = ext4_walk_page_buffers(handle, inode, page_buffers(page), - from, to, NULL, - do_journal_get_write_access); + ret = ext4_walk_page_buffers(handle, inode, + folio_buffers(folio), from, to, + NULL, do_journal_get_write_access); } if (ret) { - unlock_page(page); - put_page(page); - page = NULL; + folio_unlock(folio); + folio_put(folio); + folio = NULL; ext4_orphan_add(handle, inode); ext4_write_unlock_xattr(inode, &no_expand); sem_held = 0; @@ -627,13 +636,14 @@ retry: if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) goto retry; - if (page) - block_commit_write(page, from, to); + if (folio) + block_commit_write(&folio->page, from, to); out: - if (page) { - unlock_page(page); - put_page(page); + if (folio) { + folio_unlock(folio); + folio_put(folio); } +out_nofolio: if (sem_held) ext4_write_unlock_xattr(inode, &no_expand); if (handle) @@ -655,8 +665,7 @@ int ext4_try_to_write_inline_data(struct address_space *mapping, { int ret; handle_t *handle; - unsigned int flags; - struct page *page; + struct folio *folio; struct ext4_iloc iloc; if (pos + len > ext4_get_max_inline_size(inode)) @@ -693,28 +702,27 @@ int ext4_try_to_write_inline_data(struct address_space *mapping, if (ret) goto out; - flags = memalloc_nofs_save(); - page = grab_cache_page_write_begin(mapping, 0); - memalloc_nofs_restore(flags); - if (!page) { - ret = -ENOMEM; + folio = __filemap_get_folio(mapping, 0, FGP_WRITEBEGIN | FGP_NOFS, + mapping_gfp_mask(mapping)); + if (IS_ERR(folio)) { + ret = PTR_ERR(folio); goto out; } - *pagep = page; + *pagep = &folio->page; down_read(&EXT4_I(inode)->xattr_sem); if (!ext4_has_inline_data(inode)) { ret = 0; - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); goto out_up_read; } - if (!PageUptodate(page)) { - ret = ext4_read_inline_page(inode, page); + if (!folio_test_uptodate(folio)) { + ret = ext4_read_inline_folio(inode, folio); if (ret < 0) { - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); goto out_up_read; } } @@ -735,20 +743,21 @@ convert: int ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len, unsigned copied, struct page *page) { + struct folio *folio = page_folio(page); handle_t *handle = ext4_journal_current_handle(); int no_expand; void *kaddr; struct ext4_iloc iloc; int ret = 0, ret2; - if (unlikely(copied < len) && !PageUptodate(page)) + if (unlikely(copied < len) && !folio_test_uptodate(folio)) copied = 0; if (likely(copied)) { ret = ext4_get_inode_loc(inode, &iloc); if (ret) { - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); ext4_std_error(inode->i_sb, ret); goto out; } @@ -762,30 +771,30 @@ int ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len, */ (void) ext4_find_inline_data_nolock(inode); - kaddr = kmap_atomic(page); + kaddr = kmap_local_folio(folio, 0); ext4_write_inline_data(inode, &iloc, kaddr, pos, copied); - kunmap_atomic(kaddr); - SetPageUptodate(page); - /* clear page dirty so that writepages wouldn't work for us. */ - ClearPageDirty(page); + kunmap_local(kaddr); + folio_mark_uptodate(folio); + /* clear dirty flag so that writepages wouldn't work for us. */ + folio_clear_dirty(folio); ext4_write_unlock_xattr(inode, &no_expand); brelse(iloc.bh); /* - * It's important to update i_size while still holding page + * It's important to update i_size while still holding folio * lock: page writeout could otherwise come in and zero * beyond i_size. */ ext4_update_inode_size(inode, pos + copied); } - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); /* - * Don't mark the inode dirty under page lock. First, it unnecessarily - * makes the holding time of page lock longer. Second, it forces lock - * ordering of page lock and transaction start for journaling + * Don't mark the inode dirty under folio lock. First, it unnecessarily + * makes the holding time of folio lock longer. Second, it forces lock + * ordering of folio lock and transaction start for journaling * filesystems. */ if (likely(copied)) @@ -852,11 +861,12 @@ static int ext4_da_convert_inline_data_to_extent(struct address_space *mapping, void **fsdata) { int ret = 0, inline_size; - struct page *page; + struct folio *folio; - page = grab_cache_page_write_begin(mapping, 0); - if (!page) - return -ENOMEM; + folio = __filemap_get_folio(mapping, 0, FGP_WRITEBEGIN, + mapping_gfp_mask(mapping)); + if (IS_ERR(folio)) + return PTR_ERR(folio); down_read(&EXT4_I(inode)->xattr_sem); if (!ext4_has_inline_data(inode)) { @@ -866,32 +876,32 @@ static int ext4_da_convert_inline_data_to_extent(struct address_space *mapping, inline_size = ext4_get_inline_size(inode); - if (!PageUptodate(page)) { - ret = ext4_read_inline_page(inode, page); + if (!folio_test_uptodate(folio)) { + ret = ext4_read_inline_folio(inode, folio); if (ret < 0) goto out; } - ret = __block_write_begin(page, 0, inline_size, + ret = __block_write_begin(&folio->page, 0, inline_size, ext4_da_get_block_prep); if (ret) { up_read(&EXT4_I(inode)->xattr_sem); - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); ext4_truncate_failed_write(inode); return ret; } - SetPageDirty(page); - SetPageUptodate(page); + folio_mark_dirty(folio); + folio_mark_uptodate(folio); ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); *fsdata = (void *)CONVERT_INLINE_DATA; out: up_read(&EXT4_I(inode)->xattr_sem); - if (page) { - unlock_page(page); - put_page(page); + if (folio) { + folio_unlock(folio); + folio_put(folio); } return ret; } @@ -912,10 +922,9 @@ int ext4_da_write_inline_data_begin(struct address_space *mapping, { int ret; handle_t *handle; - struct page *page; + struct folio *folio; struct ext4_iloc iloc; int retries = 0; - unsigned int flags; ret = ext4_get_inode_loc(inode, &iloc); if (ret) @@ -947,11 +956,10 @@ retry_journal: * We cannot recurse into the filesystem as the transaction * is already started. */ - flags = memalloc_nofs_save(); - page = grab_cache_page_write_begin(mapping, 0); - memalloc_nofs_restore(flags); - if (!page) { - ret = -ENOMEM; + folio = __filemap_get_folio(mapping, 0, FGP_WRITEBEGIN | FGP_NOFS, + mapping_gfp_mask(mapping)); + if (IS_ERR(folio)) { + ret = PTR_ERR(folio); goto out_journal; } @@ -961,8 +969,8 @@ retry_journal: goto out_release_page; } - if (!PageUptodate(page)) { - ret = ext4_read_inline_page(inode, page); + if (!folio_test_uptodate(folio)) { + ret = ext4_read_inline_folio(inode, folio); if (ret < 0) goto out_release_page; } @@ -972,13 +980,13 @@ retry_journal: goto out_release_page; up_read(&EXT4_I(inode)->xattr_sem); - *pagep = page; + *pagep = &folio->page; brelse(iloc.bh); return 1; out_release_page: up_read(&EXT4_I(inode)->xattr_sem); - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); out_journal: ext4_journal_stop(handle); out: @@ -1177,6 +1185,7 @@ static int ext4_finish_convert_inline_dir(handle_t *handle, ext4_initialize_dirent_tail(dir_block, inode->i_sb->s_blocksize); set_buffer_uptodate(dir_block); + unlock_buffer(dir_block); err = ext4_handle_dirty_dirblock(handle, inode, dir_block); if (err) return err; @@ -1251,6 +1260,7 @@ static int ext4_convert_inline_data_nolock(handle_t *handle, if (!S_ISDIR(inode->i_mode)) { memcpy(data_bh->b_data, buf, inline_size); set_buffer_uptodate(data_bh); + unlock_buffer(data_bh); error = ext4_handle_dirty_metadata(handle, inode, data_bh); } else { @@ -1258,7 +1268,6 @@ static int ext4_convert_inline_data_nolock(handle_t *handle, buf, inline_size); } - unlock_buffer(data_bh); out_restore: if (error) ext4_restore_inline_data(handle, inode, iloc, buf, inline_size); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index bf0b7dea4900..ce5f21b6c2b3 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -136,7 +136,6 @@ static inline int ext4_begin_ordered_truncate(struct inode *inode, new_size); } -static int __ext4_journalled_writepage(struct page *page, unsigned int len); static int ext4_meta_trans_blocks(struct inode *inode, int lblocks, int pextents); @@ -180,33 +179,6 @@ void ext4_evict_inode(struct inode *inode) if (EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL) ext4_evict_ea_inode(inode); if (inode->i_nlink) { - /* - * When journalling data dirty buffers are tracked only in the - * journal. So although mm thinks everything is clean and - * ready for reaping the inode might still have some pages to - * write in the running transaction or waiting to be - * checkpointed. Thus calling jbd2_journal_invalidate_folio() - * (via truncate_inode_pages()) to discard these buffers can - * cause data loss. Also even if we did not discard these - * buffers, we would have no way to find them after the inode - * is reaped and thus user could see stale data if he tries to - * read them before the transaction is checkpointed. So be - * careful and force everything to disk here... We use - * ei->i_datasync_tid to store the newest transaction - * containing inode's data. - * - * Note that directories do not have this problem because they - * don't use page cache. - */ - if (inode->i_ino != EXT4_JOURNAL_INO && - ext4_should_journal_data(inode) && - S_ISREG(inode->i_mode) && inode->i_data.nrpages) { - journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; - tid_t commit_tid = EXT4_I(inode)->i_datasync_tid; - - jbd2_complete_transaction(journal, commit_tid); - filemap_write_and_wait(&inode->i_data); - } truncate_inode_pages_final(&inode->i_data); goto no_delete; @@ -1005,29 +977,17 @@ int ext4_walk_page_buffers(handle_t *handle, struct inode *inode, } /* - * To preserve ordering, it is essential that the hole instantiation and - * the data write be encapsulated in a single transaction. We cannot - * close off a transaction and start a new one between the ext4_get_block() - * and the commit_write(). So doing the jbd2_journal_start at the start of - * prepare_write() is the right place. - * - * Also, this function can nest inside ext4_writepage(). In that case, we - * *know* that ext4_writepage() has generated enough buffer credits to do the - * whole page. So we won't block on the journal in that case, which is good, - * because the caller may be PF_MEMALLOC. - * - * By accident, ext4 can be reentered when a transaction is open via - * quota file writes. If we were to commit the transaction while thus - * reentered, there can be a deadlock - we would be holding a quota - * lock, and the commit would never complete if another thread had a - * transaction open and was blocking on the quota lock - a ranking - * violation. - * - * So what we do is to rely on the fact that jbd2_journal_stop/journal_start - * will _not_ run commit under these circumstances because handle->h_ref - * is elevated. We'll still have enough credits for the tiny quotafile - * write. + * Helper for handling dirtying of journalled data. We also mark the folio as + * dirty so that writeback code knows about this page (and inode) contains + * dirty data. ext4_writepages() then commits appropriate transaction to + * make data stable. */ +static int ext4_dirty_journalled_data(handle_t *handle, struct buffer_head *bh) +{ + folio_mark_dirty(bh->b_folio); + return ext4_handle_dirty_metadata(handle, NULL, bh); +} + int do_journal_get_write_access(handle_t *handle, struct inode *inode, struct buffer_head *bh) { @@ -1050,17 +1010,17 @@ int do_journal_get_write_access(handle_t *handle, struct inode *inode, ret = ext4_journal_get_write_access(handle, inode->i_sb, bh, EXT4_JTR_NONE); if (!ret && dirty) - ret = ext4_handle_dirty_metadata(handle, NULL, bh); + ret = ext4_dirty_journalled_data(handle, bh); return ret; } #ifdef CONFIG_FS_ENCRYPTION -static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len, +static int ext4_block_write_begin(struct folio *folio, loff_t pos, unsigned len, get_block_t *get_block) { unsigned from = pos & (PAGE_SIZE - 1); unsigned to = from + len; - struct inode *inode = page->mapping->host; + struct inode *inode = folio->mapping->host; unsigned block_start, block_end; sector_t block; int err = 0; @@ -1070,22 +1030,24 @@ static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len, int nr_wait = 0; int i; - BUG_ON(!PageLocked(page)); + BUG_ON(!folio_test_locked(folio)); BUG_ON(from > PAGE_SIZE); BUG_ON(to > PAGE_SIZE); BUG_ON(from > to); - if (!page_has_buffers(page)) - create_empty_buffers(page, blocksize, 0); - head = page_buffers(page); + head = folio_buffers(folio); + if (!head) { + create_empty_buffers(&folio->page, blocksize, 0); + head = folio_buffers(folio); + } bbits = ilog2(blocksize); - block = (sector_t)page->index << (PAGE_SHIFT - bbits); + block = (sector_t)folio->index << (PAGE_SHIFT - bbits); for (bh = head, block_start = 0; bh != head || !block_start; block++, block_start = block_end, bh = bh->b_this_page) { block_end = block_start + blocksize; if (block_end <= from || block_start >= to) { - if (PageUptodate(page)) { + if (folio_test_uptodate(folio)) { set_buffer_uptodate(bh); } continue; @@ -1098,19 +1060,20 @@ static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len, if (err) break; if (buffer_new(bh)) { - if (PageUptodate(page)) { + if (folio_test_uptodate(folio)) { clear_buffer_new(bh); set_buffer_uptodate(bh); mark_buffer_dirty(bh); continue; } if (block_end > to || block_start < from) - zero_user_segments(page, to, block_end, - block_start, from); + folio_zero_segments(folio, to, + block_end, + block_start, from); continue; } } - if (PageUptodate(page)) { + if (folio_test_uptodate(folio)) { set_buffer_uptodate(bh); continue; } @@ -1130,14 +1093,13 @@ static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len, err = -EIO; } if (unlikely(err)) { - page_zero_new_buffers(page, from, to); + page_zero_new_buffers(&folio->page, from, to); } else if (fscrypt_inode_uses_fs_layer_crypto(inode)) { for (i = 0; i < nr_wait; i++) { int err2; - err2 = fscrypt_decrypt_pagecache_blocks(page_folio(page), - blocksize, - bh_offset(wait[i])); + err2 = fscrypt_decrypt_pagecache_blocks(folio, + blocksize, bh_offset(wait[i])); if (err2) { clear_buffer_uptodate(wait[i]); err = err2; @@ -1149,6 +1111,13 @@ static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len, } #endif +/* + * To preserve ordering, it is essential that the hole instantiation and + * the data write be encapsulated in a single transaction. We cannot + * close off a transaction and start a new one between the ext4_get_block() + * and the ext4_write_end(). So doing the jbd2_journal_start at the start of + * ext4_write_begin() is the right place. + */ static int ext4_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, struct page **pagep, void **fsdata) @@ -1157,7 +1126,7 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping, int ret, needed_blocks; handle_t *handle; int retries = 0; - struct page *page; + struct folio *folio; pgoff_t index; unsigned from, to; @@ -1184,68 +1153,68 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping, } /* - * grab_cache_page_write_begin() can take a long time if the - * system is thrashing due to memory pressure, or if the page + * __filemap_get_folio() can take a long time if the + * system is thrashing due to memory pressure, or if the folio * is being written back. So grab it first before we start * the transaction handle. This also allows us to allocate - * the page (if needed) without using GFP_NOFS. + * the folio (if needed) without using GFP_NOFS. */ retry_grab: - page = grab_cache_page_write_begin(mapping, index); - if (!page) - return -ENOMEM; + folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN, + mapping_gfp_mask(mapping)); + if (IS_ERR(folio)) + return PTR_ERR(folio); /* * The same as page allocation, we prealloc buffer heads before * starting the handle. */ - if (!page_has_buffers(page)) - create_empty_buffers(page, inode->i_sb->s_blocksize, 0); + if (!folio_buffers(folio)) + create_empty_buffers(&folio->page, inode->i_sb->s_blocksize, 0); - unlock_page(page); + folio_unlock(folio); retry_journal: handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, needed_blocks); if (IS_ERR(handle)) { - put_page(page); + folio_put(folio); return PTR_ERR(handle); } - lock_page(page); - if (page->mapping != mapping) { - /* The page got truncated from under us */ - unlock_page(page); - put_page(page); + folio_lock(folio); + if (folio->mapping != mapping) { + /* The folio got truncated from under us */ + folio_unlock(folio); + folio_put(folio); ext4_journal_stop(handle); goto retry_grab; } - /* In case writeback began while the page was unlocked */ - wait_for_stable_page(page); + /* In case writeback began while the folio was unlocked */ + folio_wait_stable(folio); #ifdef CONFIG_FS_ENCRYPTION if (ext4_should_dioread_nolock(inode)) - ret = ext4_block_write_begin(page, pos, len, + ret = ext4_block_write_begin(folio, pos, len, ext4_get_block_unwritten); else - ret = ext4_block_write_begin(page, pos, len, - ext4_get_block); + ret = ext4_block_write_begin(folio, pos, len, ext4_get_block); #else if (ext4_should_dioread_nolock(inode)) - ret = __block_write_begin(page, pos, len, + ret = __block_write_begin(&folio->page, pos, len, ext4_get_block_unwritten); else - ret = __block_write_begin(page, pos, len, ext4_get_block); + ret = __block_write_begin(&folio->page, pos, len, ext4_get_block); #endif if (!ret && ext4_should_journal_data(inode)) { ret = ext4_walk_page_buffers(handle, inode, - page_buffers(page), from, to, NULL, - do_journal_get_write_access); + folio_buffers(folio), from, to, + NULL, do_journal_get_write_access); } if (ret) { bool extended = (pos + len > inode->i_size) && !ext4_verity_in_progress(inode); - unlock_page(page); + folio_unlock(folio); /* * __block_write_begin may have instantiated a few blocks * outside i_size. Trim these off again. Don't need @@ -1273,10 +1242,10 @@ retry_journal: if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) goto retry_journal; - put_page(page); + folio_put(folio); return ret; } - *pagep = page; + *pagep = &folio->page; return ret; } @@ -1288,7 +1257,7 @@ static int write_end_fn(handle_t *handle, struct inode *inode, if (!buffer_mapped(bh) || buffer_freed(bh)) return 0; set_buffer_uptodate(bh); - ret = ext4_handle_dirty_metadata(handle, NULL, bh); + ret = ext4_dirty_journalled_data(handle, bh); clear_buffer_meta(bh); clear_buffer_prio(bh); return ret; @@ -1306,6 +1275,7 @@ static int ext4_write_end(struct file *file, loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata) { + struct folio *folio = page_folio(page); handle_t *handle = ext4_journal_current_handle(); struct inode *inode = mapping->host; loff_t old_size = inode->i_size; @@ -1321,7 +1291,7 @@ static int ext4_write_end(struct file *file, copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); /* - * it's important to update i_size while still holding page lock: + * it's important to update i_size while still holding folio lock: * page writeout could otherwise come in and zero beyond i_size. * * If FS_IOC_ENABLE_VERITY is running on this inode, then Merkle tree @@ -1329,15 +1299,15 @@ static int ext4_write_end(struct file *file, */ if (!verity) i_size_changed = ext4_update_inode_size(inode, pos + copied); - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); if (old_size < pos && !verity) pagecache_isize_extended(inode, old_size, pos); /* - * Don't mark the inode dirty under page lock. First, it unnecessarily - * makes the holding time of page lock longer. Second, it forces lock - * ordering of page lock and transaction start for journaling + * Don't mark the inode dirty under folio lock. First, it unnecessarily + * makes the holding time of folio lock longer. Second, it forces lock + * ordering of folio lock and transaction start for journaling * filesystems. */ if (i_size_changed) @@ -1371,28 +1341,28 @@ static int ext4_write_end(struct file *file, /* * This is a private version of page_zero_new_buffers() which doesn't * set the buffer to be dirty, since in data=journalled mode we need - * to call ext4_handle_dirty_metadata() instead. + * to call ext4_dirty_journalled_data() instead. */ static void ext4_journalled_zero_new_buffers(handle_t *handle, struct inode *inode, - struct page *page, + struct folio *folio, unsigned from, unsigned to) { unsigned int block_start = 0, block_end; struct buffer_head *head, *bh; - bh = head = page_buffers(page); + bh = head = folio_buffers(folio); do { block_end = block_start + bh->b_size; if (buffer_new(bh)) { if (block_end > from && block_start < to) { - if (!PageUptodate(page)) { + if (!folio_test_uptodate(folio)) { unsigned start, size; start = max(from, block_start); size = min(to, block_end) - start; - zero_user(page, start, size); + folio_zero_range(folio, start, size); write_end_fn(handle, inode, bh); } clear_buffer_new(bh); @@ -1408,6 +1378,7 @@ static int ext4_journalled_write_end(struct file *file, loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata) { + struct folio *folio = page_folio(page); handle_t *handle = ext4_journal_current_handle(); struct inode *inode = mapping->host; loff_t old_size = inode->i_size; @@ -1426,25 +1397,26 @@ static int ext4_journalled_write_end(struct file *file, if (ext4_has_inline_data(inode)) return ext4_write_inline_data_end(inode, pos, len, copied, page); - if (unlikely(copied < len) && !PageUptodate(page)) { + if (unlikely(copied < len) && !folio_test_uptodate(folio)) { copied = 0; - ext4_journalled_zero_new_buffers(handle, inode, page, from, to); + ext4_journalled_zero_new_buffers(handle, inode, folio, + from, to); } else { if (unlikely(copied < len)) - ext4_journalled_zero_new_buffers(handle, inode, page, + ext4_journalled_zero_new_buffers(handle, inode, folio, from + copied, to); - ret = ext4_walk_page_buffers(handle, inode, page_buffers(page), + ret = ext4_walk_page_buffers(handle, inode, + folio_buffers(folio), from, from + copied, &partial, write_end_fn); if (!partial) - SetPageUptodate(page); + folio_mark_uptodate(folio); } if (!verity) size_changed = ext4_update_inode_size(inode, pos + copied); - ext4_set_inode_state(inode, EXT4_STATE_JDATA); EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid; - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); if (old_size < pos && !verity) pagecache_isize_extended(inode, old_size, pos); @@ -1568,6 +1540,7 @@ struct mpage_da_data { struct ext4_io_submit io_submit; /* IO submission data */ unsigned int do_map:1; unsigned int scanned_until_end:1; + unsigned int journalled_more_data:1; }; static void mpage_release_unused_pages(struct mpage_da_data *mpd, @@ -1649,12 +1622,6 @@ static void ext4_print_free_blocks(struct inode *inode) return; } -static int ext4_bh_delay_or_unwritten(handle_t *handle, struct inode *inode, - struct buffer_head *bh) -{ - return (buffer_delay(bh) || buffer_unwritten(bh)) && buffer_dirty(bh); -} - /* * ext4_insert_delayed_block - adds a delayed block to the extents status * tree, incrementing the reserved cluster/block @@ -1887,249 +1854,41 @@ int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, return 0; } -static int __ext4_journalled_writepage(struct page *page, - unsigned int len) +static void mpage_folio_done(struct mpage_da_data *mpd, struct folio *folio) { - struct address_space *mapping = page->mapping; - struct inode *inode = mapping->host; - handle_t *handle = NULL; - int ret = 0, err = 0; - int inline_data = ext4_has_inline_data(inode); - struct buffer_head *inode_bh = NULL; - loff_t size; - - ClearPageChecked(page); - - if (inline_data) { - BUG_ON(page->index != 0); - BUG_ON(len > ext4_get_max_inline_size(inode)); - inode_bh = ext4_journalled_write_inline_data(inode, len, page); - if (inode_bh == NULL) - goto out; - } - /* - * We need to release the page lock before we start the - * journal, so grab a reference so the page won't disappear - * out from under us. - */ - get_page(page); - unlock_page(page); - - handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, - ext4_writepage_trans_blocks(inode)); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - put_page(page); - goto out_no_pagelock; - } - BUG_ON(!ext4_handle_valid(handle)); - - lock_page(page); - put_page(page); - size = i_size_read(inode); - if (page->mapping != mapping || page_offset(page) > size) { - /* The page got truncated from under us */ - ext4_journal_stop(handle); - ret = 0; - goto out; - } - - if (inline_data) { - ret = ext4_mark_inode_dirty(handle, inode); - } else { - struct buffer_head *page_bufs = page_buffers(page); - - if (page->index == size >> PAGE_SHIFT) - len = size & ~PAGE_MASK; - else - len = PAGE_SIZE; - - ret = ext4_walk_page_buffers(handle, inode, page_bufs, 0, len, - NULL, do_journal_get_write_access); - - err = ext4_walk_page_buffers(handle, inode, page_bufs, 0, len, - NULL, write_end_fn); - } - if (ret == 0) - ret = err; - err = ext4_jbd2_inode_add_write(handle, inode, page_offset(page), len); - if (ret == 0) - ret = err; - EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid; - err = ext4_journal_stop(handle); - if (!ret) - ret = err; - - ext4_set_inode_state(inode, EXT4_STATE_JDATA); -out: - unlock_page(page); -out_no_pagelock: - brelse(inode_bh); - return ret; + mpd->first_page += folio_nr_pages(folio); + folio_unlock(folio); } -/* - * Note that we don't need to start a transaction unless we're journaling data - * because we should have holes filled from ext4_page_mkwrite(). We even don't - * need to file the inode to the transaction's list in ordered mode because if - * we are writing back data added by write(), the inode is already there and if - * we are writing back data modified via mmap(), no one guarantees in which - * transaction the data will hit the disk. In case we are journaling data, we - * cannot start transaction directly because transaction start ranks above page - * lock so we have to do some magic. - * - * This function can get called via... - * - ext4_writepages after taking page lock (have journal handle) - * - journal_submit_inode_data_buffers (no journal handle) - * - shrink_page_list via the kswapd/direct reclaim (no journal handle) - * - grab_page_cache when doing write_begin (have journal handle) - * - * We don't do any block allocation in this function. If we have page with - * multiple blocks we need to write those buffer_heads that are mapped. This - * is important for mmaped based write. So if we do with blocksize 1K - * truncate(f, 1024); - * a = mmap(f, 0, 4096); - * a[0] = 'a'; - * truncate(f, 4096); - * we have in the page first buffer_head mapped via page_mkwrite call back - * but other buffer_heads would be unmapped but dirty (dirty done via the - * do_wp_page). So writepage should write the first block. If we modify - * the mmap area beyond 1024 we will again get a page_fault and the - * page_mkwrite callback will do the block allocation and mark the - * buffer_heads mapped. - * - * We redirty the page if we have any buffer_heads that is either delay or - * unwritten in the page. - * - * We can get recursively called as show below. - * - * ext4_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() -> - * ext4_writepage() - * - * But since we don't do any block allocation we should not deadlock. - * Page also have the dirty flag cleared so we don't get recurive page_lock. - */ -static int ext4_writepage(struct page *page, - struct writeback_control *wbc) +static int mpage_submit_folio(struct mpage_da_data *mpd, struct folio *folio) { - struct folio *folio = page_folio(page); - int ret = 0; - loff_t size; - unsigned int len; - struct buffer_head *page_bufs = NULL; - struct inode *inode = page->mapping->host; - struct ext4_io_submit io_submit; - - if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) { - folio_invalidate(folio, 0, folio_size(folio)); - folio_unlock(folio); - return -EIO; - } - - trace_ext4_writepage(page); - size = i_size_read(inode); - if (page->index == size >> PAGE_SHIFT && - !ext4_verity_in_progress(inode)) - len = size & ~PAGE_MASK; - else - len = PAGE_SIZE; - - /* Should never happen but for bugs in other kernel subsystems */ - if (!page_has_buffers(page)) { - ext4_warning_inode(inode, - "page %lu does not have buffers attached", page->index); - ClearPageDirty(page); - unlock_page(page); - return 0; - } - - page_bufs = page_buffers(page); - /* - * We cannot do block allocation or other extent handling in this - * function. If there are buffers needing that, we have to redirty - * the page. But we may reach here when we do a journal commit via - * journal_submit_inode_data_buffers() and in that case we must write - * allocated buffers to achieve data=ordered mode guarantees. - * - * Also, if there is only one buffer per page (the fs block - * size == the page size), if one buffer needs block - * allocation or needs to modify the extent tree to clear the - * unwritten flag, we know that the page can't be written at - * all, so we might as well refuse the write immediately. - * Unfortunately if the block size != page size, we can't as - * easily detect this case using ext4_walk_page_buffers(), but - * for the extremely common case, this is an optimization that - * skips a useless round trip through ext4_bio_write_page(). - */ - if (ext4_walk_page_buffers(NULL, inode, page_bufs, 0, len, NULL, - ext4_bh_delay_or_unwritten)) { - redirty_page_for_writepage(wbc, page); - if ((current->flags & PF_MEMALLOC) || - (inode->i_sb->s_blocksize == PAGE_SIZE)) { - /* - * For memory cleaning there's no point in writing only - * some buffers. So just bail out. Warn if we came here - * from direct reclaim. - */ - WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD)) - == PF_MEMALLOC); - unlock_page(page); - return 0; - } - } - - if (PageChecked(page) && ext4_should_journal_data(inode)) - /* - * It's mmapped pagecache. Add buffers and journal it. There - * doesn't seem much point in redirtying the page here. - */ - return __ext4_journalled_writepage(page, len); - - ext4_io_submit_init(&io_submit, wbc); - io_submit.io_end = ext4_init_io_end(inode, GFP_NOFS); - if (!io_submit.io_end) { - redirty_page_for_writepage(wbc, page); - unlock_page(page); - return -ENOMEM; - } - ret = ext4_bio_write_page(&io_submit, page, len); - ext4_io_submit(&io_submit); - /* Drop io_end reference we got from init */ - ext4_put_io_end_defer(io_submit.io_end); - return ret; -} - -static int mpage_submit_page(struct mpage_da_data *mpd, struct page *page) -{ - int len; + size_t len; loff_t size; int err; - BUG_ON(page->index != mpd->first_page); - clear_page_dirty_for_io(page); + BUG_ON(folio->index != mpd->first_page); + folio_clear_dirty_for_io(folio); /* * We have to be very careful here! Nothing protects writeback path * against i_size changes and the page can be writeably mapped into * page tables. So an application can be growing i_size and writing - * data through mmap while writeback runs. clear_page_dirty_for_io() + * data through mmap while writeback runs. folio_clear_dirty_for_io() * write-protects our page in page tables and the page cannot get - * written to again until we release page lock. So only after - * clear_page_dirty_for_io() we are safe to sample i_size for - * ext4_bio_write_page() to zero-out tail of the written page. We rely - * on the barrier provided by TestClearPageDirty in - * clear_page_dirty_for_io() to make sure i_size is really sampled only + * written to again until we release folio lock. So only after + * folio_clear_dirty_for_io() we are safe to sample i_size for + * ext4_bio_write_folio() to zero-out tail of the written page. We rely + * on the barrier provided by folio_test_clear_dirty() in + * folio_clear_dirty_for_io() to make sure i_size is really sampled only * after page tables are updated. */ size = i_size_read(mpd->inode); - if (page->index == size >> PAGE_SHIFT && + len = folio_size(folio); + if (folio_pos(folio) + len > size && !ext4_verity_in_progress(mpd->inode)) len = size & ~PAGE_MASK; - else - len = PAGE_SIZE; - err = ext4_bio_write_page(&mpd->io_submit, page, len); + err = ext4_bio_write_folio(&mpd->io_submit, folio, len); if (!err) mpd->wbc->nr_to_write--; - mpd->first_page++; return err; } @@ -2240,9 +1999,10 @@ static int mpage_process_page_bufs(struct mpage_da_data *mpd, } while (lblk++, (bh = bh->b_this_page) != head); /* So far everything mapped? Submit the page for IO. */ if (mpd->map.m_len == 0) { - err = mpage_submit_page(mpd, head->b_page); + err = mpage_submit_folio(mpd, head->b_folio); if (err < 0) return err; + mpage_folio_done(mpd, head->b_folio); } if (lblk >= blocks) { mpd->scanned_until_end = 1; @@ -2252,21 +2012,22 @@ static int mpage_process_page_bufs(struct mpage_da_data *mpd, } /* - * mpage_process_page - update page buffers corresponding to changed extent and - * may submit fully mapped page for IO - * - * @mpd - description of extent to map, on return next extent to map - * @m_lblk - logical block mapping. - * @m_pblk - corresponding physical mapping. - * @map_bh - determines on return whether this page requires any further + * mpage_process_folio - update folio buffers corresponding to changed extent + * and may submit fully mapped page for IO + * @mpd: description of extent to map, on return next extent to map + * @folio: Contains these buffers. + * @m_lblk: logical block mapping. + * @m_pblk: corresponding physical mapping. + * @map_bh: determines on return whether this page requires any further * mapping or not. - * Scan given page buffers corresponding to changed extent and update buffer + * + * Scan given folio buffers corresponding to changed extent and update buffer * state according to new extent state. * We map delalloc buffers to their physical location, clear unwritten bits. - * If the given page is not fully mapped, we update @map to the next extent in - * the given page that needs mapping & return @map_bh as true. + * If the given folio is not fully mapped, we update @mpd to the next extent in + * the given folio that needs mapping & return @map_bh as true. */ -static int mpage_process_page(struct mpage_da_data *mpd, struct page *page, +static int mpage_process_folio(struct mpage_da_data *mpd, struct folio *folio, ext4_lblk_t *m_lblk, ext4_fsblk_t *m_pblk, bool *map_bh) { @@ -2279,14 +2040,14 @@ static int mpage_process_page(struct mpage_da_data *mpd, struct page *page, ssize_t io_end_size = 0; struct ext4_io_end_vec *io_end_vec = ext4_last_io_end_vec(io_end); - bh = head = page_buffers(page); + bh = head = folio_buffers(folio); do { if (lblk < mpd->map.m_lblk) continue; if (lblk >= mpd->map.m_lblk + mpd->map.m_len) { /* * Buffer after end of mapped extent. - * Find next buffer in the page to map. + * Find next buffer in the folio to map. */ mpd->map.m_len = 0; mpd->map.m_flags = 0; @@ -2359,9 +2120,9 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd) if (nr == 0) break; for (i = 0; i < nr; i++) { - struct page *page = &fbatch.folios[i]->page; + struct folio *folio = fbatch.folios[i]; - err = mpage_process_page(mpd, page, &lblk, &pblock, + err = mpage_process_folio(mpd, folio, &lblk, &pblock, &map_bh); /* * If map_bh is true, means page may require further bh @@ -2371,9 +2132,10 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd) if (err < 0 || map_bh) goto out; /* Page fully mapped - let IO run! */ - err = mpage_submit_page(mpd, page); + err = mpage_submit_folio(mpd, folio); if (err < 0) goto out; + mpage_folio_done(mpd, folio); } folio_batch_release(&fbatch); } @@ -2559,17 +2321,45 @@ static int ext4_da_writepages_trans_blocks(struct inode *inode) MAX_WRITEPAGES_EXTENT_LEN + bpp - 1, bpp); } -/* Return true if the page needs to be written as part of transaction commit */ -static bool ext4_page_nomap_can_writeout(struct page *page) +static int ext4_journal_page_buffers(handle_t *handle, struct page *page, + int len) { - struct buffer_head *bh, *head; + struct buffer_head *page_bufs = page_buffers(page); + struct inode *inode = page->mapping->host; + int ret, err; - bh = head = page_buffers(page); - do { - if (buffer_dirty(bh) && buffer_mapped(bh) && !buffer_delay(bh)) - return true; - } while ((bh = bh->b_this_page) != head); - return false; + ret = ext4_walk_page_buffers(handle, inode, page_bufs, 0, len, + NULL, do_journal_get_write_access); + err = ext4_walk_page_buffers(handle, inode, page_bufs, 0, len, + NULL, write_end_fn); + if (ret == 0) + ret = err; + err = ext4_jbd2_inode_add_write(handle, inode, page_offset(page), len); + if (ret == 0) + ret = err; + EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid; + + return ret; +} + +static int mpage_journal_page_buffers(handle_t *handle, + struct mpage_da_data *mpd, + struct page *page) +{ + struct inode *inode = mpd->inode; + loff_t size = i_size_read(inode); + int len; + + ClearPageChecked(page); + mpd->wbc->nr_to_write--; + + if (page->index == size >> PAGE_SHIFT && + !ext4_verity_in_progress(inode)) + len = size & ~PAGE_MASK; + else + len = PAGE_SIZE; + + return ext4_journal_page_buffers(handle, page, len); } /* @@ -2597,7 +2387,6 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) struct address_space *mapping = mpd->inode->i_mapping; struct folio_batch fbatch; unsigned int nr_folios; - long left = mpd->wbc->nr_to_write; pgoff_t index = mpd->first_page; pgoff_t end = mpd->last_page; xa_mark_t tag; @@ -2605,14 +2394,23 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) int blkbits = mpd->inode->i_blkbits; ext4_lblk_t lblk; struct buffer_head *head; + handle_t *handle = NULL; + int bpp = ext4_journal_blocks_per_page(mpd->inode); if (mpd->wbc->sync_mode == WB_SYNC_ALL || mpd->wbc->tagged_writepages) tag = PAGECACHE_TAG_TOWRITE; else tag = PAGECACHE_TAG_DIRTY; - folio_batch_init(&fbatch); + mpd->map.m_len = 0; mpd->next_page = index; + if (ext4_should_journal_data(mpd->inode)) { + handle = ext4_journal_start(mpd->inode, EXT4_HT_WRITE_PAGE, + bpp); + if (IS_ERR(handle)) + return PTR_ERR(handle); + } + folio_batch_init(&fbatch); while (index <= end) { nr_folios = filemap_get_folios_tag(mapping, &index, end, tag, &fbatch); @@ -2630,13 +2428,22 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) * newly appeared dirty pages, but have not synced all * of the old dirty pages. */ - if (mpd->wbc->sync_mode == WB_SYNC_NONE && left <= 0) + if (mpd->wbc->sync_mode == WB_SYNC_NONE && + mpd->wbc->nr_to_write <= + mpd->map.m_len >> (PAGE_SHIFT - blkbits)) goto out; /* If we can't merge this page, we are done. */ if (mpd->map.m_len > 0 && mpd->next_page != folio->index) goto out; + if (handle) { + err = ext4_journal_ensure_credits(handle, bpp, + 0); + if (err < 0) + goto out; + } + folio_lock(folio); /* * If the page is no longer dirty, or its mapping no @@ -2676,18 +2483,28 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) mpd->first_page = folio->index; mpd->next_page = folio->index + folio_nr_pages(folio); /* - * Writeout for transaction commit where we cannot - * modify metadata is simple. Just submit the page. + * Writeout when we cannot modify metadata is simple. + * Just submit the page. For data=journal mode we + * first handle writeout of the page for checkpoint and + * only after that handle delayed page dirtying. This + * makes sure current data is checkpointed to the final + * location before possibly journalling it again which + * is desirable when the page is frequently dirtied + * through a pin. */ if (!mpd->can_map) { - if (ext4_page_nomap_can_writeout(&folio->page)) { - err = mpage_submit_page(mpd, &folio->page); + err = mpage_submit_folio(mpd, folio); + if (err < 0) + goto out; + /* Pending dirtying of journalled data? */ + if (folio_test_checked(folio)) { + err = mpage_journal_page_buffers(handle, + mpd, &folio->page); if (err < 0) goto out; - } else { - folio_unlock(folio); - mpd->first_page += folio_nr_pages(folio); + mpd->journalled_more_data = 1; } + mpage_folio_done(mpd, folio); } else { /* Add all dirty buffers to mpd */ lblk = ((ext4_lblk_t)folio->index) << @@ -2699,24 +2516,21 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) goto out; err = 0; } - left -= folio_nr_pages(folio); } folio_batch_release(&fbatch); cond_resched(); } mpd->scanned_until_end = 1; + if (handle) + ext4_journal_stop(handle); return 0; out: folio_batch_release(&fbatch); + if (handle) + ext4_journal_stop(handle); return err; } -static int ext4_writepage_cb(struct folio *folio, struct writeback_control *wbc, - void *data) -{ - return ext4_writepage(&folio->page, wbc); -} - static int ext4_do_writepages(struct mpage_da_data *mpd) { struct writeback_control *wbc = mpd->wbc; @@ -2742,13 +2556,6 @@ static int ext4_do_writepages(struct mpage_da_data *mpd) if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) goto out_writepages; - if (ext4_should_journal_data(inode)) { - blk_start_plug(&plug); - ret = write_cache_pages(mapping, wbc, ext4_writepage_cb, NULL); - blk_finish_plug(&plug); - goto out_writepages; - } - /* * If the filesystem has aborted, it is read-only, so return * right away instead of dumping stack traces later on that @@ -2783,6 +2590,26 @@ static int ext4_do_writepages(struct mpage_da_data *mpd) ext4_journal_stop(handle); } + /* + * data=journal mode does not do delalloc so we just need to writeout / + * journal already mapped buffers. On the other hand we need to commit + * transaction to make data stable. We expect all the data to be + * already in the journal (the only exception are DMA pinned pages + * dirtied behind our back) so we commit transaction here and run the + * writeback loop to checkpoint them. The checkpointing is not actually + * necessary to make data persistent *but* quite a few places (extent + * shifting operations, fsverity, ...) depend on being able to drop + * pagecache pages after calling filemap_write_and_wait() and for that + * checkpointing needs to happen. + */ + if (ext4_should_journal_data(inode)) { + mpd->can_map = 0; + if (wbc->sync_mode == WB_SYNC_ALL) + ext4_fc_commit(sbi->s_journal, + EXT4_I(inode)->i_datasync_tid); + } + mpd->journalled_more_data = 0; + if (ext4_should_dioread_nolock(inode)) { /* * We may need to convert up to one extent per block in @@ -2956,13 +2783,21 @@ static int ext4_writepages(struct address_space *mapping, .can_map = 1, }; int ret; + int alloc_ctx; if (unlikely(ext4_forced_shutdown(EXT4_SB(sb)))) return -EIO; - percpu_down_read(&EXT4_SB(sb)->s_writepages_rwsem); + alloc_ctx = ext4_writepages_down_read(sb); ret = ext4_do_writepages(&mpd); - percpu_up_read(&EXT4_SB(sb)->s_writepages_rwsem); + /* + * For data=journal writeback we could have come across pages marked + * for delayed dirtying (PageChecked) which were just added to the + * running transaction. Try once more to get them to stable storage. + */ + if (!ret && mpd.journalled_more_data) + ret = ext4_do_writepages(&mpd); + ext4_writepages_up_read(sb, alloc_ctx); return ret; } @@ -2990,17 +2825,18 @@ static int ext4_dax_writepages(struct address_space *mapping, long nr_to_write = wbc->nr_to_write; struct inode *inode = mapping->host; struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); + int alloc_ctx; if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) return -EIO; - percpu_down_read(&sbi->s_writepages_rwsem); + alloc_ctx = ext4_writepages_down_read(inode->i_sb); trace_ext4_writepages(inode, wbc); ret = dax_writeback_mapping_range(mapping, sbi->s_daxdev, wbc); trace_ext4_writepages_result(inode, wbc, ret, nr_to_write - wbc->nr_to_write); - percpu_up_read(&sbi->s_writepages_rwsem); + ext4_writepages_up_read(inode->i_sb, alloc_ctx); return ret; } @@ -3043,7 +2879,7 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping, struct page **pagep, void **fsdata) { int ret, retries = 0; - struct page *page; + struct folio *folio; pgoff_t index; struct inode *inode = mapping->host; @@ -3070,22 +2906,22 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping, } retry: - page = grab_cache_page_write_begin(mapping, index); - if (!page) - return -ENOMEM; + folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN, + mapping_gfp_mask(mapping)); + if (IS_ERR(folio)) + return PTR_ERR(folio); - /* In case writeback began while the page was unlocked */ - wait_for_stable_page(page); + /* In case writeback began while the folio was unlocked */ + folio_wait_stable(folio); #ifdef CONFIG_FS_ENCRYPTION - ret = ext4_block_write_begin(page, pos, len, - ext4_da_get_block_prep); + ret = ext4_block_write_begin(folio, pos, len, ext4_da_get_block_prep); #else - ret = __block_write_begin(page, pos, len, ext4_da_get_block_prep); + ret = __block_write_begin(&folio->page, pos, len, ext4_da_get_block_prep); #endif if (ret < 0) { - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); /* * block_write_begin may have instantiated a few blocks * outside i_size. Trim these off again. Don't need @@ -3100,7 +2936,7 @@ retry: return ret; } - *pagep = page; + *pagep = &folio->page; return ret; } @@ -3148,6 +2984,9 @@ static int ext4_da_write_end(struct file *file, ext4_has_inline_data(inode)) return ext4_write_inline_data_end(inode, pos, len, copied, page); + if (unlikely(copied < len) && !PageUptodate(page)) + copied = 0; + start = pos & (PAGE_SIZE - 1); end = start + copied - 1; @@ -3159,9 +2998,8 @@ static int ext4_da_write_end(struct file *file, * i_disksize since writeback will push i_disksize upto i_size * eventually. If the end of the current write is > i_size and * inside an allocated block (ext4_da_should_update_i_disksize() - * check), we need to update i_disksize here as neither - * ext4_writepage() nor certain ext4_writepages() paths not - * allocating blocks update i_disksize. + * check), we need to update i_disksize here as certain + * ext4_writepages() paths not allocating blocks update i_disksize. * * Note that we defer inode dirtying to generic_write_end() / * ext4_da_write_inline_data_end(). @@ -3235,9 +3073,7 @@ int ext4_alloc_da_blocks(struct inode *inode) static sector_t ext4_bmap(struct address_space *mapping, sector_t block) { struct inode *inode = mapping->host; - journal_t *journal; sector_t ret = 0; - int err; inode_lock_shared(inode); /* @@ -3247,45 +3083,16 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block) goto out; if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) && - test_opt(inode->i_sb, DELALLOC)) { + (test_opt(inode->i_sb, DELALLOC) || + ext4_should_journal_data(inode))) { /* - * With delalloc we want to sync the file - * so that we can make sure we allocate - * blocks for file + * With delalloc or journalled data we want to sync the file so + * that we can make sure we allocate blocks for file and data + * is in place for the user to see it */ filemap_write_and_wait(mapping); } - if (EXT4_JOURNAL(inode) && - ext4_test_inode_state(inode, EXT4_STATE_JDATA)) { - /* - * This is a REALLY heavyweight approach, but the use of - * bmap on dirty files is expected to be extremely rare: - * only if we run lilo or swapon on a freshly made file - * do we expect this to happen. - * - * (bmap requires CAP_SYS_RAWIO so this does not - * represent an unprivileged user DOS attack --- we'd be - * in trouble if mortal users could trigger this path at - * will.) - * - * NB. EXT4_STATE_JDATA is not set on files other than - * regular files. If somebody wants to bmap a directory - * or symlink and gets confused because the buffer - * hasn't yet been flushed to disk, they deserve - * everything they get. - */ - - ext4_clear_inode_state(inode, EXT4_STATE_JDATA); - journal = EXT4_JOURNAL(inode); - jbd2_journal_lock_updates(journal); - err = jbd2_journal_flush(journal, 0); - jbd2_journal_unlock_updates(journal); - - if (err) - goto out; - } - ret = iomap_bmap(mapping, block, &ext4_iomap_ops); out: @@ -3295,17 +3102,16 @@ out: static int ext4_read_folio(struct file *file, struct folio *folio) { - struct page *page = &folio->page; int ret = -EAGAIN; - struct inode *inode = page->mapping->host; + struct inode *inode = folio->mapping->host; - trace_ext4_readpage(page); + trace_ext4_readpage(&folio->page); if (ext4_has_inline_data(inode)) - ret = ext4_readpage_inline(inode, page); + ret = ext4_readpage_inline(inode, folio); if (ret == -EAGAIN) - return ext4_mpage_readpages(inode, NULL, page); + return ext4_mpage_readpages(inode, NULL, folio); return ret; } @@ -3571,7 +3377,7 @@ static int ext4_iomap_overwrite_begin(struct inode *inode, loff_t offset, */ flags &= ~IOMAP_WRITE; ret = ext4_iomap_begin(inode, offset, length, flags, iomap, srcmap); - WARN_ON_ONCE(iomap->type != IOMAP_MAPPED); + WARN_ON_ONCE(!ret && iomap->type != IOMAP_MAPPED); return ret; } @@ -3686,24 +3492,26 @@ const struct iomap_ops ext4_iomap_report_ops = { }; /* - * Whenever the folio is being dirtied, corresponding buffers should already - * be attached to the transaction (we take care of this in ext4_page_mkwrite() - * and ext4_write_begin()). However we cannot move buffers to dirty transaction - * lists here because ->dirty_folio is called under VFS locks and the folio - * is not necessarily locked. - * - * We cannot just dirty the folio and leave attached buffers clean, because the - * buffers' dirty state is "definitive". We cannot just set the buffers dirty - * or jbddirty because all the journalling code will explode. - * - * So what we do is to mark the folio "pending dirty" and next time writepage - * is called, propagate that into the buffers appropriately. + * For data=journal mode, folio should be marked dirty only when it was + * writeably mapped. When that happens, it was already attached to the + * transaction and marked as jbddirty (we take care of this in + * ext4_page_mkwrite()). On transaction commit, we writeprotect page mappings + * so we should have nothing to do here, except for the case when someone + * had the page pinned and dirtied the page through this pin (e.g. by doing + * direct IO to it). In that case we'd need to attach buffers here to the + * transaction but we cannot due to lock ordering. We cannot just dirty the + * folio and leave attached buffers clean, because the buffers' dirty state is + * "definitive". We cannot just set the buffers dirty or jbddirty because all + * the journalling code will explode. So what we do is to mark the folio + * "pending dirty" and next time ext4_writepages() is called, attach buffers + * to the transaction appropriately. */ static bool ext4_journalled_dirty_folio(struct address_space *mapping, struct folio *folio) { WARN_ON_ONCE(!folio_buffers(folio)); - folio_set_checked(folio); + if (folio_maybe_dma_pinned(folio)) + folio_set_checked(folio); return filemap_dirty_folio(mapping, folio); } @@ -3809,23 +3617,26 @@ static int __ext4_block_zero_page_range(handle_t *handle, ext4_lblk_t iblock; struct inode *inode = mapping->host; struct buffer_head *bh; - struct page *page; + struct folio *folio; int err = 0; - page = find_or_create_page(mapping, from >> PAGE_SHIFT, - mapping_gfp_constraint(mapping, ~__GFP_FS)); - if (!page) - return -ENOMEM; + folio = __filemap_get_folio(mapping, from >> PAGE_SHIFT, + FGP_LOCK | FGP_ACCESSED | FGP_CREAT, + mapping_gfp_constraint(mapping, ~__GFP_FS)); + if (IS_ERR(folio)) + return PTR_ERR(folio); blocksize = inode->i_sb->s_blocksize; iblock = index << (PAGE_SHIFT - inode->i_sb->s_blocksize_bits); - if (!page_has_buffers(page)) - create_empty_buffers(page, blocksize, 0); + bh = folio_buffers(folio); + if (!bh) { + create_empty_buffers(&folio->page, blocksize, 0); + bh = folio_buffers(folio); + } /* Find the buffer that contains "offset" */ - bh = page_buffers(page); pos = blocksize; while (offset >= pos) { bh = bh->b_this_page; @@ -3847,7 +3658,7 @@ static int __ext4_block_zero_page_range(handle_t *handle, } /* Ok, it's mapped. Make sure it's up-to-date */ - if (PageUptodate(page)) + if (folio_test_uptodate(folio)) set_buffer_uptodate(bh); if (!buffer_uptodate(bh)) { @@ -3857,7 +3668,7 @@ static int __ext4_block_zero_page_range(handle_t *handle, if (fscrypt_inode_uses_fs_layer_crypto(inode)) { /* We expect the key to be set. */ BUG_ON(!fscrypt_has_encryption_key(inode)); - err = fscrypt_decrypt_pagecache_blocks(page_folio(page), + err = fscrypt_decrypt_pagecache_blocks(folio, blocksize, bh_offset(bh)); if (err) { @@ -3873,11 +3684,11 @@ static int __ext4_block_zero_page_range(handle_t *handle, if (err) goto unlock; } - zero_user(page, offset, length); + folio_zero_range(folio, offset, length); BUFFER_TRACE(bh, "zeroed end of block"); if (ext4_should_journal_data(inode)) { - err = ext4_handle_dirty_metadata(handle, inode, bh); + err = ext4_dirty_journalled_data(handle, bh); } else { err = 0; mark_buffer_dirty(bh); @@ -3887,8 +3698,8 @@ static int __ext4_block_zero_page_range(handle_t *handle, } unlock: - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); return err; } @@ -5385,7 +5196,7 @@ static void ext4_wait_for_tail_page_commit(struct inode *inode) * If the folio is fully truncated, we don't need to wait for any commit * (and we even should not as __ext4_journalled_invalidate_folio() may * strip all buffers from the folio but keep the folio dirty which can then - * confuse e.g. concurrent ext4_writepage() seeing dirty folio without + * confuse e.g. concurrent ext4_writepages() seeing dirty folio without * buffers). Also we don't need to wait for any commit if all buffers in * the folio remain valid. This is most beneficial for the common case of * blocksize == PAGESIZE. @@ -5395,7 +5206,7 @@ static void ext4_wait_for_tail_page_commit(struct inode *inode) while (1) { struct folio *folio = filemap_lock_folio(inode->i_mapping, inode->i_size >> PAGE_SHIFT); - if (!folio) + if (IS_ERR(folio)) return; ret = __ext4_journalled_invalidate_folio(folio, offset, folio_size(folio) - offset); @@ -6119,7 +5930,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val) journal_t *journal; handle_t *handle; int err; - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + int alloc_ctx; /* * We have to be very careful here: changing a data block's @@ -6157,7 +5968,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val) } } - percpu_down_write(&sbi->s_writepages_rwsem); + alloc_ctx = ext4_writepages_down_write(inode->i_sb); jbd2_journal_lock_updates(journal); /* @@ -6174,7 +5985,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val) err = jbd2_journal_flush(journal, 0); if (err < 0) { jbd2_journal_unlock_updates(journal); - percpu_up_write(&sbi->s_writepages_rwsem); + ext4_writepages_up_write(inode->i_sb, alloc_ctx); return err; } ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA); @@ -6182,7 +5993,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val) ext4_set_aops(inode); jbd2_journal_unlock_updates(journal); - percpu_up_write(&sbi->s_writepages_rwsem); + ext4_writepages_up_write(inode->i_sb, alloc_ctx); if (val) filemap_invalidate_unlock(inode->i_mapping); @@ -6212,7 +6023,7 @@ static int ext4_bh_unmapped(handle_t *handle, struct inode *inode, vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; - struct page *page = vmf->page; + struct folio *folio = page_folio(vmf->page); loff_t size; unsigned long len; int err; @@ -6256,19 +6067,18 @@ vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf) goto out_ret; } - lock_page(page); + folio_lock(folio); size = i_size_read(inode); /* Page got truncated from under us? */ - if (page->mapping != mapping || page_offset(page) > size) { - unlock_page(page); + if (folio->mapping != mapping || folio_pos(folio) > size) { + folio_unlock(folio); ret = VM_FAULT_NOPAGE; goto out; } - if (page->index == size >> PAGE_SHIFT) - len = size & ~PAGE_MASK; - else - len = PAGE_SIZE; + len = folio_size(folio); + if (folio_pos(folio) + len > size) + len = size - folio_pos(folio); /* * Return if we have all the buffers mapped. This avoids the need to do * journal_start/journal_stop which can block and take a long time @@ -6276,17 +6086,17 @@ vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf) * This cannot be done for data journalling, as we have to add the * inode to the transaction's list to writeprotect pages on commit. */ - if (page_has_buffers(page)) { - if (!ext4_walk_page_buffers(NULL, inode, page_buffers(page), + if (folio_buffers(folio)) { + if (!ext4_walk_page_buffers(NULL, inode, folio_buffers(folio), 0, len, NULL, ext4_bh_unmapped)) { /* Wait so that we don't change page under IO */ - wait_for_stable_page(page); + folio_wait_stable(folio); ret = VM_FAULT_LOCKED; goto out; } } - unlock_page(page); + folio_unlock(folio); /* OK, we need to fill the hole... */ if (ext4_should_dioread_nolock(inode)) get_block = ext4_get_block_unwritten; @@ -6307,36 +6117,25 @@ retry_alloc: if (!ext4_should_journal_data(inode)) { err = block_page_mkwrite(vma, vmf, get_block); } else { - lock_page(page); + folio_lock(folio); size = i_size_read(inode); /* Page got truncated from under us? */ - if (page->mapping != mapping || page_offset(page) > size) { + if (folio->mapping != mapping || folio_pos(folio) > size) { ret = VM_FAULT_NOPAGE; goto out_error; } - if (page->index == size >> PAGE_SHIFT) - len = size & ~PAGE_MASK; - else - len = PAGE_SIZE; + len = folio_size(folio); + if (folio_pos(folio) + len > size) + len = size - folio_pos(folio); - err = __block_write_begin(page, 0, len, ext4_get_block); + err = __block_write_begin(&folio->page, 0, len, ext4_get_block); if (!err) { ret = VM_FAULT_SIGBUS; - if (ext4_walk_page_buffers(handle, inode, - page_buffers(page), 0, len, NULL, - do_journal_get_write_access)) + if (ext4_journal_page_buffers(handle, &folio->page, len)) goto out_error; - if (ext4_walk_page_buffers(handle, inode, - page_buffers(page), 0, len, NULL, - write_end_fn)) - goto out_error; - if (ext4_jbd2_inode_add_write(handle, inode, - page_offset(page), len)) - goto out_error; - ext4_set_inode_state(inode, EXT4_STATE_JDATA); } else { - unlock_page(page); + folio_unlock(folio); } } ext4_journal_stop(handle); @@ -6349,7 +6148,7 @@ out: sb_end_pagefault(inode->i_sb); return ret; out_error: - unlock_page(page); + folio_unlock(folio); ext4_journal_stop(handle); goto out; } diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 5b2ae37a8b80..7b2e36d103cb 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -745,6 +745,8 @@ static int __mb_check_buddy(struct ext4_buddy *e4b, char *file, MB_CHECK_ASSERT(e4b->bd_info->bb_fragments == fragments); grp = ext4_get_group_info(sb, e4b->bd_group); + if (!grp) + return NULL; list_for_each(cur, &grp->bb_prealloc_list) { ext4_group_t groupnr; struct ext4_prealloc_space *pa; @@ -1060,9 +1062,9 @@ mb_set_largest_free_order(struct super_block *sb, struct ext4_group_info *grp) static noinline_for_stack void ext4_mb_generate_buddy(struct super_block *sb, - void *buddy, void *bitmap, ext4_group_t group) + void *buddy, void *bitmap, ext4_group_t group, + struct ext4_group_info *grp) { - struct ext4_group_info *grp = ext4_get_group_info(sb, group); struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb); ext4_grpblk_t i = 0; @@ -1168,10 +1170,8 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp) if (groups_per_page > 1) { i = sizeof(struct buffer_head *) * groups_per_page; bh = kzalloc(i, gfp); - if (bh == NULL) { - err = -ENOMEM; - goto out; - } + if (bh == NULL) + return -ENOMEM; } else bh = &bhs; @@ -1183,6 +1183,8 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp) break; grinfo = ext4_get_group_info(sb, group); + if (!grinfo) + continue; /* * If page is uptodate then we came here after online resize * which added some new uninitialized group info structs, so @@ -1248,6 +1250,10 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp) group, page->index, i * blocksize); trace_ext4_mb_buddy_bitmap_load(sb, group); grinfo = ext4_get_group_info(sb, group); + if (!grinfo) { + err = -EFSCORRUPTED; + goto out; + } grinfo->bb_fragments = 0; memset(grinfo->bb_counters, 0, sizeof(*grinfo->bb_counters) * @@ -1258,7 +1264,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp) ext4_lock_group(sb, group); /* init the buddy */ memset(data, 0xff, blocksize); - ext4_mb_generate_buddy(sb, data, incore, group); + ext4_mb_generate_buddy(sb, data, incore, group, grinfo); ext4_unlock_group(sb, group); incore = NULL; } else { @@ -1372,6 +1378,9 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group, gfp_t gfp) might_sleep(); mb_debug(sb, "init group %u\n", group); this_grp = ext4_get_group_info(sb, group); + if (!this_grp) + return -EFSCORRUPTED; + /* * This ensures that we don't reinit the buddy cache * page which map to the group from which we are already @@ -1446,6 +1455,8 @@ ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group, blocks_per_page = PAGE_SIZE / sb->s_blocksize; grp = ext4_get_group_info(sb, group); + if (!grp) + return -EFSCORRUPTED; e4b->bd_blkbits = sb->s_blocksize_bits; e4b->bd_info = grp; @@ -1489,7 +1500,13 @@ ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group, put_page(page); page = find_or_create_page(inode->i_mapping, pnum, gfp); if (page) { - BUG_ON(page->mapping != inode->i_mapping); + if (WARN_RATELIMIT(page->mapping != inode->i_mapping, + "ext4: bitmap's paging->mapping != inode->i_mapping\n")) { + /* should never happen */ + unlock_page(page); + ret = -EINVAL; + goto err; + } if (!PageUptodate(page)) { ret = ext4_mb_init_cache(page, NULL, gfp); if (ret) { @@ -1525,7 +1542,13 @@ ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group, put_page(page); page = find_or_create_page(inode->i_mapping, pnum, gfp); if (page) { - BUG_ON(page->mapping != inode->i_mapping); + if (WARN_RATELIMIT(page->mapping != inode->i_mapping, + "ext4: buddy bitmap's page->mapping != inode->i_mapping\n")) { + /* should never happen */ + unlock_page(page); + ret = -EINVAL; + goto err; + } if (!PageUptodate(page)) { ret = ext4_mb_init_cache(page, e4b->bd_bitmap, gfp); @@ -1557,8 +1580,7 @@ err: put_page(page); if (e4b->bd_bitmap_page) put_page(e4b->bd_bitmap_page); - if (e4b->bd_buddy_page) - put_page(e4b->bd_buddy_page); + e4b->bd_buddy = NULL; e4b->bd_bitmap = NULL; return ret; @@ -1721,7 +1743,8 @@ static void mb_buddy_mark_free(struct ext4_buddy *e4b, int first, int last) break; order++; - if (first == last || !(buddy2 = mb_find_buddy(e4b, order, &max))) { + buddy2 = mb_find_buddy(e4b, order, &max); + if (!buddy2) { mb_clear_bits(buddy, first, last - first + 1); e4b->bd_info->bb_counters[order - 1] += last - first + 1; break; @@ -2021,8 +2044,6 @@ static void ext4_mb_check_limits(struct ext4_allocation_context *ac, struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); struct ext4_free_extent *bex = &ac->ac_b_ex; struct ext4_free_extent *gex = &ac->ac_g_ex; - struct ext4_free_extent ex; - int max; if (ac->ac_status == AC_STATUS_FOUND) return; @@ -2041,17 +2062,8 @@ static void ext4_mb_check_limits(struct ext4_allocation_context *ac, if (bex->fe_len < gex->fe_len) return; - if ((finish_group || ac->ac_found > sbi->s_mb_min_to_scan) - && bex->fe_group == e4b->bd_group) { - /* recheck chunk's availability - we don't know - * when it was found (within this lock-unlock - * period or not) */ - max = mb_find_extent(e4b, bex->fe_start, gex->fe_len, &ex); - if (max >= gex->fe_len) { - ext4_mb_use_best_found(ac, e4b); - return; - } - } + if (finish_group) + ext4_mb_use_best_found(ac, e4b); } /* @@ -2124,7 +2136,7 @@ static void ext4_mb_measure_extent(struct ext4_allocation_context *ac, } static noinline_for_stack -int ext4_mb_try_best_found(struct ext4_allocation_context *ac, +void ext4_mb_try_best_found(struct ext4_allocation_context *ac, struct ext4_buddy *e4b) { struct ext4_free_extent ex = ac->ac_b_ex; @@ -2135,7 +2147,7 @@ int ext4_mb_try_best_found(struct ext4_allocation_context *ac, BUG_ON(ex.fe_len <= 0); err = ext4_mb_load_buddy(ac->ac_sb, group, e4b); if (err) - return err; + return; ext4_lock_group(ac->ac_sb, group); max = mb_find_extent(e4b, ex.fe_start, ex.fe_len, &ex); @@ -2147,8 +2159,6 @@ int ext4_mb_try_best_found(struct ext4_allocation_context *ac, ext4_unlock_group(ac->ac_sb, group); ext4_mb_unload_buddy(e4b); - - return 0; } static noinline_for_stack @@ -2162,7 +2172,9 @@ int ext4_mb_find_by_goal(struct ext4_allocation_context *ac, struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group); struct ext4_free_extent ex; - if (!(ac->ac_flags & EXT4_MB_HINT_TRY_GOAL)) + if (!grp) + return -EFSCORRUPTED; + if (!(ac->ac_flags & (EXT4_MB_HINT_TRY_GOAL | EXT4_MB_HINT_GOAL_ONLY))) return 0; if (grp->bb_free == 0) return 0; @@ -2236,7 +2248,9 @@ void ext4_mb_simple_scan_group(struct ext4_allocation_context *ac, continue; buddy = mb_find_buddy(e4b, i, &max); - BUG_ON(buddy == NULL); + if (WARN_RATELIMIT(buddy == NULL, + "ext4: mb_simple_scan_group: mb_find_buddy failed, (%d)\n", i)) + continue; k = mb_find_next_zero_bit(buddy, max, 0); if (k >= max) { @@ -2386,7 +2400,7 @@ static bool ext4_mb_good_group(struct ext4_allocation_context *ac, BUG_ON(cr < 0 || cr >= 4); - if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp))) + if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp) || !grp)) return false; free = grp->bb_free; @@ -2455,6 +2469,8 @@ static int ext4_mb_good_group_nolock(struct ext4_allocation_context *ac, ext4_grpblk_t free; int ret = 0; + if (!grp) + return -EFSCORRUPTED; if (sbi->s_mb_stats) atomic64_inc(&sbi->s_bal_cX_groups_considered[ac->ac_criteria]); if (should_lock) { @@ -2535,7 +2551,7 @@ ext4_group_t ext4_mb_prefetch(struct super_block *sb, ext4_group_t group, * prefetch once, so we avoid getblk() call, which can * be expensive. */ - if (!EXT4_MB_GRP_TEST_AND_SET_READ(grp) && + if (gdp && grp && !EXT4_MB_GRP_TEST_AND_SET_READ(grp) && EXT4_MB_GRP_NEED_INIT(grp) && ext4_free_group_clusters(sb, gdp) > 0 && !(ext4_has_group_desc_csum(sb) && @@ -2569,17 +2585,17 @@ ext4_group_t ext4_mb_prefetch(struct super_block *sb, ext4_group_t group, void ext4_mb_prefetch_fini(struct super_block *sb, ext4_group_t group, unsigned int nr) { - while (nr-- > 0) { - struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, - NULL); - struct ext4_group_info *grp = ext4_get_group_info(sb, group); + struct ext4_group_desc *gdp; + struct ext4_group_info *grp; + while (nr-- > 0) { if (!group) group = ext4_get_groups_count(sb); group--; + gdp = ext4_get_group_desc(sb, group, NULL); grp = ext4_get_group_info(sb, group); - if (EXT4_MB_GRP_NEED_INIT(grp) && + if (grp && gdp && EXT4_MB_GRP_NEED_INIT(grp) && ext4_free_group_clusters(sb, gdp) > 0 && !(ext4_has_group_desc_csum(sb) && (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)))) { @@ -2838,6 +2854,8 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v) sizeof(struct ext4_group_info); grinfo = ext4_get_group_info(sb, group); + if (!grinfo) + return 0; /* Load the group info in memory only if not already loaded. */ if (unlikely(EXT4_MB_GRP_NEED_INIT(grinfo))) { err = ext4_mb_load_buddy(sb, group, &e4b); @@ -2848,7 +2866,7 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v) buddy_loaded = 1; } - memcpy(&sg, ext4_get_group_info(sb, group), i); + memcpy(&sg, grinfo, i); if (buddy_loaded) ext4_mb_unload_buddy(&e4b); @@ -3084,7 +3102,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, if (meta_group_info == NULL) { ext4_msg(sb, KERN_ERR, "can't allocate mem " "for a buddy group"); - goto exit_meta_group_info; + return -ENOMEM; } rcu_read_lock(); rcu_dereference(sbi->s_group_info)[idx] = meta_group_info; @@ -3138,7 +3156,6 @@ exit_group_info: group_info[idx] = NULL; rcu_read_unlock(); } -exit_meta_group_info: return -ENOMEM; } /* ext4_mb_add_groupinfo */ @@ -3210,8 +3227,12 @@ static int ext4_mb_init_backend(struct super_block *sb) err_freebuddy: cachep = get_groupinfo_cache(sb->s_blocksize_bits); - while (i-- > 0) - kmem_cache_free(cachep, ext4_get_group_info(sb, i)); + while (i-- > 0) { + struct ext4_group_info *grp = ext4_get_group_info(sb, i); + + if (grp) + kmem_cache_free(cachep, grp); + } i = sbi->s_group_info_size; rcu_read_lock(); group_info = rcu_dereference(sbi->s_group_info); @@ -3419,7 +3440,6 @@ int ext4_mb_init(struct super_block *sb) sbi->s_mb_stats = MB_DEFAULT_STATS; sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD; sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS; - sbi->s_mb_max_inode_prealloc = MB_DEFAULT_MAX_INODE_PREALLOC; /* * The default group preallocation is 512, which for 4k block * sizes translates to 2 megabytes. However for bigalloc file @@ -3525,6 +3545,8 @@ int ext4_mb_release(struct super_block *sb) for (i = 0; i < ngroups; i++) { cond_resched(); grinfo = ext4_get_group_info(sb, i); + if (!grinfo) + continue; mb_group_bb_bitmap_free(grinfo); ext4_lock_group(sb, i); count = ext4_mb_cleanup_pa(grinfo); @@ -3606,7 +3628,7 @@ static void ext4_free_data_in_buddy(struct super_block *sb, { struct ext4_buddy e4b; struct ext4_group_info *db; - int err, count = 0, count2 = 0; + int err, count = 0; mb_debug(sb, "gonna free %u blocks in group %u (0x%p):", entry->efd_count, entry->efd_group, entry); @@ -3622,7 +3644,6 @@ static void ext4_free_data_in_buddy(struct super_block *sb, db = e4b.bd_info; /* there are blocks to put in buddy to make them really free */ count += entry->efd_count; - count2++; ext4_lock_group(sb, entry->efd_group); /* Take it out of per group rb tree */ rb_erase(&entry->efd_node, &(db->bb_free_root)); @@ -3647,8 +3668,7 @@ static void ext4_free_data_in_buddy(struct super_block *sb, ext4_unlock_group(sb, entry->efd_group); ext4_mb_unload_buddy(&e4b); - mb_debug(sb, "freed %d blocks in %d structures\n", count, - count2); + mb_debug(sb, "freed %d blocks in 1 structures\n", count); } /* @@ -3757,9 +3777,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, bitmap_bh = ext4_read_block_bitmap(sb, ac->ac_b_ex.fe_group); if (IS_ERR(bitmap_bh)) { - err = PTR_ERR(bitmap_bh); - bitmap_bh = NULL; - goto out_err; + return PTR_ERR(bitmap_bh); } BUFFER_TRACE(bitmap_bh, "getting write access"); @@ -3822,7 +3840,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, } len = ext4_free_group_clusters(sb, gdp) - ac->ac_b_ex.fe_len; ext4_free_group_clusters_set(sb, gdp, len); - ext4_block_bitmap_csum_set(sb, ac->ac_b_ex.fe_group, gdp, bitmap_bh); + ext4_block_bitmap_csum_set(sb, gdp, bitmap_bh); ext4_group_desc_csum_set(sb, ac->ac_b_ex.fe_group, gdp); ext4_unlock_group(sb, ac->ac_b_ex.fe_group); @@ -3929,7 +3947,7 @@ void ext4_mb_mark_bb(struct super_block *sb, ext4_fsblk_t block, clen = ext4_free_group_clusters(sb, gdp) + clen_changed; ext4_free_group_clusters_set(sb, gdp, clen); - ext4_block_bitmap_csum_set(sb, group, gdp, bitmap_bh); + ext4_block_bitmap_csum_set(sb, gdp, bitmap_bh); ext4_group_desc_csum_set(sb, group, gdp); ext4_unlock_group(sb, group); @@ -3985,6 +4003,197 @@ static void ext4_mb_normalize_group_request(struct ext4_allocation_context *ac) } /* + * This function returns the next element to look at during inode + * PA rbtree walk. We assume that we have held the inode PA rbtree lock + * (ei->i_prealloc_lock) + * + * new_start The start of the range we want to compare + * cur_start The existing start that we are comparing against + * node The node of the rb_tree + */ +static inline struct rb_node* +ext4_mb_pa_rb_next_iter(ext4_lblk_t new_start, ext4_lblk_t cur_start, struct rb_node *node) +{ + if (new_start < cur_start) + return node->rb_left; + else + return node->rb_right; +} + +static inline void +ext4_mb_pa_assert_overlap(struct ext4_allocation_context *ac, + ext4_lblk_t start, ext4_lblk_t end) +{ + struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); + struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); + struct ext4_prealloc_space *tmp_pa; + ext4_lblk_t tmp_pa_start, tmp_pa_end; + struct rb_node *iter; + + read_lock(&ei->i_prealloc_lock); + for (iter = ei->i_prealloc_node.rb_node; iter; + iter = ext4_mb_pa_rb_next_iter(start, tmp_pa_start, iter)) { + tmp_pa = rb_entry(iter, struct ext4_prealloc_space, + pa_node.inode_node); + tmp_pa_start = tmp_pa->pa_lstart; + tmp_pa_end = tmp_pa->pa_lstart + EXT4_C2B(sbi, tmp_pa->pa_len); + + spin_lock(&tmp_pa->pa_lock); + if (tmp_pa->pa_deleted == 0) + BUG_ON(!(start >= tmp_pa_end || end <= tmp_pa_start)); + spin_unlock(&tmp_pa->pa_lock); + } + read_unlock(&ei->i_prealloc_lock); +} + +/* + * Given an allocation context "ac" and a range "start", "end", check + * and adjust boundaries if the range overlaps with any of the existing + * preallocatoins stored in the corresponding inode of the allocation context. + * + * Parameters: + * ac allocation context + * start start of the new range + * end end of the new range + */ +static inline void +ext4_mb_pa_adjust_overlap(struct ext4_allocation_context *ac, + ext4_lblk_t *start, ext4_lblk_t *end) +{ + struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); + struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); + struct ext4_prealloc_space *tmp_pa = NULL, *left_pa = NULL, *right_pa = NULL; + struct rb_node *iter; + ext4_lblk_t new_start, new_end; + ext4_lblk_t tmp_pa_start, tmp_pa_end, left_pa_end = -1, right_pa_start = -1; + + new_start = *start; + new_end = *end; + + /* + * Adjust the normalized range so that it doesn't overlap with any + * existing preallocated blocks(PAs). Make sure to hold the rbtree lock + * so it doesn't change underneath us. + */ + read_lock(&ei->i_prealloc_lock); + + /* Step 1: find any one immediate neighboring PA of the normalized range */ + for (iter = ei->i_prealloc_node.rb_node; iter; + iter = ext4_mb_pa_rb_next_iter(ac->ac_o_ex.fe_logical, + tmp_pa_start, iter)) { + tmp_pa = rb_entry(iter, struct ext4_prealloc_space, + pa_node.inode_node); + tmp_pa_start = tmp_pa->pa_lstart; + tmp_pa_end = tmp_pa->pa_lstart + EXT4_C2B(sbi, tmp_pa->pa_len); + + /* PA must not overlap original request */ + spin_lock(&tmp_pa->pa_lock); + if (tmp_pa->pa_deleted == 0) + BUG_ON(!(ac->ac_o_ex.fe_logical >= tmp_pa_end || + ac->ac_o_ex.fe_logical < tmp_pa_start)); + spin_unlock(&tmp_pa->pa_lock); + } + + /* + * Step 2: check if the found PA is left or right neighbor and + * get the other neighbor + */ + if (tmp_pa) { + if (tmp_pa->pa_lstart < ac->ac_o_ex.fe_logical) { + struct rb_node *tmp; + + left_pa = tmp_pa; + tmp = rb_next(&left_pa->pa_node.inode_node); + if (tmp) { + right_pa = rb_entry(tmp, + struct ext4_prealloc_space, + pa_node.inode_node); + } + } else { + struct rb_node *tmp; + + right_pa = tmp_pa; + tmp = rb_prev(&right_pa->pa_node.inode_node); + if (tmp) { + left_pa = rb_entry(tmp, + struct ext4_prealloc_space, + pa_node.inode_node); + } + } + } + + /* Step 3: get the non deleted neighbors */ + if (left_pa) { + for (iter = &left_pa->pa_node.inode_node;; + iter = rb_prev(iter)) { + if (!iter) { + left_pa = NULL; + break; + } + + tmp_pa = rb_entry(iter, struct ext4_prealloc_space, + pa_node.inode_node); + left_pa = tmp_pa; + spin_lock(&tmp_pa->pa_lock); + if (tmp_pa->pa_deleted == 0) { + spin_unlock(&tmp_pa->pa_lock); + break; + } + spin_unlock(&tmp_pa->pa_lock); + } + } + + if (right_pa) { + for (iter = &right_pa->pa_node.inode_node;; + iter = rb_next(iter)) { + if (!iter) { + right_pa = NULL; + break; + } + + tmp_pa = rb_entry(iter, struct ext4_prealloc_space, + pa_node.inode_node); + right_pa = tmp_pa; + spin_lock(&tmp_pa->pa_lock); + if (tmp_pa->pa_deleted == 0) { + spin_unlock(&tmp_pa->pa_lock); + break; + } + spin_unlock(&tmp_pa->pa_lock); + } + } + + if (left_pa) { + left_pa_end = + left_pa->pa_lstart + EXT4_C2B(sbi, left_pa->pa_len); + BUG_ON(left_pa_end > ac->ac_o_ex.fe_logical); + } + + if (right_pa) { + right_pa_start = right_pa->pa_lstart; + BUG_ON(right_pa_start <= ac->ac_o_ex.fe_logical); + } + + /* Step 4: trim our normalized range to not overlap with the neighbors */ + if (left_pa) { + if (left_pa_end > new_start) + new_start = left_pa_end; + } + + if (right_pa) { + if (right_pa_start < new_end) + new_end = right_pa_start; + } + read_unlock(&ei->i_prealloc_lock); + + /* XXX: extra loop to check we really don't overlap preallocations */ + ext4_mb_pa_assert_overlap(ac, new_start, new_end); + + *start = new_start; + *end = new_end; +} + +/* * Normalization means making request better in terms of * size and alignment */ @@ -3993,13 +4202,12 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac, struct ext4_allocation_request *ar) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); + struct ext4_super_block *es = sbi->s_es; int bsbits, max; ext4_lblk_t end; loff_t size, start_off; loff_t orig_size __maybe_unused; ext4_lblk_t start; - struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); - struct ext4_prealloc_space *pa; /* do normalize only data requests, metadata requests do not need preallocation */ @@ -4068,7 +4276,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac, size = 8 * 1024 * 1024; } else { start_off = (loff_t) ac->ac_o_ex.fe_logical << bsbits; - size = (loff_t) EXT4_C2B(EXT4_SB(ac->ac_sb), + size = (loff_t) EXT4_C2B(sbi, ac->ac_o_ex.fe_len) << bsbits; } size = size >> bsbits; @@ -4100,61 +4308,10 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac, end = start + size; - /* check we don't cross already preallocated blocks */ - rcu_read_lock(); - list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) { - ext4_lblk_t pa_end; - - if (pa->pa_deleted) - continue; - spin_lock(&pa->pa_lock); - if (pa->pa_deleted) { - spin_unlock(&pa->pa_lock); - continue; - } - - pa_end = pa->pa_lstart + EXT4_C2B(EXT4_SB(ac->ac_sb), - pa->pa_len); - - /* PA must not overlap original request */ - BUG_ON(!(ac->ac_o_ex.fe_logical >= pa_end || - ac->ac_o_ex.fe_logical < pa->pa_lstart)); + ext4_mb_pa_adjust_overlap(ac, &start, &end); - /* skip PAs this normalized request doesn't overlap with */ - if (pa->pa_lstart >= end || pa_end <= start) { - spin_unlock(&pa->pa_lock); - continue; - } - BUG_ON(pa->pa_lstart <= start && pa_end >= end); - - /* adjust start or end to be adjacent to this pa */ - if (pa_end <= ac->ac_o_ex.fe_logical) { - BUG_ON(pa_end < start); - start = pa_end; - } else if (pa->pa_lstart > ac->ac_o_ex.fe_logical) { - BUG_ON(pa->pa_lstart > end); - end = pa->pa_lstart; - } - spin_unlock(&pa->pa_lock); - } - rcu_read_unlock(); size = end - start; - /* XXX: extra loop to check we really don't overlap preallocations */ - rcu_read_lock(); - list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) { - ext4_lblk_t pa_end; - - spin_lock(&pa->pa_lock); - if (pa->pa_deleted == 0) { - pa_end = pa->pa_lstart + EXT4_C2B(EXT4_SB(ac->ac_sb), - pa->pa_len); - BUG_ON(!(start >= pa_end || end <= pa->pa_lstart)); - } - spin_unlock(&pa->pa_lock); - } - rcu_read_unlock(); - /* * In this function "start" and "size" are normalized for better * alignment and length such that we could preallocate more blocks. @@ -4165,7 +4322,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac, * provide gurantee on number of contiguous blocks allocation since that * depends upon free space left, etc). * In case of inode pa, later we use the allocated blocks - * [pa_start + fe_logical - pa_lstart, fe_len/size] from the preallocated + * [pa_pstart + fe_logical - pa_lstart, fe_len/size] from the preallocated * range of goal/best blocks [start, size] to put it at the * ac_o_ex.fe_logical extent of this inode. * (See ext4_mb_use_inode_pa() for more details) @@ -4188,18 +4345,21 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac, ac->ac_g_ex.fe_len = EXT4_NUM_B2C(sbi, size); /* define goal start in order to merge */ - if (ar->pright && (ar->lright == (start + size))) { + if (ar->pright && (ar->lright == (start + size)) && + ar->pright >= size && + ar->pright - size >= le32_to_cpu(es->s_first_data_block)) { /* merge to the right */ ext4_get_group_no_and_offset(ac->ac_sb, ar->pright - size, - &ac->ac_f_ex.fe_group, - &ac->ac_f_ex.fe_start); + &ac->ac_g_ex.fe_group, + &ac->ac_g_ex.fe_start); ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL; } - if (ar->pleft && (ar->lleft + 1 == start)) { + if (ar->pleft && (ar->lleft + 1 == start) && + ar->pleft + 1 < ext4_blocks_count(es)) { /* merge to the left */ ext4_get_group_no_and_offset(ac->ac_sb, ar->pleft + 1, - &ac->ac_f_ex.fe_group, - &ac->ac_f_ex.fe_start); + &ac->ac_g_ex.fe_group, + &ac->ac_g_ex.fe_start); ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL; } @@ -4247,15 +4407,14 @@ static void ext4_discard_allocated_blocks(struct ext4_allocation_context *ac) if (ac->ac_f_ex.fe_len == 0) return; err = ext4_mb_load_buddy(ac->ac_sb, ac->ac_f_ex.fe_group, &e4b); - if (err) { + if (WARN_RATELIMIT(err, + "ext4: mb_load_buddy failed (%d)", err)) /* * This should never happen since we pin the * pages in the ext4_allocation_context so * ext4_mb_load_buddy() should never fail. */ - WARN(1, "mb_load_buddy failed (%d)", err); return; - } ext4_lock_group(ac->ac_sb, ac->ac_f_ex.fe_group); mb_free_blocks(ac->ac_inode, &e4b, ac->ac_f_ex.fe_start, ac->ac_f_ex.fe_len); @@ -4263,8 +4422,11 @@ static void ext4_discard_allocated_blocks(struct ext4_allocation_context *ac) ext4_mb_unload_buddy(&e4b); return; } - if (pa->pa_type == MB_INODE_PA) + if (pa->pa_type == MB_INODE_PA) { + spin_lock(&pa->pa_lock); pa->pa_free += ac->ac_b_ex.fe_len; + spin_unlock(&pa->pa_lock); + } } /* @@ -4292,6 +4454,7 @@ static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac, BUG_ON(start < pa->pa_pstart); BUG_ON(end > pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len)); BUG_ON(pa->pa_free < len); + BUG_ON(ac->ac_b_ex.fe_len <= 0); pa->pa_free -= len; mb_debug(ac->ac_sb, "use %llu/%d from inode pa %p\n", start, len, pa); @@ -4312,14 +4475,14 @@ static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac, ac->ac_status = AC_STATUS_FOUND; ac->ac_pa = pa; - /* we don't correct pa_pstart or pa_plen here to avoid + /* we don't correct pa_pstart or pa_len here to avoid * possible race when the group is being loaded concurrently * instead we correct pa later, after blocks are marked * in on-disk bitmap -- see ext4_mb_release_context() * Other CPUs are prevented from allocating from this pa by lg_mutex */ mb_debug(ac->ac_sb, "use %u/%u from group pa %p\n", - pa->pa_lstart-len, len, pa); + pa->pa_lstart, len, pa); } /* @@ -4361,7 +4524,9 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac) int order, i; struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); struct ext4_locality_group *lg; - struct ext4_prealloc_space *pa, *cpa = NULL; + struct ext4_prealloc_space *tmp_pa, *cpa = NULL; + ext4_lblk_t tmp_pa_start, tmp_pa_end; + struct rb_node *iter; ext4_fsblk_t goal_block; /* only data can be preallocated */ @@ -4369,35 +4534,47 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac) return false; /* first, try per-file preallocation */ - rcu_read_lock(); - list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) { + read_lock(&ei->i_prealloc_lock); + for (iter = ei->i_prealloc_node.rb_node; iter; + iter = ext4_mb_pa_rb_next_iter(ac->ac_o_ex.fe_logical, + tmp_pa_start, iter)) { + tmp_pa = rb_entry(iter, struct ext4_prealloc_space, + pa_node.inode_node); /* all fields in this condition don't change, * so we can skip locking for them */ - if (ac->ac_o_ex.fe_logical < pa->pa_lstart || - ac->ac_o_ex.fe_logical >= (pa->pa_lstart + - EXT4_C2B(sbi, pa->pa_len))) + tmp_pa_start = tmp_pa->pa_lstart; + tmp_pa_end = tmp_pa->pa_lstart + EXT4_C2B(sbi, tmp_pa->pa_len); + + /* original request start doesn't lie in this PA */ + if (ac->ac_o_ex.fe_logical < tmp_pa_start || + ac->ac_o_ex.fe_logical >= tmp_pa_end) continue; /* non-extent files can't have physical blocks past 2^32 */ if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) && - (pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len) > - EXT4_MAX_BLOCK_FILE_PHYS)) - continue; + (tmp_pa->pa_pstart + EXT4_C2B(sbi, tmp_pa->pa_len) > + EXT4_MAX_BLOCK_FILE_PHYS)) { + /* + * Since PAs don't overlap, we won't find any + * other PA to satisfy this. + */ + break; + } /* found preallocated blocks, use them */ - spin_lock(&pa->pa_lock); - if (pa->pa_deleted == 0 && pa->pa_free) { - atomic_inc(&pa->pa_count); - ext4_mb_use_inode_pa(ac, pa); - spin_unlock(&pa->pa_lock); + spin_lock(&tmp_pa->pa_lock); + if (tmp_pa->pa_deleted == 0 && tmp_pa->pa_free) { + atomic_inc(&tmp_pa->pa_count); + ext4_mb_use_inode_pa(ac, tmp_pa); + spin_unlock(&tmp_pa->pa_lock); ac->ac_criteria = 10; - rcu_read_unlock(); + read_unlock(&ei->i_prealloc_lock); return true; } - spin_unlock(&pa->pa_lock); + spin_unlock(&tmp_pa->pa_lock); } - rcu_read_unlock(); + read_unlock(&ei->i_prealloc_lock); /* can we use group allocation? */ if (!(ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC)) @@ -4419,16 +4596,16 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac) */ for (i = order; i < PREALLOC_TB_SIZE; i++) { rcu_read_lock(); - list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[i], - pa_inode_list) { - spin_lock(&pa->pa_lock); - if (pa->pa_deleted == 0 && - pa->pa_free >= ac->ac_o_ex.fe_len) { + list_for_each_entry_rcu(tmp_pa, &lg->lg_prealloc_list[i], + pa_node.lg_list) { + spin_lock(&tmp_pa->pa_lock); + if (tmp_pa->pa_deleted == 0 && + tmp_pa->pa_free >= ac->ac_o_ex.fe_len) { cpa = ext4_mb_check_group_pa(goal_block, - pa, cpa); + tmp_pa, cpa); } - spin_unlock(&pa->pa_lock); + spin_unlock(&tmp_pa->pa_lock); } rcu_read_unlock(); } @@ -4454,6 +4631,8 @@ static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, struct ext4_free_data *entry; grp = ext4_get_group_info(sb, group); + if (!grp) + return; n = rb_first(&(grp->bb_free_root)); while (n) { @@ -4481,6 +4660,9 @@ void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, int preallocated = 0; int len; + if (!grp) + return; + /* all form of preallocation discards first load group, * so the only competing code is preallocation use. * we don't need any locking here @@ -4525,16 +4707,22 @@ static void ext4_mb_mark_pa_deleted(struct super_block *sb, } } -static void ext4_mb_pa_callback(struct rcu_head *head) +static inline void ext4_mb_pa_free(struct ext4_prealloc_space *pa) { - struct ext4_prealloc_space *pa; - pa = container_of(head, struct ext4_prealloc_space, u.pa_rcu); - + BUG_ON(!pa); BUG_ON(atomic_read(&pa->pa_count)); BUG_ON(pa->pa_deleted == 0); kmem_cache_free(ext4_pspace_cachep, pa); } +static void ext4_mb_pa_callback(struct rcu_head *head) +{ + struct ext4_prealloc_space *pa; + + pa = container_of(head, struct ext4_prealloc_space, u.pa_rcu); + ext4_mb_pa_free(pa); +} + /* * drops a reference to preallocated space descriptor * if this was the last reference and the space is consumed @@ -4544,6 +4732,7 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac, { ext4_group_t grp; ext4_fsblk_t grp_blk; + struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); /* in this short window concurrent discard can set pa_deleted */ spin_lock(&pa->pa_lock); @@ -4588,11 +4777,42 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac, list_del(&pa->pa_group_list); ext4_unlock_group(sb, grp); - spin_lock(pa->pa_obj_lock); - list_del_rcu(&pa->pa_inode_list); - spin_unlock(pa->pa_obj_lock); + if (pa->pa_type == MB_INODE_PA) { + write_lock(pa->pa_node_lock.inode_lock); + rb_erase(&pa->pa_node.inode_node, &ei->i_prealloc_node); + write_unlock(pa->pa_node_lock.inode_lock); + ext4_mb_pa_free(pa); + } else { + spin_lock(pa->pa_node_lock.lg_lock); + list_del_rcu(&pa->pa_node.lg_list); + spin_unlock(pa->pa_node_lock.lg_lock); + call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); + } +} + +static void ext4_mb_pa_rb_insert(struct rb_root *root, struct rb_node *new) +{ + struct rb_node **iter = &root->rb_node, *parent = NULL; + struct ext4_prealloc_space *iter_pa, *new_pa; + ext4_lblk_t iter_start, new_start; + + while (*iter) { + iter_pa = rb_entry(*iter, struct ext4_prealloc_space, + pa_node.inode_node); + new_pa = rb_entry(new, struct ext4_prealloc_space, + pa_node.inode_node); + iter_start = iter_pa->pa_lstart; + new_start = new_pa->pa_lstart; + + parent = *iter; + if (new_start < iter_start) + iter = &((*iter)->rb_left); + else + iter = &((*iter)->rb_right); + } - call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); + rb_link_node(new, parent, iter); + rb_insert_color(new, root); } /* @@ -4616,10 +4836,8 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac) pa = ac->ac_pa; if (ac->ac_b_ex.fe_len < ac->ac_g_ex.fe_len) { - int winl; - int wins; - int win; - int offs; + int new_bex_start; + int new_bex_end; /* we can't allocate as much as normalizer wants. * so, found space must get proper lstart @@ -4627,38 +4845,47 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac) BUG_ON(ac->ac_g_ex.fe_logical > ac->ac_o_ex.fe_logical); BUG_ON(ac->ac_g_ex.fe_len < ac->ac_o_ex.fe_len); - /* we're limited by original request in that - * logical block must be covered any way - * winl is window we can move our chunk within */ - winl = ac->ac_o_ex.fe_logical - ac->ac_g_ex.fe_logical; + /* + * Use the below logic for adjusting best extent as it keeps + * fragmentation in check while ensuring logical range of best + * extent doesn't overflow out of goal extent: + * + * 1. Check if best ex can be kept at end of goal and still + * cover original start + * 2. Else, check if best ex can be kept at start of goal and + * still cover original start + * 3. Else, keep the best ex at start of original request. + */ + new_bex_end = ac->ac_g_ex.fe_logical + + EXT4_C2B(sbi, ac->ac_g_ex.fe_len); + new_bex_start = new_bex_end - EXT4_C2B(sbi, ac->ac_b_ex.fe_len); + if (ac->ac_o_ex.fe_logical >= new_bex_start) + goto adjust_bex; - /* also, we should cover whole original request */ - wins = EXT4_C2B(sbi, ac->ac_b_ex.fe_len - ac->ac_o_ex.fe_len); + new_bex_start = ac->ac_g_ex.fe_logical; + new_bex_end = + new_bex_start + EXT4_C2B(sbi, ac->ac_b_ex.fe_len); + if (ac->ac_o_ex.fe_logical < new_bex_end) + goto adjust_bex; - /* the smallest one defines real window */ - win = min(winl, wins); + new_bex_start = ac->ac_o_ex.fe_logical; + new_bex_end = + new_bex_start + EXT4_C2B(sbi, ac->ac_b_ex.fe_len); - offs = ac->ac_o_ex.fe_logical % - EXT4_C2B(sbi, ac->ac_b_ex.fe_len); - if (offs && offs < win) - win = offs; +adjust_bex: + ac->ac_b_ex.fe_logical = new_bex_start; - ac->ac_b_ex.fe_logical = ac->ac_o_ex.fe_logical - - EXT4_NUM_B2C(sbi, win); BUG_ON(ac->ac_o_ex.fe_logical < ac->ac_b_ex.fe_logical); BUG_ON(ac->ac_o_ex.fe_len > ac->ac_b_ex.fe_len); + BUG_ON(new_bex_end > (ac->ac_g_ex.fe_logical + + EXT4_C2B(sbi, ac->ac_g_ex.fe_len))); } - /* preallocation can change ac_b_ex, thus we store actually - * allocated blocks for history */ - ac->ac_f_ex = ac->ac_b_ex; - pa->pa_lstart = ac->ac_b_ex.fe_logical; pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); pa->pa_len = ac->ac_b_ex.fe_len; pa->pa_free = pa->pa_len; spin_lock_init(&pa->pa_lock); - INIT_LIST_HEAD(&pa->pa_inode_list); INIT_LIST_HEAD(&pa->pa_group_list); pa->pa_deleted = 0; pa->pa_type = MB_INODE_PA; @@ -4667,20 +4894,22 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac) pa->pa_len, pa->pa_lstart); trace_ext4_mb_new_inode_pa(ac, pa); - ext4_mb_use_inode_pa(ac, pa); atomic_add(pa->pa_free, &sbi->s_mb_preallocated); + ext4_mb_use_inode_pa(ac, pa); ei = EXT4_I(ac->ac_inode); grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group); + if (!grp) + return; - pa->pa_obj_lock = &ei->i_prealloc_lock; + pa->pa_node_lock.inode_lock = &ei->i_prealloc_lock; pa->pa_inode = ac->ac_inode; list_add(&pa->pa_group_list, &grp->bb_prealloc_list); - spin_lock(pa->pa_obj_lock); - list_add_rcu(&pa->pa_inode_list, &ei->i_prealloc_list); - spin_unlock(pa->pa_obj_lock); + write_lock(pa->pa_node_lock.inode_lock); + ext4_mb_pa_rb_insert(&ei->i_prealloc_node, &pa->pa_node.inode_node); + write_unlock(pa->pa_node_lock.inode_lock); atomic_inc(&ei->i_prealloc_active); } @@ -4703,16 +4932,12 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac) pa = ac->ac_pa; - /* preallocation can change ac_b_ex, thus we store actually - * allocated blocks for history */ - ac->ac_f_ex = ac->ac_b_ex; - pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); pa->pa_lstart = pa->pa_pstart; pa->pa_len = ac->ac_b_ex.fe_len; pa->pa_free = pa->pa_len; spin_lock_init(&pa->pa_lock); - INIT_LIST_HEAD(&pa->pa_inode_list); + INIT_LIST_HEAD(&pa->pa_node.lg_list); INIT_LIST_HEAD(&pa->pa_group_list); pa->pa_deleted = 0; pa->pa_type = MB_GROUP_PA; @@ -4725,10 +4950,12 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac) atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated); grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group); + if (!grp) + return; lg = ac->ac_lg; BUG_ON(lg == NULL); - pa->pa_obj_lock = &lg->lg_prealloc_lock; + pa->pa_node_lock.lg_lock = &lg->lg_prealloc_lock; pa->pa_inode = NULL; list_add(&pa->pa_group_list, &grp->bb_prealloc_list); @@ -4820,7 +5047,11 @@ ext4_mb_release_group_pa(struct ext4_buddy *e4b, trace_ext4_mb_release_group_pa(sb, pa); BUG_ON(pa->pa_deleted == 0); ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit); - BUG_ON(group != e4b->bd_group && pa->pa_len != 0); + if (unlikely(group != e4b->bd_group && pa->pa_len != 0)) { + ext4_warning(sb, "bad group: expected %u, group %u, pa_start %llu", + e4b->bd_group, group, pa->pa_pstart); + return 0; + } mb_free_blocks(pa->pa_inode, e4b, bit, pa->pa_len); atomic_add(pa->pa_len, &EXT4_SB(sb)->s_mb_discarded); trace_ext4_mballoc_discard(sb, NULL, group, bit, pa->pa_len); @@ -4846,9 +5077,12 @@ ext4_mb_discard_group_preallocations(struct super_block *sb, struct ext4_prealloc_space *pa, *tmp; struct list_head list; struct ext4_buddy e4b; + struct ext4_inode_info *ei; int err; int free = 0; + if (!grp) + return 0; mb_debug(sb, "discard preallocation for group %u\n", group); if (list_empty(&grp->bb_prealloc_list)) goto out_dbg; @@ -4904,17 +5138,26 @@ ext4_mb_discard_group_preallocations(struct super_block *sb, list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) { /* remove from object (inode or locality group) */ - spin_lock(pa->pa_obj_lock); - list_del_rcu(&pa->pa_inode_list); - spin_unlock(pa->pa_obj_lock); + if (pa->pa_type == MB_GROUP_PA) { + spin_lock(pa->pa_node_lock.lg_lock); + list_del_rcu(&pa->pa_node.lg_list); + spin_unlock(pa->pa_node_lock.lg_lock); + } else { + write_lock(pa->pa_node_lock.inode_lock); + ei = EXT4_I(pa->pa_inode); + rb_erase(&pa->pa_node.inode_node, &ei->i_prealloc_node); + write_unlock(pa->pa_node_lock.inode_lock); + } - if (pa->pa_type == MB_GROUP_PA) + list_del(&pa->u.pa_tmp_list); + + if (pa->pa_type == MB_GROUP_PA) { ext4_mb_release_group_pa(&e4b, pa); - else + call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); + } else { ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa); - - list_del(&pa->u.pa_tmp_list); - call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); + ext4_mb_pa_free(pa); + } } ext4_unlock_group(sb, group); @@ -4944,10 +5187,10 @@ void ext4_discard_preallocations(struct inode *inode, unsigned int needed) ext4_group_t group = 0; struct list_head list; struct ext4_buddy e4b; + struct rb_node *iter; int err; if (!S_ISREG(inode->i_mode)) { - /*BUG_ON(!list_empty(&ei->i_prealloc_list));*/ return; } @@ -4966,17 +5209,19 @@ void ext4_discard_preallocations(struct inode *inode, unsigned int needed) repeat: /* first, collect all pa's in the inode */ - spin_lock(&ei->i_prealloc_lock); - while (!list_empty(&ei->i_prealloc_list) && needed) { - pa = list_entry(ei->i_prealloc_list.prev, - struct ext4_prealloc_space, pa_inode_list); - BUG_ON(pa->pa_obj_lock != &ei->i_prealloc_lock); + write_lock(&ei->i_prealloc_lock); + for (iter = rb_first(&ei->i_prealloc_node); iter && needed; + iter = rb_next(iter)) { + pa = rb_entry(iter, struct ext4_prealloc_space, + pa_node.inode_node); + BUG_ON(pa->pa_node_lock.inode_lock != &ei->i_prealloc_lock); + spin_lock(&pa->pa_lock); if (atomic_read(&pa->pa_count)) { /* this shouldn't happen often - nobody should * use preallocation while we're discarding it */ spin_unlock(&pa->pa_lock); - spin_unlock(&ei->i_prealloc_lock); + write_unlock(&ei->i_prealloc_lock); ext4_msg(sb, KERN_ERR, "uh-oh! used pa while discarding"); WARN_ON(1); @@ -4987,7 +5232,7 @@ repeat: if (pa->pa_deleted == 0) { ext4_mb_mark_pa_deleted(sb, pa); spin_unlock(&pa->pa_lock); - list_del_rcu(&pa->pa_inode_list); + rb_erase(&pa->pa_node.inode_node, &ei->i_prealloc_node); list_add(&pa->u.pa_tmp_list, &list); needed--; continue; @@ -4995,7 +5240,7 @@ repeat: /* someone is deleting pa right now */ spin_unlock(&pa->pa_lock); - spin_unlock(&ei->i_prealloc_lock); + write_unlock(&ei->i_prealloc_lock); /* we have to wait here because pa_deleted * doesn't mean pa is already unlinked from @@ -5012,7 +5257,7 @@ repeat: schedule_timeout_uninterruptible(HZ); goto repeat; } - spin_unlock(&ei->i_prealloc_lock); + write_unlock(&ei->i_prealloc_lock); list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) { BUG_ON(pa->pa_type != MB_INODE_PA); @@ -5044,7 +5289,7 @@ repeat: put_bh(bitmap_bh); list_del(&pa->u.pa_tmp_list); - call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); + ext4_mb_pa_free(pa); } } @@ -5061,14 +5306,20 @@ static int ext4_mb_pa_alloc(struct ext4_allocation_context *ac) return 0; } -static void ext4_mb_pa_free(struct ext4_allocation_context *ac) +static void ext4_mb_pa_put_free(struct ext4_allocation_context *ac) { struct ext4_prealloc_space *pa = ac->ac_pa; BUG_ON(!pa); ac->ac_pa = NULL; WARN_ON(!atomic_dec_and_test(&pa->pa_count)); - kmem_cache_free(ext4_pspace_cachep, pa); + /* + * current function is only called due to an error or due to + * len of found blocks < len of requested blocks hence the PA has not + * been added to grp->bb_prealloc_list. So we don't need to lock it + */ + pa->pa_deleted = 1; + ext4_mb_pa_free(pa); } #ifdef CONFIG_EXT4_DEBUG @@ -5086,6 +5337,9 @@ static inline void ext4_mb_show_pa(struct super_block *sb) struct ext4_prealloc_space *pa; ext4_grpblk_t start; struct list_head *cur; + + if (!grp) + continue; ext4_lock_group(sb, i); list_for_each(cur, &grp->bb_prealloc_list) { pa = list_entry(cur, struct ext4_prealloc_space, @@ -5271,7 +5525,7 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb, spin_lock(&lg->lg_prealloc_lock); list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[order], - pa_inode_list, + pa_node.lg_list, lockdep_is_held(&lg->lg_prealloc_lock)) { spin_lock(&pa->pa_lock); if (atomic_read(&pa->pa_count)) { @@ -5294,7 +5548,7 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb, ext4_mb_mark_pa_deleted(sb, pa); spin_unlock(&pa->pa_lock); - list_del_rcu(&pa->pa_inode_list); + list_del_rcu(&pa->pa_node.lg_list); list_add(&pa->u.pa_tmp_list, &discard_list); total_entries--; @@ -5355,7 +5609,7 @@ static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac) /* Add the prealloc space to lg */ spin_lock(&lg->lg_prealloc_lock); list_for_each_entry_rcu(tmp_pa, &lg->lg_prealloc_list[order], - pa_inode_list, + pa_node.lg_list, lockdep_is_held(&lg->lg_prealloc_lock)) { spin_lock(&tmp_pa->pa_lock); if (tmp_pa->pa_deleted) { @@ -5364,8 +5618,8 @@ static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac) } if (!added && pa->pa_free < tmp_pa->pa_free) { /* Add to the tail of the previous entry */ - list_add_tail_rcu(&pa->pa_inode_list, - &tmp_pa->pa_inode_list); + list_add_tail_rcu(&pa->pa_node.lg_list, + &tmp_pa->pa_node.lg_list); added = 1; /* * we want to count the total @@ -5376,7 +5630,7 @@ static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac) lg_prealloc_count++; } if (!added) - list_add_tail_rcu(&pa->pa_inode_list, + list_add_tail_rcu(&pa->pa_node.lg_list, &lg->lg_prealloc_list[order]); spin_unlock(&lg->lg_prealloc_lock); @@ -5390,29 +5644,10 @@ static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac) } /* - * if per-inode prealloc list is too long, trim some PA - */ -static void ext4_mb_trim_inode_pa(struct inode *inode) -{ - struct ext4_inode_info *ei = EXT4_I(inode); - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - int count, delta; - - count = atomic_read(&ei->i_prealloc_active); - delta = (sbi->s_mb_max_inode_prealloc >> 2) + 1; - if (count > sbi->s_mb_max_inode_prealloc + delta) { - count -= sbi->s_mb_max_inode_prealloc; - ext4_discard_preallocations(inode, count); - } -} - -/* * release all resource we used in allocation */ static int ext4_mb_release_context(struct ext4_allocation_context *ac) { - struct inode *inode = ac->ac_inode; - struct ext4_inode_info *ei = EXT4_I(inode); struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); struct ext4_prealloc_space *pa = ac->ac_pa; if (pa) { @@ -5432,23 +5667,13 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac) * doesn't grow big. */ if (likely(pa->pa_free)) { - spin_lock(pa->pa_obj_lock); - list_del_rcu(&pa->pa_inode_list); - spin_unlock(pa->pa_obj_lock); + spin_lock(pa->pa_node_lock.lg_lock); + list_del_rcu(&pa->pa_node.lg_list); + spin_unlock(pa->pa_node_lock.lg_lock); ext4_mb_add_n_trim(ac); } } - if (pa->pa_type == MB_INODE_PA) { - /* - * treat per-inode prealloc list as a lru list, then try - * to trim the least recently used PA. - */ - spin_lock(pa->pa_obj_lock); - list_move(&pa->pa_inode_list, &ei->i_prealloc_list); - spin_unlock(pa->pa_obj_lock); - } - ext4_mb_put_pa(ac, ac->ac_sb, pa); } if (ac->ac_bitmap_page) @@ -5458,7 +5683,6 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac) if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC) mutex_unlock(&ac->ac_lg->lg_mutex); ext4_mb_collect_stats(ac); - ext4_mb_trim_inode_pa(inode); return 0; } @@ -5611,13 +5835,13 @@ repeat: * So we have to free this pa here itself. */ if (*errp) { - ext4_mb_pa_free(ac); + ext4_mb_pa_put_free(ac); ext4_discard_allocated_blocks(ac); goto errout; } if (ac->ac_status == AC_STATUS_FOUND && ac->ac_o_ex.fe_len >= ac->ac_f_ex.fe_len) - ext4_mb_pa_free(ac); + ext4_mb_pa_put_free(ac); } if (likely(ac->ac_status == AC_STATUS_FOUND)) { *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_clstrs); @@ -5636,20 +5860,19 @@ repeat: * If block allocation fails then the pa allocated above * needs to be freed here itself. */ - ext4_mb_pa_free(ac); + ext4_mb_pa_put_free(ac); *errp = -ENOSPC; } -errout: if (*errp) { +errout: ac->ac_b_ex.fe_len = 0; ar->len = 0; ext4_mb_show_ac(ac); } ext4_mb_release_context(ac); + kmem_cache_free(ext4_ac_cachep, ac); out: - if (ac) - kmem_cache_free(ext4_ac_cachep, ac); if (inquota && ar->len < inquota) dquot_free_block(ar->inode, EXT4_C2B(sbi, inquota - ar->len)); if (!ar->len) { @@ -5693,7 +5916,7 @@ static void ext4_try_merge_freed_extent(struct ext4_sb_info *sbi, kmem_cache_free(ext4_free_data_cachep, entry); } -static noinline_for_stack int +static noinline_for_stack void ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, struct ext4_free_data *new_entry) { @@ -5736,7 +5959,7 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, EXT4_C2B(sbi, cluster), "Block already on to-be-freed list"); kmem_cache_free(ext4_free_data_cachep, new_entry); - return 0; + return; } } @@ -5762,7 +5985,6 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, list_add_tail(&new_entry->efd_list, &sbi->s_freed_data_list); sbi->s_mb_free_pending += clusters; spin_unlock(&sbi->s_md_lock); - return 0; } /* @@ -5797,9 +6019,6 @@ static ext4_fsblk_t ext4_mb_new_blocks_simple(handle_t *handle, return 0; } - ext4_get_group_no_and_offset(sb, - max(ext4_group_first_block_no(sb, group), goal), - NULL, &blkoff); while (1) { i = mb_find_next_zero_bit(bitmap_bh->b_data, max, blkoff); @@ -5814,6 +6033,8 @@ static ext4_fsblk_t ext4_mb_new_blocks_simple(handle_t *handle, brelse(bitmap_bh); if (i < max) break; + + blkoff = 0; } if (group >= ext4_get_groups_count(sb) || i >= max) { @@ -5842,13 +6063,12 @@ static void ext4_free_blocks_simple(struct inode *inode, ext4_fsblk_t block, ext4_get_group_no_and_offset(sb, block, &group, &blkoff); bitmap_bh = ext4_read_block_bitmap(sb, group); if (IS_ERR(bitmap_bh)) { - err = PTR_ERR(bitmap_bh); pr_warn("Failed to read block bitmap\n"); return; } gdp = ext4_get_group_desc(sb, group, &gdp_bh); if (!gdp) - return; + goto err_out; for (i = 0; i < count; i++) { if (!mb_test_bit(blkoff + i, bitmap_bh->b_data)) @@ -5857,15 +6077,17 @@ static void ext4_free_blocks_simple(struct inode *inode, ext4_fsblk_t block, mb_clear_bits(bitmap_bh->b_data, blkoff, count); err = ext4_handle_dirty_metadata(NULL, NULL, bitmap_bh); if (err) - return; + goto err_out; ext4_free_group_clusters_set( sb, gdp, ext4_free_group_clusters(sb, gdp) + count - already_freed); - ext4_block_bitmap_csum_set(sb, group, gdp, bitmap_bh); + ext4_block_bitmap_csum_set(sb, gdp, bitmap_bh); ext4_group_desc_csum_set(sb, group, gdp); ext4_handle_dirty_metadata(NULL, NULL, gdp_bh); sync_dirty_buffer(bitmap_bh); sync_dirty_buffer(gdp_bh); + +err_out: brelse(bitmap_bh); } @@ -5885,6 +6107,7 @@ static void ext4_mb_clear_bb(handle_t *handle, struct inode *inode, struct buffer_head *bitmap_bh = NULL; struct super_block *sb = inode->i_sb; struct ext4_group_desc *gdp; + struct ext4_group_info *grp; unsigned int overflow; ext4_grpblk_t bit; struct buffer_head *gd_bh; @@ -5910,8 +6133,8 @@ do_more: overflow = 0; ext4_get_group_no_and_offset(sb, block, &block_group, &bit); - if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT( - ext4_get_group_info(sb, block_group)))) + grp = ext4_get_group_info(sb, block_group); + if (unlikely(!grp || EXT4_MB_GRP_BBITMAP_CORRUPT(grp))) return; /* @@ -6023,7 +6246,7 @@ do_more: ret = ext4_free_group_clusters(sb, gdp) + count_clusters; ext4_free_group_clusters_set(sb, gdp, ret); - ext4_block_bitmap_csum_set(sb, block_group, gdp, bitmap_bh); + ext4_block_bitmap_csum_set(sb, gdp, bitmap_bh); ext4_group_desc_csum_set(sb, block_group, gdp); ext4_unlock_group(sb, block_group); @@ -6280,7 +6503,7 @@ int ext4_group_add_blocks(handle_t *handle, struct super_block *sb, free_clusters_count = clusters_freed + ext4_free_group_clusters(sb, desc); ext4_free_group_clusters_set(sb, desc, free_clusters_count); - ext4_block_bitmap_csum_set(sb, block_group, desc, bitmap_bh); + ext4_block_bitmap_csum_set(sb, desc, bitmap_bh); ext4_group_desc_csum_set(sb, block_group, desc); ext4_unlock_group(sb, block_group); percpu_counter_add(&sbi->s_freeclusters_counter, @@ -6513,6 +6736,8 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) for (group = first_group; group <= last_group; group++) { grp = ext4_get_group_info(sb, group); + if (!grp) + continue; /* We only do this if the grp has never been initialized */ if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { ret = ext4_mb_init_group(sb, group, GFP_NOFS); diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index dcda2a943cee..6d85ee8674a6 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -74,11 +74,6 @@ #define MB_DEFAULT_GROUP_PREALLOC 512 /* - * maximum length of inode prealloc list - */ -#define MB_DEFAULT_MAX_INODE_PREALLOC 512 - -/* * Number of groups to search linearly before performing group scanning * optimization. */ @@ -114,7 +109,10 @@ struct ext4_free_data { }; struct ext4_prealloc_space { - struct list_head pa_inode_list; + union { + struct rb_node inode_node; /* for inode PA rbtree */ + struct list_head lg_list; /* for lg PAs */ + } pa_node; struct list_head pa_group_list; union { struct list_head pa_tmp_list; @@ -128,8 +126,11 @@ struct ext4_prealloc_space { ext4_grpblk_t pa_len; /* len of preallocated chunk */ ext4_grpblk_t pa_free; /* how many blocks are free */ unsigned short pa_type; /* pa type. inode or group */ - spinlock_t *pa_obj_lock; - struct inode *pa_inode; /* hack, for history only */ + union { + rwlock_t *inode_lock; /* locks the rbtree holding this PA */ + spinlock_t *lg_lock; /* locks the lg list holding this PA */ + } pa_node_lock; + struct inode *pa_inode; /* used to get the inode during group discard */ }; enum { diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index a19a9661646e..d98ac2af8199 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -408,7 +408,6 @@ static int free_ext_block(handle_t *handle, struct inode *inode) int ext4_ext_migrate(struct inode *inode) { - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); handle_t *handle; int retval = 0, i; __le32 *i_data; @@ -418,6 +417,7 @@ int ext4_ext_migrate(struct inode *inode) unsigned long max_entries; __u32 goal, tmp_csum_seed; uid_t owner[2]; + int alloc_ctx; /* * If the filesystem does not support extents, or the inode @@ -434,7 +434,7 @@ int ext4_ext_migrate(struct inode *inode) */ return retval; - percpu_down_write(&sbi->s_writepages_rwsem); + alloc_ctx = ext4_writepages_down_write(inode->i_sb); /* * Worst case we can touch the allocation bitmaps and a block @@ -586,7 +586,7 @@ out_tmp_inode: unlock_new_inode(tmp_inode); iput(tmp_inode); out_unlock: - percpu_up_write(&sbi->s_writepages_rwsem); + ext4_writepages_up_write(inode->i_sb, alloc_ctx); return retval; } @@ -605,6 +605,7 @@ int ext4_ind_migrate(struct inode *inode) ext4_fsblk_t blk; handle_t *handle; int ret, ret2 = 0; + int alloc_ctx; if (!ext4_has_feature_extents(inode->i_sb) || (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) @@ -621,7 +622,7 @@ int ext4_ind_migrate(struct inode *inode) if (test_opt(inode->i_sb, DELALLOC)) ext4_alloc_da_blocks(inode); - percpu_down_write(&sbi->s_writepages_rwsem); + alloc_ctx = ext4_writepages_down_write(inode->i_sb); handle = ext4_journal_start(inode, EXT4_HT_MIGRATE, 1); if (IS_ERR(handle)) { @@ -665,6 +666,6 @@ errout: ext4_journal_stop(handle); up_write(&EXT4_I(inode)->i_data_sem); out_unlock: - percpu_up_write(&sbi->s_writepages_rwsem); + ext4_writepages_up_write(inode->i_sb, alloc_ctx); return ret; } diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c index 4681fff6665f..0aaf38ffcb6e 100644 --- a/fs/ext4/mmp.c +++ b/fs/ext4/mmp.c @@ -39,28 +39,36 @@ static void ext4_mmp_csum_set(struct super_block *sb, struct mmp_struct *mmp) * Write the MMP block using REQ_SYNC to try to get the block on-disk * faster. */ -static int write_mmp_block(struct super_block *sb, struct buffer_head *bh) +static int write_mmp_block_thawed(struct super_block *sb, + struct buffer_head *bh) { struct mmp_struct *mmp = (struct mmp_struct *)(bh->b_data); - /* - * We protect against freezing so that we don't create dirty buffers - * on frozen filesystem. - */ - sb_start_write(sb); ext4_mmp_csum_set(sb, mmp); lock_buffer(bh); bh->b_end_io = end_buffer_write_sync; get_bh(bh); submit_bh(REQ_OP_WRITE | REQ_SYNC | REQ_META | REQ_PRIO, bh); wait_on_buffer(bh); - sb_end_write(sb); if (unlikely(!buffer_uptodate(bh))) return -EIO; - return 0; } +static int write_mmp_block(struct super_block *sb, struct buffer_head *bh) +{ + int err; + + /* + * We protect against freezing so that we don't create dirty buffers + * on frozen filesystem. + */ + sb_start_write(sb); + err = write_mmp_block_thawed(sb, bh); + sb_end_write(sb); + return err; +} + /* * Read the MMP block. It _must_ be read from disk and hence we clear the * uptodate flag on the buffer. @@ -282,6 +290,7 @@ int ext4_multi_mount_protect(struct super_block *sb, if (mmp_block < le32_to_cpu(es->s_first_data_block) || mmp_block >= ext4_blocks_count(es)) { ext4_warning(sb, "Invalid MMP block in superblock"); + retval = -EINVAL; goto failed; } @@ -307,6 +316,7 @@ int ext4_multi_mount_protect(struct super_block *sb, if (seq == EXT4_MMP_SEQ_FSCK) { dump_mmp_msg(sb, mmp, "fsck is running on the filesystem"); + retval = -EBUSY; goto failed; } @@ -320,6 +330,7 @@ int ext4_multi_mount_protect(struct super_block *sb, if (schedule_timeout_interruptible(HZ * wait_time) != 0) { ext4_warning(sb, "MMP startup interrupted, failing mount\n"); + retval = -ETIMEDOUT; goto failed; } @@ -330,6 +341,7 @@ int ext4_multi_mount_protect(struct super_block *sb, if (seq != le32_to_cpu(mmp->mmp_seq)) { dump_mmp_msg(sb, mmp, "Device is already active on another node."); + retval = -EBUSY; goto failed; } @@ -340,7 +352,11 @@ skip: seq = mmp_new_seq(); mmp->mmp_seq = cpu_to_le32(seq); - retval = write_mmp_block(sb, bh); + /* + * On mount / remount we are protected against fs freezing (by s_umount + * semaphore) and grabbing freeze protection upsets lockdep + */ + retval = write_mmp_block_thawed(sb, bh); if (retval) goto failed; @@ -349,6 +365,7 @@ skip: */ if (schedule_timeout_interruptible(HZ * wait_time) != 0) { ext4_warning(sb, "MMP startup interrupted, failing mount"); + retval = -ETIMEDOUT; goto failed; } @@ -359,6 +376,7 @@ skip: if (seq != le32_to_cpu(mmp->mmp_seq)) { dump_mmp_msg(sb, mmp, "Device is already active on another node."); + retval = -EBUSY; goto failed; } @@ -378,6 +396,7 @@ skip: EXT4_SB(sb)->s_mmp_tsk = NULL; ext4_warning(sb, "Unable to create kmmpd thread for %s.", sb->s_id); + retval = -ENOMEM; goto failed; } @@ -385,5 +404,5 @@ skip: failed: brelse(bh); - return 1; + return retval; } diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 2de9829aed63..b5af2fc03b2f 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -126,7 +126,6 @@ mext_folio_double_lock(struct inode *inode1, struct inode *inode2, { struct address_space *mapping[2]; unsigned int flags; - unsigned fgp_flags = FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE; BUG_ON(!inode1 || !inode2); if (inode1 < inode2) { @@ -139,20 +138,20 @@ mext_folio_double_lock(struct inode *inode1, struct inode *inode2, } flags = memalloc_nofs_save(); - folio[0] = __filemap_get_folio(mapping[0], index1, fgp_flags, + folio[0] = __filemap_get_folio(mapping[0], index1, FGP_WRITEBEGIN, mapping_gfp_mask(mapping[0])); - if (!folio[0]) { + if (IS_ERR(folio[0])) { memalloc_nofs_restore(flags); - return -ENOMEM; + return PTR_ERR(folio[0]); } - folio[1] = __filemap_get_folio(mapping[1], index2, fgp_flags, + folio[1] = __filemap_get_folio(mapping[1], index2, FGP_WRITEBEGIN, mapping_gfp_mask(mapping[1])); memalloc_nofs_restore(flags); - if (!folio[1]) { + if (IS_ERR(folio[1])) { folio_unlock(folio[0]); folio_put(folio[0]); - return -ENOMEM; + return PTR_ERR(folio[1]); } /* * __filemap_get_folio() may not wait on folio's writeback if @@ -169,25 +168,27 @@ mext_folio_double_lock(struct inode *inode1, struct inode *inode2, /* Force page buffers uptodate w/o dropping page's lock */ static int -mext_page_mkuptodate(struct page *page, unsigned from, unsigned to) +mext_page_mkuptodate(struct folio *folio, unsigned from, unsigned to) { - struct inode *inode = page->mapping->host; + struct inode *inode = folio->mapping->host; sector_t block; struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE]; unsigned int blocksize, block_start, block_end; int i, err, nr = 0, partial = 0; - BUG_ON(!PageLocked(page)); - BUG_ON(PageWriteback(page)); + BUG_ON(!folio_test_locked(folio)); + BUG_ON(folio_test_writeback(folio)); - if (PageUptodate(page)) + if (folio_test_uptodate(folio)) return 0; blocksize = i_blocksize(inode); - if (!page_has_buffers(page)) - create_empty_buffers(page, blocksize, 0); + head = folio_buffers(folio); + if (!head) { + create_empty_buffers(&folio->page, blocksize, 0); + head = folio_buffers(folio); + } - head = page_buffers(page); - block = (sector_t)page->index << (PAGE_SHIFT - inode->i_blkbits); + block = (sector_t)folio->index << (PAGE_SHIFT - inode->i_blkbits); for (bh = head, block_start = 0; bh != head || !block_start; block++, block_start = block_end, bh = bh->b_this_page) { block_end = block_start + blocksize; @@ -201,11 +202,11 @@ mext_page_mkuptodate(struct page *page, unsigned from, unsigned to) if (!buffer_mapped(bh)) { err = ext4_get_block(inode, block, bh, 0); if (err) { - SetPageError(page); + folio_set_error(folio); return err; } if (!buffer_mapped(bh)) { - zero_user(page, block_start, blocksize); + folio_zero_range(folio, block_start, blocksize); set_buffer_uptodate(bh); continue; } @@ -227,7 +228,7 @@ mext_page_mkuptodate(struct page *page, unsigned from, unsigned to) } out: if (!partial) - SetPageUptodate(page); + folio_mark_uptodate(folio); return 0; } @@ -355,7 +356,7 @@ again: goto unlock_folios; } data_copy: - *err = mext_page_mkuptodate(&folio[0]->page, from, from + replaced_size); + *err = mext_page_mkuptodate(folio[0], from, from + replaced_size); if (*err) goto unlock_folios; diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index a5010b5b8a8c..45b579805c95 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -674,7 +674,7 @@ static struct stats dx_show_leaf(struct inode *dir, len = de->name_len; if (!IS_ENCRYPTED(dir)) { /* Directory is not encrypted */ - ext4fs_dirhash(dir, de->name, + (void) ext4fs_dirhash(dir, de->name, de->name_len, &h); printk("%*.s:(U)%x.%u ", len, name, h.hash, @@ -709,8 +709,9 @@ static struct stats dx_show_leaf(struct inode *dir, if (IS_CASEFOLDED(dir)) h.hash = EXT4_DIRENT_HASH(de); else - ext4fs_dirhash(dir, de->name, - de->name_len, &h); + (void) ext4fs_dirhash(dir, + de->name, + de->name_len, &h); printk("%*.s:(E)%x.%u ", len, name, h.hash, (unsigned) ((char *) de - base)); @@ -720,7 +721,8 @@ static struct stats dx_show_leaf(struct inode *dir, #else int len = de->name_len; char *name = de->name; - ext4fs_dirhash(dir, de->name, de->name_len, &h); + (void) ext4fs_dirhash(dir, de->name, + de->name_len, &h); printk("%*.s:%x.%u ", len, name, h.hash, (unsigned) ((char *) de - base)); #endif @@ -849,8 +851,14 @@ dx_probe(struct ext4_filename *fname, struct inode *dir, hinfo->seed = EXT4_SB(dir->i_sb)->s_hash_seed; /* hash is already computed for encrypted casefolded directory */ if (fname && fname_name(fname) && - !(IS_ENCRYPTED(dir) && IS_CASEFOLDED(dir))) - ext4fs_dirhash(dir, fname_name(fname), fname_len(fname), hinfo); + !(IS_ENCRYPTED(dir) && IS_CASEFOLDED(dir))) { + int ret = ext4fs_dirhash(dir, fname_name(fname), + fname_len(fname), hinfo); + if (ret < 0) { + ret_err = ERR_PTR(ret); + goto fail; + } + } hash = hinfo->hash; if (root->info.unused_flags & 1) { @@ -1111,7 +1119,12 @@ static int htree_dirblock_to_tree(struct file *dir_file, hinfo->minor_hash = 0; } } else { - ext4fs_dirhash(dir, de->name, de->name_len, hinfo); + err = ext4fs_dirhash(dir, de->name, + de->name_len, hinfo); + if (err < 0) { + count = err; + goto errout; + } } if ((hinfo->hash < start_hash) || ((hinfo->hash == start_hash) && @@ -1313,8 +1326,12 @@ static int dx_make_map(struct inode *dir, struct buffer_head *bh, if (de->name_len && de->inode) { if (ext4_hash_in_dirent(dir)) h.hash = EXT4_DIRENT_HASH(de); - else - ext4fs_dirhash(dir, de->name, de->name_len, &h); + else { + int err = ext4fs_dirhash(dir, de->name, + de->name_len, &h); + if (err < 0) + return err; + } map_tail--; map_tail->hash = h.hash; map_tail->offs = ((char *) de - base)>>2; @@ -1452,10 +1469,9 @@ int ext4_fname_setup_ci_filename(struct inode *dir, const struct qstr *iname, hinfo->hash_version = DX_HASH_SIPHASH; hinfo->seed = NULL; if (cf_name->name) - ext4fs_dirhash(dir, cf_name->name, cf_name->len, hinfo); + return ext4fs_dirhash(dir, cf_name->name, cf_name->len, hinfo); else - ext4fs_dirhash(dir, iname->name, iname->len, hinfo); - return 0; + return ext4fs_dirhash(dir, iname->name, iname->len, hinfo); } #endif @@ -2298,10 +2314,15 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname, fname->hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed; /* casefolded encrypted hashes are computed on fname setup */ - if (!ext4_hash_in_dirent(dir)) - ext4fs_dirhash(dir, fname_name(fname), - fname_len(fname), &fname->hinfo); - + if (!ext4_hash_in_dirent(dir)) { + int err = ext4fs_dirhash(dir, fname_name(fname), + fname_len(fname), &fname->hinfo); + if (err < 0) { + brelse(bh2); + brelse(bh); + return err; + } + } memset(frames, 0, sizeof(frames)); frame = frames; frame->entries = entries; diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 1e4db96a04e6..3621f29ec671 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -99,30 +99,30 @@ static void buffer_io_error(struct buffer_head *bh) static void ext4_finish_bio(struct bio *bio) { - struct bio_vec *bvec; - struct bvec_iter_all iter_all; + struct folio_iter fi; - bio_for_each_segment_all(bvec, bio, iter_all) { - struct page *page = bvec->bv_page; - struct page *bounce_page = NULL; + bio_for_each_folio_all(fi, bio) { + struct folio *folio = fi.folio; + struct folio *io_folio = NULL; struct buffer_head *bh, *head; - unsigned bio_start = bvec->bv_offset; - unsigned bio_end = bio_start + bvec->bv_len; + size_t bio_start = fi.offset; + size_t bio_end = bio_start + fi.length; unsigned under_io = 0; unsigned long flags; - if (fscrypt_is_bounce_page(page)) { - bounce_page = page; - page = fscrypt_pagecache_page(bounce_page); + if (fscrypt_is_bounce_folio(folio)) { + io_folio = folio; + folio = fscrypt_pagecache_folio(folio); } if (bio->bi_status) { - SetPageError(page); - mapping_set_error(page->mapping, -EIO); + int err = blk_status_to_errno(bio->bi_status); + folio_set_error(folio); + mapping_set_error(folio->mapping, err); } - bh = head = page_buffers(page); + bh = head = folio_buffers(folio); /* - * We check all buffers in the page under b_uptodate_lock + * We check all buffers in the folio under b_uptodate_lock * to avoid races with other end io clearing async_write flags */ spin_lock_irqsave(&head->b_uptodate_lock, flags); @@ -141,8 +141,8 @@ static void ext4_finish_bio(struct bio *bio) } while ((bh = bh->b_this_page) != head); spin_unlock_irqrestore(&head->b_uptodate_lock, flags); if (!under_io) { - fscrypt_free_bounce_page(bounce_page); - end_page_writeback(page); + fscrypt_free_bounce_page(&io_folio->page); + folio_end_writeback(folio); } } } @@ -409,12 +409,10 @@ static void io_submit_init_bio(struct ext4_io_submit *io, static void io_submit_add_bh(struct ext4_io_submit *io, struct inode *inode, - struct page *pagecache_page, - struct page *bounce_page, + struct folio *folio, + struct folio *io_folio, struct buffer_head *bh) { - int ret; - if (io->io_bio && (bh->b_blocknr != io->io_next_block || !fscrypt_mergeable_bio_bh(io->io_bio, bh))) { submit_and_retry: @@ -422,20 +420,17 @@ submit_and_retry: } if (io->io_bio == NULL) io_submit_init_bio(io, bh); - ret = bio_add_page(io->io_bio, bounce_page ?: pagecache_page, - bh->b_size, bh_offset(bh)); - if (ret != bh->b_size) + if (!bio_add_folio(io->io_bio, io_folio, bh->b_size, bh_offset(bh))) goto submit_and_retry; - wbc_account_cgroup_owner(io->io_wbc, pagecache_page, bh->b_size); + wbc_account_cgroup_owner(io->io_wbc, &folio->page, bh->b_size); io->io_next_block++; } -int ext4_bio_write_page(struct ext4_io_submit *io, - struct page *page, - int len) +int ext4_bio_write_folio(struct ext4_io_submit *io, struct folio *folio, + size_t len) { - struct page *bounce_page = NULL; - struct inode *inode = page->mapping->host; + struct folio *io_folio = folio; + struct inode *inode = folio->mapping->host; unsigned block_start; struct buffer_head *bh, *head; int ret = 0; @@ -443,30 +438,30 @@ int ext4_bio_write_page(struct ext4_io_submit *io, struct writeback_control *wbc = io->io_wbc; bool keep_towrite = false; - BUG_ON(!PageLocked(page)); - BUG_ON(PageWriteback(page)); + BUG_ON(!folio_test_locked(folio)); + BUG_ON(folio_test_writeback(folio)); - ClearPageError(page); + folio_clear_error(folio); /* * Comments copied from block_write_full_page: * - * The page straddles i_size. It must be zeroed out on each and every + * The folio straddles i_size. It must be zeroed out on each and every * writepage invocation because it may be mmapped. "A file is mapped * in multiples of the page size. For a file that is not a multiple of * the page size, the remaining memory is zeroed when mapped, and * writes to that region are not written out to the file." */ - if (len < PAGE_SIZE) - zero_user_segment(page, len, PAGE_SIZE); + if (len < folio_size(folio)) + folio_zero_segment(folio, len, folio_size(folio)); /* * In the first loop we prepare and mark buffers to submit. We have to - * mark all buffers in the page before submitting so that - * end_page_writeback() cannot be called from ext4_end_bio() when IO + * mark all buffers in the folio before submitting so that + * folio_end_writeback() cannot be called from ext4_end_bio() when IO * on the first buffer finishes and we are still working on submitting * the second buffer. */ - bh = head = page_buffers(page); + bh = head = folio_buffers(folio); do { block_start = bh_offset(bh); if (block_start >= len) { @@ -481,14 +476,16 @@ int ext4_bio_write_page(struct ext4_io_submit *io, clear_buffer_dirty(bh); /* * Keeping dirty some buffer we cannot write? Make sure - * to redirty the page and keep TOWRITE tag so that - * racing WB_SYNC_ALL writeback does not skip the page. + * to redirty the folio and keep TOWRITE tag so that + * racing WB_SYNC_ALL writeback does not skip the folio. * This happens e.g. when doing writeout for - * transaction commit. + * transaction commit or when journalled data is not + * yet committed. */ - if (buffer_dirty(bh)) { - if (!PageDirty(page)) - redirty_page_for_writepage(wbc, page); + if (buffer_dirty(bh) || + (buffer_jbd(bh) && buffer_jbddirty(bh))) { + if (!folio_test_dirty(folio)) + folio_redirty_for_writepage(wbc, folio); keep_towrite = true; } continue; @@ -500,11 +497,11 @@ int ext4_bio_write_page(struct ext4_io_submit *io, nr_to_submit++; } while ((bh = bh->b_this_page) != head); - /* Nothing to submit? Just unlock the page... */ + /* Nothing to submit? Just unlock the folio... */ if (!nr_to_submit) - goto unlock; + return 0; - bh = head = page_buffers(page); + bh = head = folio_buffers(folio); /* * If any blocks are being written to an encrypted file, encrypt them @@ -513,9 +510,10 @@ int ext4_bio_write_page(struct ext4_io_submit *io, * (e.g. holes) to be unnecessarily encrypted, but this is rare and * can't happen in the common case of blocksize == PAGE_SIZE. */ - if (fscrypt_inode_uses_fs_layer_crypto(inode) && nr_to_submit) { + if (fscrypt_inode_uses_fs_layer_crypto(inode)) { gfp_t gfp_flags = GFP_NOFS; unsigned int enc_bytes = round_up(len, i_blocksize(inode)); + struct page *bounce_page; /* * Since bounce page allocation uses a mempool, we can only use @@ -525,8 +523,8 @@ int ext4_bio_write_page(struct ext4_io_submit *io, if (io->io_bio) gfp_flags = GFP_NOWAIT | __GFP_NOWARN; retry_encrypt: - bounce_page = fscrypt_encrypt_pagecache_blocks(page, enc_bytes, - 0, gfp_flags); + bounce_page = fscrypt_encrypt_pagecache_blocks(&folio->page, + enc_bytes, 0, gfp_flags); if (IS_ERR(bounce_page)) { ret = PTR_ERR(bounce_page); if (ret == -ENOMEM && @@ -542,7 +540,7 @@ int ext4_bio_write_page(struct ext4_io_submit *io, } printk_ratelimited(KERN_ERR "%s: ret = %d\n", __func__, ret); - redirty_page_for_writepage(wbc, page); + folio_redirty_for_writepage(wbc, folio); do { if (buffer_async_write(bh)) { clear_buffer_async_write(bh); @@ -550,22 +548,20 @@ int ext4_bio_write_page(struct ext4_io_submit *io, } bh = bh->b_this_page; } while (bh != head); - goto unlock; + + return ret; } + io_folio = page_folio(bounce_page); } - if (keep_towrite) - set_page_writeback_keepwrite(page); - else - set_page_writeback(page); + __folio_start_writeback(folio, keep_towrite); /* Now submit buffers to write */ do { if (!buffer_async_write(bh)) continue; - io_submit_add_bh(io, inode, page, bounce_page, bh); + io_submit_add_bh(io, inode, folio, io_folio, bh); } while ((bh = bh->b_this_page) != head); -unlock: - unlock_page(page); - return ret; + + return 0; } diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c index c61dc8a7c014..6f46823fba61 100644 --- a/fs/ext4/readpage.c +++ b/fs/ext4/readpage.c @@ -68,18 +68,16 @@ struct bio_post_read_ctx { static void __read_end_io(struct bio *bio) { - struct page *page; - struct bio_vec *bv; - struct bvec_iter_all iter_all; + struct folio_iter fi; - bio_for_each_segment_all(bv, bio, iter_all) { - page = bv->bv_page; + bio_for_each_folio_all(fi, bio) { + struct folio *folio = fi.folio; if (bio->bi_status) - ClearPageUptodate(page); + folio_clear_uptodate(folio); else - SetPageUptodate(page); - unlock_page(page); + folio_mark_uptodate(folio); + folio_unlock(folio); } if (bio->bi_private) mempool_free(bio->bi_private, bio_post_read_ctx_pool); @@ -218,7 +216,7 @@ static inline loff_t ext4_readpage_limit(struct inode *inode) } int ext4_mpage_readpages(struct inode *inode, - struct readahead_control *rac, struct page *page) + struct readahead_control *rac, struct folio *folio) { struct bio *bio = NULL; sector_t last_block_in_bio = 0; @@ -247,16 +245,15 @@ int ext4_mpage_readpages(struct inode *inode, int fully_mapped = 1; unsigned first_hole = blocks_per_page; - if (rac) { - page = readahead_page(rac); - prefetchw(&page->flags); - } + if (rac) + folio = readahead_folio(rac); + prefetchw(&folio->flags); - if (page_has_buffers(page)) + if (folio_buffers(folio)) goto confused; block_in_file = next_block = - (sector_t)page->index << (PAGE_SHIFT - blkbits); + (sector_t)folio->index << (PAGE_SHIFT - blkbits); last_block = block_in_file + nr_pages * blocks_per_page; last_block_in_file = (ext4_readpage_limit(inode) + blocksize - 1) >> blkbits; @@ -290,7 +287,7 @@ int ext4_mpage_readpages(struct inode *inode, /* * Then do more ext4_map_blocks() calls until we are - * done with this page. + * done with this folio. */ while (page_block < blocks_per_page) { if (block_in_file < last_block) { @@ -299,10 +296,10 @@ int ext4_mpage_readpages(struct inode *inode, if (ext4_map_blocks(NULL, inode, &map, 0) < 0) { set_error_page: - SetPageError(page); - zero_user_segment(page, 0, - PAGE_SIZE); - unlock_page(page); + folio_set_error(folio); + folio_zero_segment(folio, 0, + folio_size(folio)); + folio_unlock(folio); goto next_page; } } @@ -333,22 +330,22 @@ int ext4_mpage_readpages(struct inode *inode, } } if (first_hole != blocks_per_page) { - zero_user_segment(page, first_hole << blkbits, - PAGE_SIZE); + folio_zero_segment(folio, first_hole << blkbits, + folio_size(folio)); if (first_hole == 0) { - if (ext4_need_verity(inode, page->index) && - !fsverity_verify_page(page)) + if (ext4_need_verity(inode, folio->index) && + !fsverity_verify_page(&folio->page)) goto set_error_page; - SetPageUptodate(page); - unlock_page(page); - goto next_page; + folio_mark_uptodate(folio); + folio_unlock(folio); + continue; } } else if (fully_mapped) { - SetPageMappedToDisk(page); + folio_set_mappedtodisk(folio); } /* - * This page will go to BIO. Do we need to send this + * This folio will go to BIO. Do we need to send this * BIO off first? */ if (bio && (last_block_in_bio != blocks[0] - 1 || @@ -366,7 +363,7 @@ int ext4_mpage_readpages(struct inode *inode, REQ_OP_READ, GFP_KERNEL); fscrypt_set_bio_crypt_ctx(bio, inode, next_block, GFP_KERNEL); - ext4_set_bio_post_read_ctx(bio, inode, page->index); + ext4_set_bio_post_read_ctx(bio, inode, folio->index); bio->bi_iter.bi_sector = blocks[0] << (blkbits - 9); bio->bi_end_io = mpage_end_io; if (rac) @@ -374,7 +371,7 @@ int ext4_mpage_readpages(struct inode *inode, } length = first_hole << blkbits; - if (bio_add_page(bio, page, length, 0) < length) + if (!bio_add_folio(bio, folio, length, 0)) goto submit_and_realloc; if (((map.m_flags & EXT4_MAP_BOUNDARY) && @@ -384,19 +381,18 @@ int ext4_mpage_readpages(struct inode *inode, bio = NULL; } else last_block_in_bio = blocks[blocks_per_page - 1]; - goto next_page; + continue; confused: if (bio) { submit_bio(bio); bio = NULL; } - if (!PageUptodate(page)) - block_read_full_folio(page_folio(page), ext4_get_block); + if (!folio_test_uptodate(folio)) + block_read_full_folio(folio, ext4_get_block); else - unlock_page(page); - next_page: - if (rac) - put_page(page); + folio_unlock(folio); +next_page: + ; /* A label shall be followed by a statement until C23 */ } if (bio) submit_bio(bio); diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 6b91443d6bf3..0361c20910de 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -1306,7 +1306,6 @@ static struct buffer_head *ext4_get_bitmap(struct super_block *sb, __u64 block) } static int ext4_set_bitmap_checksums(struct super_block *sb, - ext4_group_t group, struct ext4_group_desc *gdp, struct ext4_new_group_data *group_data) { @@ -1318,14 +1317,14 @@ static int ext4_set_bitmap_checksums(struct super_block *sb, bh = ext4_get_bitmap(sb, group_data->inode_bitmap); if (!bh) return -EIO; - ext4_inode_bitmap_csum_set(sb, group, gdp, bh, + ext4_inode_bitmap_csum_set(sb, gdp, bh, EXT4_INODES_PER_GROUP(sb) / 8); brelse(bh); bh = ext4_get_bitmap(sb, group_data->block_bitmap); if (!bh) return -EIO; - ext4_block_bitmap_csum_set(sb, group, gdp, bh); + ext4_block_bitmap_csum_set(sb, gdp, bh); brelse(bh); return 0; @@ -1363,7 +1362,7 @@ static int ext4_setup_new_descs(handle_t *handle, struct super_block *sb, memset(gdp, 0, EXT4_DESC_SIZE(sb)); ext4_block_bitmap_set(sb, gdp, group_data->block_bitmap); ext4_inode_bitmap_set(sb, gdp, group_data->inode_bitmap); - err = ext4_set_bitmap_checksums(sb, group, gdp, group_data); + err = ext4_set_bitmap_checksums(sb, gdp, group_data); if (err) { ext4_std_error(sb, err); break; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index f43e526112ae..9680fe753e59 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1048,6 +1048,8 @@ void ext4_mark_group_bitmap_corrupted(struct super_block *sb, struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL); int ret; + if (!grp || !gdp) + return; if (flags & EXT4_GROUP_INFO_BBITMAP_CORRUPT) { ret = ext4_test_and_set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state); @@ -1183,14 +1185,83 @@ static inline void ext4_quota_off_umount(struct super_block *sb) } #endif +static int ext4_percpu_param_init(struct ext4_sb_info *sbi) +{ + ext4_fsblk_t block; + int err; + + block = ext4_count_free_clusters(sbi->s_sb); + ext4_free_blocks_count_set(sbi->s_es, EXT4_C2B(sbi, block)); + err = percpu_counter_init(&sbi->s_freeclusters_counter, block, + GFP_KERNEL); + if (!err) { + unsigned long freei = ext4_count_free_inodes(sbi->s_sb); + sbi->s_es->s_free_inodes_count = cpu_to_le32(freei); + err = percpu_counter_init(&sbi->s_freeinodes_counter, freei, + GFP_KERNEL); + } + if (!err) + err = percpu_counter_init(&sbi->s_dirs_counter, + ext4_count_dirs(sbi->s_sb), GFP_KERNEL); + if (!err) + err = percpu_counter_init(&sbi->s_dirtyclusters_counter, 0, + GFP_KERNEL); + if (!err) + err = percpu_counter_init(&sbi->s_sra_exceeded_retry_limit, 0, + GFP_KERNEL); + if (!err) + err = percpu_init_rwsem(&sbi->s_writepages_rwsem); + + if (err) + ext4_msg(sbi->s_sb, KERN_ERR, "insufficient memory"); + + return err; +} + +static void ext4_percpu_param_destroy(struct ext4_sb_info *sbi) +{ + percpu_counter_destroy(&sbi->s_freeclusters_counter); + percpu_counter_destroy(&sbi->s_freeinodes_counter); + percpu_counter_destroy(&sbi->s_dirs_counter); + percpu_counter_destroy(&sbi->s_dirtyclusters_counter); + percpu_counter_destroy(&sbi->s_sra_exceeded_retry_limit); + percpu_free_rwsem(&sbi->s_writepages_rwsem); +} + +static void ext4_group_desc_free(struct ext4_sb_info *sbi) +{ + struct buffer_head **group_desc; + int i; + + rcu_read_lock(); + group_desc = rcu_dereference(sbi->s_group_desc); + for (i = 0; i < sbi->s_gdb_count; i++) + brelse(group_desc[i]); + kvfree(group_desc); + rcu_read_unlock(); +} + +static void ext4_flex_groups_free(struct ext4_sb_info *sbi) +{ + struct flex_groups **flex_groups; + int i; + + rcu_read_lock(); + flex_groups = rcu_dereference(sbi->s_flex_groups); + if (flex_groups) { + for (i = 0; i < sbi->s_flex_groups_allocated; i++) + kvfree(flex_groups[i]); + kvfree(flex_groups); + } + rcu_read_unlock(); +} + static void ext4_put_super(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; - struct buffer_head **group_desc; - struct flex_groups **flex_groups; int aborted = 0; - int i, err; + int err; /* * Unregister sysfs before destroying jbd2 journal. @@ -1238,26 +1309,11 @@ static void ext4_put_super(struct super_block *sb) if (!sb_rdonly(sb)) ext4_commit_super(sb); - rcu_read_lock(); - group_desc = rcu_dereference(sbi->s_group_desc); - for (i = 0; i < sbi->s_gdb_count; i++) - brelse(group_desc[i]); - kvfree(group_desc); - flex_groups = rcu_dereference(sbi->s_flex_groups); - if (flex_groups) { - for (i = 0; i < sbi->s_flex_groups_allocated; i++) - kvfree(flex_groups[i]); - kvfree(flex_groups); - } - rcu_read_unlock(); - percpu_counter_destroy(&sbi->s_freeclusters_counter); - percpu_counter_destroy(&sbi->s_freeinodes_counter); - percpu_counter_destroy(&sbi->s_dirs_counter); - percpu_counter_destroy(&sbi->s_dirtyclusters_counter); - percpu_counter_destroy(&sbi->s_sra_exceeded_retry_limit); - percpu_free_rwsem(&sbi->s_writepages_rwsem); + ext4_group_desc_free(sbi); + ext4_flex_groups_free(sbi); + ext4_percpu_param_destroy(sbi); #ifdef CONFIG_QUOTA - for (i = 0; i < EXT4_MAXQUOTAS; i++) + for (int i = 0; i < EXT4_MAXQUOTAS; i++) kfree(get_qf_name(sb, sbi, i)); #endif @@ -1325,9 +1381,9 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) inode_set_iversion(&ei->vfs_inode, 1); ei->i_flags = 0; spin_lock_init(&ei->i_raw_lock); - INIT_LIST_HEAD(&ei->i_prealloc_list); + ei->i_prealloc_node = RB_ROOT; atomic_set(&ei->i_prealloc_active, 0); - spin_lock_init(&ei->i_prealloc_lock); + rwlock_init(&ei->i_prealloc_lock); ext4_es_init_tree(&ei->i_es_tree); rwlock_init(&ei->i_es_lock); INIT_LIST_HEAD(&ei->i_es_list); @@ -2500,7 +2556,7 @@ static void ext4_apply_quota_options(struct fs_context *fc, qname = rcu_replace_pointer(sbi->s_qf_names[i], qname, lockdep_is_held(&sb->s_umount)); if (qname) - kfree_rcu(qname); + kfree_rcu_mightsleep(qname); } } @@ -3184,11 +3240,9 @@ static __le16 ext4_group_desc_csum(struct super_block *sb, __u32 block_group, crc = crc16(crc, (__u8 *)gdp, offset); offset += sizeof(gdp->bg_checksum); /* skip checksum */ /* for checksum of struct ext4_group_desc do the rest...*/ - if (ext4_has_feature_64bit(sb) && - offset < le16_to_cpu(sbi->s_es->s_desc_size)) + if (ext4_has_feature_64bit(sb) && offset < sbi->s_desc_size) crc = crc16(crc, (__u8 *)gdp + offset, - le16_to_cpu(sbi->s_es->s_desc_size) - - offset); + sbi->s_desc_size - offset); out: return cpu_to_le16(crc); @@ -4587,6 +4641,8 @@ static int ext4_check_feature_compatibility(struct super_block *sb, struct ext4_super_block *es, int silent) { + struct ext4_sb_info *sbi = EXT4_SB(sb); + if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV && (ext4_has_compat_features(sb) || ext4_has_ro_compat_features(sb) || @@ -4656,14 +4712,59 @@ static int ext4_check_feature_compatibility(struct super_block *sb, if (!ext4_feature_set_ok(sb, (sb_rdonly(sb)))) return -EINVAL; + if (sbi->s_daxdev) { + if (sb->s_blocksize == PAGE_SIZE) + set_bit(EXT4_FLAGS_BDEV_IS_DAX, &sbi->s_ext4_flags); + else + ext4_msg(sb, KERN_ERR, "unsupported blocksize for DAX\n"); + } + + if (sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS) { + if (ext4_has_feature_inline_data(sb)) { + ext4_msg(sb, KERN_ERR, "Cannot use DAX on a filesystem" + " that may contain inline data"); + return -EINVAL; + } + if (!test_bit(EXT4_FLAGS_BDEV_IS_DAX, &sbi->s_ext4_flags)) { + ext4_msg(sb, KERN_ERR, + "DAX unsupported by block device."); + return -EINVAL; + } + } + + if (ext4_has_feature_encrypt(sb) && es->s_encryption_level) { + ext4_msg(sb, KERN_ERR, "Unsupported encryption level %d", + es->s_encryption_level); + return -EINVAL; + } + return 0; } -static int ext4_geometry_check(struct super_block *sb, +static int ext4_check_geometry(struct super_block *sb, struct ext4_super_block *es) { struct ext4_sb_info *sbi = EXT4_SB(sb); __u64 blocks_count; + int err; + + if (le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) > (sb->s_blocksize / 4)) { + ext4_msg(sb, KERN_ERR, + "Number of reserved GDT blocks insanely large: %d", + le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks)); + return -EINVAL; + } + /* + * Test whether we have more sectors than will fit in sector_t, + * and whether the max offset is addressable by the page cache. + */ + err = generic_check_addressable(sb->s_blocksize_bits, + ext4_blocks_count(es)); + if (err) { + ext4_msg(sb, KERN_ERR, "filesystem" + " too large to mount safely on this system"); + return err; + } /* check blocks count against device size */ blocks_count = sb_bdev_nr_blocks(sb); @@ -4719,19 +4820,6 @@ static int ext4_geometry_check(struct super_block *sb, return 0; } -static void ext4_group_desc_free(struct ext4_sb_info *sbi) -{ - struct buffer_head **group_desc; - int i; - - rcu_read_lock(); - group_desc = rcu_dereference(sbi->s_group_desc); - for (i = 0; i < sbi->s_gdb_count; i++) - brelse(group_desc[i]); - kvfree(group_desc); - rcu_read_unlock(); -} - static int ext4_group_desc_init(struct super_block *sb, struct ext4_super_block *es, ext4_fsblk_t logical_sb_block, @@ -4881,7 +4969,7 @@ out: return -EINVAL; } -static int ext4_journal_data_mode_check(struct super_block *sb) +static int ext4_check_journal_data_mode(struct super_block *sb) { if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { printk_once(KERN_WARNING "EXT4-fs: Warning: mounting with " @@ -5024,18 +5112,92 @@ out: return ret; } +static void ext4_hash_info_init(struct super_block *sb) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es = sbi->s_es; + unsigned int i; + + for (i = 0; i < 4; i++) + sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]); + + sbi->s_def_hash_version = es->s_def_hash_version; + if (ext4_has_feature_dir_index(sb)) { + i = le32_to_cpu(es->s_flags); + if (i & EXT2_FLAGS_UNSIGNED_HASH) + sbi->s_hash_unsigned = 3; + else if ((i & EXT2_FLAGS_SIGNED_HASH) == 0) { +#ifdef __CHAR_UNSIGNED__ + if (!sb_rdonly(sb)) + es->s_flags |= + cpu_to_le32(EXT2_FLAGS_UNSIGNED_HASH); + sbi->s_hash_unsigned = 3; +#else + if (!sb_rdonly(sb)) + es->s_flags |= + cpu_to_le32(EXT2_FLAGS_SIGNED_HASH); +#endif + } + } +} + +static int ext4_block_group_meta_init(struct super_block *sb, int silent) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es = sbi->s_es; + int has_huge_files; + + has_huge_files = ext4_has_feature_huge_file(sb); + sbi->s_bitmap_maxbytes = ext4_max_bitmap_size(sb->s_blocksize_bits, + has_huge_files); + sb->s_maxbytes = ext4_max_size(sb->s_blocksize_bits, has_huge_files); + + sbi->s_desc_size = le16_to_cpu(es->s_desc_size); + if (ext4_has_feature_64bit(sb)) { + if (sbi->s_desc_size < EXT4_MIN_DESC_SIZE_64BIT || + sbi->s_desc_size > EXT4_MAX_DESC_SIZE || + !is_power_of_2(sbi->s_desc_size)) { + ext4_msg(sb, KERN_ERR, + "unsupported descriptor size %lu", + sbi->s_desc_size); + return -EINVAL; + } + } else + sbi->s_desc_size = EXT4_MIN_DESC_SIZE; + + sbi->s_blocks_per_group = le32_to_cpu(es->s_blocks_per_group); + sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group); + + sbi->s_inodes_per_block = sb->s_blocksize / EXT4_INODE_SIZE(sb); + if (sbi->s_inodes_per_block == 0 || sbi->s_blocks_per_group == 0) { + if (!silent) + ext4_msg(sb, KERN_ERR, "VFS: Can't find ext4 filesystem"); + return -EINVAL; + } + if (sbi->s_inodes_per_group < sbi->s_inodes_per_block || + sbi->s_inodes_per_group > sb->s_blocksize * 8) { + ext4_msg(sb, KERN_ERR, "invalid inodes per group: %lu\n", + sbi->s_inodes_per_group); + return -EINVAL; + } + sbi->s_itb_per_group = sbi->s_inodes_per_group / + sbi->s_inodes_per_block; + sbi->s_desc_per_block = sb->s_blocksize / EXT4_DESC_SIZE(sb); + sbi->s_mount_state = le16_to_cpu(es->s_state) & ~EXT4_FC_REPLAY; + sbi->s_addr_per_block_bits = ilog2(EXT4_ADDR_PER_BLOCK(sb)); + sbi->s_desc_per_block_bits = ilog2(EXT4_DESC_PER_BLOCK(sb)); + + return 0; +} + static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) { struct ext4_super_block *es = NULL; struct ext4_sb_info *sbi = EXT4_SB(sb); - struct flex_groups **flex_groups; - ext4_fsblk_t block; ext4_fsblk_t logical_sb_block; struct inode *root; - int ret = -ENOMEM; - unsigned int i; - int needs_recovery, has_huge_files; - int err = 0; + int needs_recovery; + int err; ext4_group_t first_not_zeroed; struct ext4_fs_context *ctx = fc->fs_private; int silent = fc->sb_flags & SB_SILENT; @@ -5048,8 +5210,6 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) sbi->s_sectors_written_start = part_stat_read(sb->s_bdev, sectors[STAT_WRITE]); - /* -EINVAL is default */ - ret = -EINVAL; err = ext4_load_super(sb, &logical_sb_block, silent); if (err) goto out_fail; @@ -5075,7 +5235,8 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) */ sbi->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT; - if (ext4_inode_info_init(sb, es)) + err = ext4_inode_info_init(sb, es); + if (err) goto failed_mount; err = parse_apply_sb_mount_options(sb, ctx); @@ -5091,10 +5252,12 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) ext4_apply_options(fc, sb); - if (ext4_encoding_init(sb, es)) + err = ext4_encoding_init(sb, es); + if (err) goto failed_mount; - if (ext4_journal_data_mode_check(sb)) + err = ext4_check_journal_data_mode(sb); + if (err) goto failed_mount; sb->s_flags = (sb->s_flags & ~SB_POSIXACL) | @@ -5103,119 +5266,22 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) /* i_version is always enabled now */ sb->s_flags |= SB_I_VERSION; - if (ext4_check_feature_compatibility(sb, es, silent)) - goto failed_mount; - - if (le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) > (sb->s_blocksize / 4)) { - ext4_msg(sb, KERN_ERR, - "Number of reserved GDT blocks insanely large: %d", - le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks)); - goto failed_mount; - } - - if (sbi->s_daxdev) { - if (sb->s_blocksize == PAGE_SIZE) - set_bit(EXT4_FLAGS_BDEV_IS_DAX, &sbi->s_ext4_flags); - else - ext4_msg(sb, KERN_ERR, "unsupported blocksize for DAX\n"); - } - - if (sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS) { - if (ext4_has_feature_inline_data(sb)) { - ext4_msg(sb, KERN_ERR, "Cannot use DAX on a filesystem" - " that may contain inline data"); - goto failed_mount; - } - if (!test_bit(EXT4_FLAGS_BDEV_IS_DAX, &sbi->s_ext4_flags)) { - ext4_msg(sb, KERN_ERR, - "DAX unsupported by block device."); - goto failed_mount; - } - } - - if (ext4_has_feature_encrypt(sb) && es->s_encryption_level) { - ext4_msg(sb, KERN_ERR, "Unsupported encryption level %d", - es->s_encryption_level); + err = ext4_check_feature_compatibility(sb, es, silent); + if (err) goto failed_mount; - } - - has_huge_files = ext4_has_feature_huge_file(sb); - sbi->s_bitmap_maxbytes = ext4_max_bitmap_size(sb->s_blocksize_bits, - has_huge_files); - sb->s_maxbytes = ext4_max_size(sb->s_blocksize_bits, has_huge_files); - - sbi->s_desc_size = le16_to_cpu(es->s_desc_size); - if (ext4_has_feature_64bit(sb)) { - if (sbi->s_desc_size < EXT4_MIN_DESC_SIZE_64BIT || - sbi->s_desc_size > EXT4_MAX_DESC_SIZE || - !is_power_of_2(sbi->s_desc_size)) { - ext4_msg(sb, KERN_ERR, - "unsupported descriptor size %lu", - sbi->s_desc_size); - goto failed_mount; - } - } else - sbi->s_desc_size = EXT4_MIN_DESC_SIZE; - sbi->s_blocks_per_group = le32_to_cpu(es->s_blocks_per_group); - sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group); - - sbi->s_inodes_per_block = sb->s_blocksize / EXT4_INODE_SIZE(sb); - if (sbi->s_inodes_per_block == 0 || sbi->s_blocks_per_group == 0) { - if (!silent) - ext4_msg(sb, KERN_ERR, "VFS: Can't find ext4 filesystem"); - goto failed_mount; - } - if (sbi->s_inodes_per_group < sbi->s_inodes_per_block || - sbi->s_inodes_per_group > sb->s_blocksize * 8) { - ext4_msg(sb, KERN_ERR, "invalid inodes per group: %lu\n", - sbi->s_inodes_per_group); + err = ext4_block_group_meta_init(sb, silent); + if (err) goto failed_mount; - } - sbi->s_itb_per_group = sbi->s_inodes_per_group / - sbi->s_inodes_per_block; - sbi->s_desc_per_block = sb->s_blocksize / EXT4_DESC_SIZE(sb); - sbi->s_mount_state = le16_to_cpu(es->s_state) & ~EXT4_FC_REPLAY; - sbi->s_addr_per_block_bits = ilog2(EXT4_ADDR_PER_BLOCK(sb)); - sbi->s_desc_per_block_bits = ilog2(EXT4_DESC_PER_BLOCK(sb)); - for (i = 0; i < 4; i++) - sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]); - sbi->s_def_hash_version = es->s_def_hash_version; - if (ext4_has_feature_dir_index(sb)) { - i = le32_to_cpu(es->s_flags); - if (i & EXT2_FLAGS_UNSIGNED_HASH) - sbi->s_hash_unsigned = 3; - else if ((i & EXT2_FLAGS_SIGNED_HASH) == 0) { -#ifdef __CHAR_UNSIGNED__ - if (!sb_rdonly(sb)) - es->s_flags |= - cpu_to_le32(EXT2_FLAGS_UNSIGNED_HASH); - sbi->s_hash_unsigned = 3; -#else - if (!sb_rdonly(sb)) - es->s_flags |= - cpu_to_le32(EXT2_FLAGS_SIGNED_HASH); -#endif - } - } + ext4_hash_info_init(sb); - if (ext4_handle_clustersize(sb)) - goto failed_mount; - - /* - * Test whether we have more sectors than will fit in sector_t, - * and whether the max offset is addressable by the page cache. - */ - err = generic_check_addressable(sb->s_blocksize_bits, - ext4_blocks_count(es)); - if (err) { - ext4_msg(sb, KERN_ERR, "filesystem" - " too large to mount safely on this system"); + err = ext4_handle_clustersize(sb); + if (err) goto failed_mount; - } - if (ext4_geometry_check(sb, es)) + err = ext4_check_geometry(sb, es); + if (err) goto failed_mount; timer_setup(&sbi->s_err_report, print_daily_error_info, 0); @@ -5226,8 +5292,8 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) if (err) goto failed_mount3; - /* Register extent status tree shrinker */ - if (ext4_es_register_shrinker(sbi)) + err = ext4_es_register_shrinker(sbi); + if (err) goto failed_mount3; sbi->s_stripe = ext4_get_stripe_size(sbi); @@ -5266,10 +5332,13 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) ext4_has_feature_orphan_present(sb) || ext4_has_feature_journal_needs_recovery(sb)); - if (ext4_has_feature_mmp(sb) && !sb_rdonly(sb)) - if (ext4_multi_mount_protect(sb, le64_to_cpu(es->s_mmp_block))) + if (ext4_has_feature_mmp(sb) && !sb_rdonly(sb)) { + err = ext4_multi_mount_protect(sb, le64_to_cpu(es->s_mmp_block)); + if (err) goto failed_mount3a; + } + err = -EINVAL; /* * The first inode we look at is the journal inode. Don't try * root first: it may be modified in the journal! @@ -5321,6 +5390,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) if (!sbi->s_ea_block_cache) { ext4_msg(sb, KERN_ERR, "Failed to create ea_block_cache"); + err = -EINVAL; goto failed_mount_wq; } @@ -5329,6 +5399,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) if (!sbi->s_ea_inode_cache) { ext4_msg(sb, KERN_ERR, "Failed to create ea_inode_cache"); + err = -EINVAL; goto failed_mount_wq; } } @@ -5363,7 +5434,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) alloc_workqueue("ext4-rsv-conversion", WQ_MEM_RECLAIM | WQ_UNBOUND, 1); if (!EXT4_SB(sb)->rsv_conversion_wq) { printk(KERN_ERR "EXT4-fs: failed to create workqueue\n"); - ret = -ENOMEM; + err = -ENOMEM; goto failed_mount4; } @@ -5375,28 +5446,28 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) root = ext4_iget(sb, EXT4_ROOT_INO, EXT4_IGET_SPECIAL); if (IS_ERR(root)) { ext4_msg(sb, KERN_ERR, "get root inode failed"); - ret = PTR_ERR(root); + err = PTR_ERR(root); root = NULL; goto failed_mount4; } if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) { ext4_msg(sb, KERN_ERR, "corrupt root inode, run e2fsck"); iput(root); + err = -EFSCORRUPTED; goto failed_mount4; } sb->s_root = d_make_root(root); if (!sb->s_root) { ext4_msg(sb, KERN_ERR, "get root dentry failed"); - ret = -ENOMEM; + err = -ENOMEM; goto failed_mount4; } - ret = ext4_setup_super(sb, es, sb_rdonly(sb)); - if (ret == -EROFS) { + err = ext4_setup_super(sb, es, sb_rdonly(sb)); + if (err == -EROFS) { sb->s_flags |= SB_RDONLY; - ret = 0; - } else if (ret) + } else if (err) goto failed_mount4a; ext4_set_resv_clusters(sb); @@ -5440,40 +5511,16 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) sbi->s_journal->j_commit_callback = ext4_journal_commit_callback; - block = ext4_count_free_clusters(sb); - ext4_free_blocks_count_set(sbi->s_es, - EXT4_C2B(sbi, block)); - err = percpu_counter_init(&sbi->s_freeclusters_counter, block, - GFP_KERNEL); - if (!err) { - unsigned long freei = ext4_count_free_inodes(sb); - sbi->s_es->s_free_inodes_count = cpu_to_le32(freei); - err = percpu_counter_init(&sbi->s_freeinodes_counter, freei, - GFP_KERNEL); - } - if (!err) - err = percpu_counter_init(&sbi->s_dirs_counter, - ext4_count_dirs(sb), GFP_KERNEL); - if (!err) - err = percpu_counter_init(&sbi->s_dirtyclusters_counter, 0, - GFP_KERNEL); - if (!err) - err = percpu_counter_init(&sbi->s_sra_exceeded_retry_limit, 0, - GFP_KERNEL); - if (!err) - err = percpu_init_rwsem(&sbi->s_writepages_rwsem); - - if (err) { - ext4_msg(sb, KERN_ERR, "insufficient memory"); + err = ext4_percpu_param_init(sbi); + if (err) goto failed_mount6; - } if (ext4_has_feature_flex_bg(sb)) if (!ext4_fill_flex_info(sb)) { ext4_msg(sb, KERN_ERR, "unable to initialize " "flex_bg meta info!"); - ret = -ENOMEM; + err = -ENOMEM; goto failed_mount6; } @@ -5548,20 +5595,8 @@ failed_mount7: ext4_unregister_li_request(sb); failed_mount6: ext4_mb_release(sb); - rcu_read_lock(); - flex_groups = rcu_dereference(sbi->s_flex_groups); - if (flex_groups) { - for (i = 0; i < sbi->s_flex_groups_allocated; i++) - kvfree(flex_groups[i]); - kvfree(flex_groups); - } - rcu_read_unlock(); - percpu_counter_destroy(&sbi->s_freeclusters_counter); - percpu_counter_destroy(&sbi->s_freeinodes_counter); - percpu_counter_destroy(&sbi->s_dirs_counter); - percpu_counter_destroy(&sbi->s_dirtyclusters_counter); - percpu_counter_destroy(&sbi->s_sra_exceeded_retry_limit); - percpu_free_rwsem(&sbi->s_writepages_rwsem); + ext4_flex_groups_free(sbi); + ext4_percpu_param_destroy(sbi); failed_mount5: ext4_ext_release(sb); ext4_release_system_zone(sb); @@ -5602,7 +5637,7 @@ failed_mount: #endif #ifdef CONFIG_QUOTA - for (i = 0; i < EXT4_MAXQUOTAS; i++) + for (unsigned int i = 0; i < EXT4_MAXQUOTAS; i++) kfree(get_qf_name(sb, sbi, i)); #endif fscrypt_free_dummy_policy(&sbi->s_dummy_enc_policy); @@ -5611,7 +5646,7 @@ failed_mount: ext4_blkdev_remove(sbi); out_fail: sb->s_fs_info = NULL; - return err ? err : ret; + return err; } static int ext4_fill_super(struct super_block *sb, struct fs_context *fc) @@ -5649,8 +5684,9 @@ static int ext4_fill_super(struct super_block *sb, struct fs_context *fc) descr = "out journal"; if (___ratelimit(&ext4_mount_msg_ratelimit, "EXT4-fs mount")) - ext4_msg(sb, KERN_INFO, "mounted filesystem %pU with%s. " - "Quota mode: %s.", &sb->s_uuid, descr, + ext4_msg(sb, KERN_INFO, "mounted filesystem %pU %s with%s. " + "Quota mode: %s.", &sb->s_uuid, + sb_rdonly(sb) ? "ro" : "r/w", descr, ext4_quota_mode(sb)); /* Update the s_overhead_clusters if necessary */ @@ -6352,6 +6388,7 @@ static int __ext4_remount(struct fs_context *fc, struct super_block *sb) struct ext4_mount_options old_opts; ext4_group_t g; int err = 0; + int enable_rw = 0; #ifdef CONFIG_QUOTA int enable_quota = 0; int i, j; @@ -6538,13 +6575,13 @@ static int __ext4_remount(struct fs_context *fc, struct super_block *sb) if (err) goto restore_opts; - sb->s_flags &= ~SB_RDONLY; - if (ext4_has_feature_mmp(sb)) - if (ext4_multi_mount_protect(sb, - le64_to_cpu(es->s_mmp_block))) { - err = -EROFS; + enable_rw = 1; + if (ext4_has_feature_mmp(sb)) { + err = ext4_multi_mount_protect(sb, + le64_to_cpu(es->s_mmp_block)); + if (err) goto restore_opts; - } + } #ifdef CONFIG_QUOTA enable_quota = 1; #endif @@ -6581,9 +6618,6 @@ static int __ext4_remount(struct fs_context *fc, struct super_block *sb) } #ifdef CONFIG_QUOTA - /* Release old quota file names */ - for (i = 0; i < EXT4_MAXQUOTAS; i++) - kfree(old_opts.s_qf_names[i]); if (enable_quota) { if (sb_any_quota_suspended(sb)) dquot_resume(sb, -1); @@ -6593,16 +6627,29 @@ static int __ext4_remount(struct fs_context *fc, struct super_block *sb) goto restore_opts; } } + /* Release old quota file names */ + for (i = 0; i < EXT4_MAXQUOTAS; i++) + kfree(old_opts.s_qf_names[i]); #endif if (!test_opt(sb, BLOCK_VALIDITY) && sbi->s_system_blks) ext4_release_system_zone(sb); + if (enable_rw) + sb->s_flags &= ~SB_RDONLY; + if (!ext4_has_feature_mmp(sb) || sb_rdonly(sb)) ext4_stop_mmpd(sbi); return 0; restore_opts: + /* + * If there was a failing r/w to ro transition, we may need to + * re-enable quota + */ + if ((sb->s_flags & SB_RDONLY) && !(old_sb_flags & SB_RDONLY) && + sb_any_quota_suspended(sb)) + dquot_resume(sb, -1); sb->s_flags = old_sb_flags; sbi->s_mount_opt = old_opts.s_mount_opt; sbi->s_mount_opt2 = old_opts.s_mount_opt2; @@ -6643,8 +6690,9 @@ static int ext4_reconfigure(struct fs_context *fc) if (ret < 0) return ret; - ext4_msg(sb, KERN_INFO, "re-mounted %pU. Quota mode: %s.", - &sb->s_uuid, ext4_quota_mode(sb)); + ext4_msg(sb, KERN_INFO, "re-mounted %pU %s. Quota mode: %s.", + &sb->s_uuid, sb_rdonly(sb) ? "ro" : "r/w", + ext4_quota_mode(sb)); return 0; } @@ -6870,23 +6918,6 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id, sb_dqopt(sb)->flags &= ~DQUOT_NOLIST_DIRTY; } - /* - * When we journal data on quota file, we have to flush journal to see - * all updates to the file when we bypass pagecache... - */ - if (EXT4_SB(sb)->s_journal && - ext4_should_journal_data(d_inode(path->dentry))) { - /* - * We don't need to lock updates but journal_flush() could - * otherwise be livelocked... - */ - jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); - err = jbd2_journal_flush(EXT4_SB(sb)->s_journal, 0); - jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); - if (err) - return err; - } - lockdep_set_quota_inode(path->dentry->d_inode, I_DATA_SEM_QUOTA); err = dquot_quota_on(sb, type, format_id, path); if (!err) { diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c index 12d6252e3e22..3042bc605bbf 100644 --- a/fs/ext4/sysfs.c +++ b/fs/ext4/sysfs.c @@ -214,7 +214,6 @@ EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan); EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs); EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request); EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc); -EXT4_RW_ATTR_SBI_UI(mb_max_inode_prealloc, s_mb_max_inode_prealloc); EXT4_RW_ATTR_SBI_UI(mb_max_linear_groups, s_mb_max_linear_groups); EXT4_RW_ATTR_SBI_UI(extent_max_zeroout_kb, s_extent_max_zeroout_kb); EXT4_ATTR(trigger_fs_error, 0200, trigger_test_error); @@ -264,7 +263,6 @@ static struct attribute *ext4_attrs[] = { ATTR_LIST(mb_order2_req), ATTR_LIST(mb_stream_req), ATTR_LIST(mb_group_prealloc), - ATTR_LIST(mb_max_inode_prealloc), ATTR_LIST(mb_max_linear_groups), ATTR_LIST(max_writeback_mb_bump), ATTR_LIST(extent_max_zeroout_kb), diff --git a/fs/ext4/verity.c b/fs/ext4/verity.c index e4da1704438e..2f37e1ea3955 100644 --- a/fs/ext4/verity.c +++ b/fs/ext4/verity.c @@ -42,18 +42,16 @@ static int pagecache_read(struct inode *inode, void *buf, size_t count, loff_t pos) { while (count) { - size_t n = min_t(size_t, count, - PAGE_SIZE - offset_in_page(pos)); - struct page *page; + struct folio *folio; + size_t n; - page = read_mapping_page(inode->i_mapping, pos >> PAGE_SHIFT, + folio = read_mapping_folio(inode->i_mapping, pos >> PAGE_SHIFT, NULL); - if (IS_ERR(page)) - return PTR_ERR(page); - - memcpy_from_page(buf, page, offset_in_page(pos), n); + if (IS_ERR(folio)) + return PTR_ERR(folio); - put_page(page); + n = memcpy_from_file_folio(buf, folio, pos, count); + folio_put(folio); buf += n; pos += n; @@ -363,21 +361,23 @@ static struct page *ext4_read_merkle_tree_page(struct inode *inode, pgoff_t index, unsigned long num_ra_pages) { - struct page *page; + struct folio *folio; index += ext4_verity_metadata_pos(inode) >> PAGE_SHIFT; - page = find_get_page_flags(inode->i_mapping, index, FGP_ACCESSED); - if (!page || !PageUptodate(page)) { + folio = __filemap_get_folio(inode->i_mapping, index, FGP_ACCESSED, 0); + if (IS_ERR(folio) || !folio_test_uptodate(folio)) { DEFINE_READAHEAD(ractl, NULL, NULL, inode->i_mapping, index); - if (page) - put_page(page); + if (!IS_ERR(folio)) + folio_put(folio); else if (num_ra_pages > 1) page_cache_ra_unbounded(&ractl, num_ra_pages, 0); - page = read_mapping_page(inode->i_mapping, index, NULL); + folio = read_mapping_folio(inode->i_mapping, index, NULL); + if (IS_ERR(folio)) + return ERR_CAST(folio); } - return page; + return folio_file_page(folio, index); } static int ext4_write_merkle_tree_block(struct inode *inode, const void *buf, diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 767454d74cd6..dfc2e223bd10 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -88,8 +88,8 @@ static void ext4_xattr_rehash(struct ext4_xattr_header *); static const struct xattr_handler * const ext4_xattr_handler_map[] = { [EXT4_XATTR_INDEX_USER] = &ext4_xattr_user_handler, #ifdef CONFIG_EXT4_FS_POSIX_ACL - [EXT4_XATTR_INDEX_POSIX_ACL_ACCESS] = &posix_acl_access_xattr_handler, - [EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT] = &posix_acl_default_xattr_handler, + [EXT4_XATTR_INDEX_POSIX_ACL_ACCESS] = &nop_posix_acl_access, + [EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT] = &nop_posix_acl_default, #endif [EXT4_XATTR_INDEX_TRUSTED] = &ext4_xattr_trusted_handler, #ifdef CONFIG_EXT4_FS_SECURITY @@ -101,10 +101,6 @@ static const struct xattr_handler * const ext4_xattr_handler_map[] = { const struct xattr_handler *ext4_xattr_handlers[] = { &ext4_xattr_user_handler, &ext4_xattr_trusted_handler, -#ifdef CONFIG_EXT4_FS_POSIX_ACL - &posix_acl_access_xattr_handler, - &posix_acl_default_xattr_handler, -#endif #ifdef CONFIG_EXT4_FS_SECURITY &ext4_xattr_security_handler, #endif @@ -173,14 +169,18 @@ static void ext4_xattr_block_csum_set(struct inode *inode, bh->b_blocknr, BHDR(bh)); } -static inline const struct xattr_handler * -ext4_xattr_handler(int name_index) +static inline const char *ext4_xattr_prefix(int name_index, + struct dentry *dentry) { const struct xattr_handler *handler = NULL; if (name_index > 0 && name_index < ARRAY_SIZE(ext4_xattr_handler_map)) handler = ext4_xattr_handler_map[name_index]; - return handler; + + if (!xattr_handler_can_list(handler, dentry)) + return NULL; + + return xattr_prefix(handler); } static int @@ -740,11 +740,10 @@ ext4_xattr_list_entries(struct dentry *dentry, struct ext4_xattr_entry *entry, size_t rest = buffer_size; for (; !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) { - const struct xattr_handler *handler = - ext4_xattr_handler(entry->e_name_index); + const char *prefix; - if (handler && (!handler->list || handler->list(dentry))) { - const char *prefix = handler->prefix ?: handler->name; + prefix = ext4_xattr_prefix(entry->e_name_index, dentry); + if (prefix) { size_t prefix_len = strlen(prefix); size_t size = prefix_len + entry->e_name_len + 1; @@ -2615,6 +2614,7 @@ static int ext4_xattr_move_to_block(handle_t *handle, struct inode *inode, .in_inode = !!entry->e_value_inum, }; struct ext4_xattr_ibody_header *header = IHDR(inode, raw_inode); + int needs_kvfree = 0; int error; is = kzalloc(sizeof(struct ext4_xattr_ibody_find), GFP_NOFS); @@ -2637,7 +2637,7 @@ static int ext4_xattr_move_to_block(handle_t *handle, struct inode *inode, error = -ENOMEM; goto out; } - + needs_kvfree = 1; error = ext4_xattr_inode_get(inode, entry, buffer, value_size); if (error) goto out; @@ -2676,7 +2676,7 @@ static int ext4_xattr_move_to_block(handle_t *handle, struct inode *inode, out: kfree(b_entry_name); - if (entry->e_value_inum && buffer) + if (needs_kvfree && buffer) kvfree(buffer); if (is) brelse(is->iloc.bh); |