diff options
Diffstat (limited to 'fs')
-rw-r--r-- | fs/btrfs/ctree.h | 1 | ||||
-rw-r--r-- | fs/btrfs/dev-replace.c | 7 | ||||
-rw-r--r-- | fs/btrfs/disk-io.c | 1 | ||||
-rw-r--r-- | fs/btrfs/extent_io.c | 44 | ||||
-rw-r--r-- | fs/btrfs/inode.c | 18 | ||||
-rw-r--r-- | fs/btrfs/scrub.c | 26 | ||||
-rw-r--r-- | fs/btrfs/tree-log.c | 1 | ||||
-rw-r--r-- | fs/btrfs/volumes.h | 3 | ||||
-rw-r--r-- | fs/btrfs/zoned.h | 4 | ||||
-rw-r--r-- | fs/ceph/caps.c | 7 | ||||
-rw-r--r-- | fs/ceph/mds_client.c | 6 | ||||
-rw-r--r-- | fs/f2fs/checkpoint.c | 6 | ||||
-rw-r--r-- | fs/f2fs/data.c | 33 | ||||
-rw-r--r-- | fs/f2fs/f2fs.h | 9 | ||||
-rw-r--r-- | fs/f2fs/inode.c | 3 | ||||
-rw-r--r-- | fs/f2fs/segment.c | 95 | ||||
-rw-r--r-- | fs/f2fs/super.c | 32 | ||||
-rw-r--r-- | fs/gfs2/file.c | 6 | ||||
-rw-r--r-- | fs/io_uring.c | 7 | ||||
-rw-r--r-- | fs/xfs/xfs_buf.c | 6 | ||||
-rw-r--r-- | fs/xfs/xfs_buf.h | 42 | ||||
-rw-r--r-- | fs/xfs/xfs_inode.c | 24 | ||||
-rw-r--r-- | fs/xfs/xfs_trans.h | 2 | ||||
-rw-r--r-- | fs/zonefs/super.c | 46 |
24 files changed, 196 insertions, 233 deletions
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index b7631b88426e..077c95e9baa5 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1060,6 +1060,7 @@ struct btrfs_fs_info { */ spinlock_t relocation_bg_lock; u64 data_reloc_bg; + struct mutex zoned_data_reloc_io_lock; u64 nr_global_roots; diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 71fd99b48283..f26202621989 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -734,7 +734,12 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1); - /* Commit dev_replace state and reserve 1 item for it. */ + /* + * Commit dev_replace state and reserve 1 item for it. + * This is crucial to ensure we won't miss copying extents for new block + * groups that are allocated after we started the device replace, and + * must be done after setting up the device replace state. + */ trans = btrfs_start_transaction(root, 1); if (IS_ERR(trans)) { ret = PTR_ERR(trans); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 126f244cdf88..ed8e288cc369 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3157,6 +3157,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) mutex_init(&fs_info->reloc_mutex); mutex_init(&fs_info->delalloc_root_mutex); mutex_init(&fs_info->zoned_meta_io_lock); + mutex_init(&fs_info->zoned_data_reloc_io_lock); seqlock_init(&fs_info->profiles_lock); INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 724e8fe06aa0..33c19f51d79b 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2658,6 +2658,7 @@ int btrfs_repair_one_sector(struct inode *inode, repair_bio = btrfs_bio_alloc(1); repair_bbio = btrfs_bio(repair_bio); + repair_bbio->file_offset = start; repair_bio->bi_opf = REQ_OP_READ; repair_bio->bi_end_io = failed_bio->bi_end_io; repair_bio->bi_iter.bi_sector = failrec->logical >> 9; @@ -3333,24 +3334,37 @@ static int alloc_new_bio(struct btrfs_inode *inode, ret = calc_bio_boundaries(bio_ctrl, inode, file_offset); if (ret < 0) goto error; - if (wbc) { - struct block_device *bdev; - bdev = fs_info->fs_devices->latest_dev->bdev; - bio_set_dev(bio, bdev); - wbc_init_bio(wbc, bio); - } - if (bio_op(bio) == REQ_OP_ZONE_APPEND) { - struct btrfs_device *device; + if (wbc) { + /* + * For Zone append we need the correct block_device that we are + * going to write to set in the bio to be able to respect the + * hardware limitation. Look it up here: + */ + if (bio_op(bio) == REQ_OP_ZONE_APPEND) { + struct btrfs_device *dev; + + dev = btrfs_zoned_get_device(fs_info, disk_bytenr, + fs_info->sectorsize); + if (IS_ERR(dev)) { + ret = PTR_ERR(dev); + goto error; + } - device = btrfs_zoned_get_device(fs_info, disk_bytenr, - fs_info->sectorsize); - if (IS_ERR(device)) { - ret = PTR_ERR(device); - goto error; + bio_set_dev(bio, dev->bdev); + } else { + /* + * Otherwise pick the last added device to support + * cgroup writeback. For multi-device file systems this + * means blk-cgroup policies have to always be set on the + * last added/replaced device. This is a bit odd but has + * been like that for a long time. + */ + bio_set_dev(bio, fs_info->fs_devices->latest_dev->bdev); } - - btrfs_bio(bio)->device = device; + wbc_init_bio(wbc, bio); + } else { + ASSERT(bio_op(bio) != REQ_OP_ZONE_APPEND); } return 0; error: diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 5082b9c70f8c..1c8a43ecfb9f 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7810,8 +7810,6 @@ static blk_status_t btrfs_check_read_dio_bio(struct btrfs_dio_private *dip, const bool csum = !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM); struct bio_vec bvec; struct bvec_iter iter; - const u64 orig_file_offset = dip->file_offset; - u64 start = orig_file_offset; u32 bio_offset = 0; blk_status_t err = BLK_STS_OK; @@ -7821,6 +7819,8 @@ static blk_status_t btrfs_check_read_dio_bio(struct btrfs_dio_private *dip, nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec.bv_len); pgoff = bvec.bv_offset; for (i = 0; i < nr_sectors; i++) { + u64 start = bbio->file_offset + bio_offset; + ASSERT(pgoff < PAGE_SIZE); if (uptodate && (!csum || !check_data_csum(inode, bbio, @@ -7833,17 +7833,13 @@ static blk_status_t btrfs_check_read_dio_bio(struct btrfs_dio_private *dip, } else { int ret; - ASSERT((start - orig_file_offset) < UINT_MAX); - ret = btrfs_repair_one_sector(inode, - &bbio->bio, - start - orig_file_offset, - bvec.bv_page, pgoff, + ret = btrfs_repair_one_sector(inode, &bbio->bio, + bio_offset, bvec.bv_page, pgoff, start, bbio->mirror_num, submit_dio_repair_bio); if (ret) err = errno_to_blk_status(ret); } - start += sectorsize; ASSERT(bio_offset + sectorsize > bio_offset); bio_offset += sectorsize; pgoff += sectorsize; @@ -7870,6 +7866,7 @@ static blk_status_t btrfs_submit_bio_start_direct_io(struct inode *inode, static void btrfs_end_dio_bio(struct bio *bio) { struct btrfs_dio_private *dip = bio->bi_private; + struct btrfs_bio *bbio = btrfs_bio(bio); blk_status_t err = bio->bi_status; if (err) @@ -7880,12 +7877,12 @@ static void btrfs_end_dio_bio(struct bio *bio) bio->bi_iter.bi_size, err); if (bio_op(bio) == REQ_OP_READ) - err = btrfs_check_read_dio_bio(dip, btrfs_bio(bio), !err); + err = btrfs_check_read_dio_bio(dip, bbio, !err); if (err) dip->dio_bio->bi_status = err; - btrfs_record_physical_zoned(dip->inode, dip->file_offset, bio); + btrfs_record_physical_zoned(dip->inode, bbio->file_offset, bio); bio_put(bio); btrfs_dio_private_put(dip); @@ -8046,6 +8043,7 @@ static void btrfs_submit_direct(const struct iomap_iter *iter, bio = btrfs_bio_clone_partial(dio_bio, clone_offset, clone_len); bio->bi_private = dip; bio->bi_end_io = btrfs_end_dio_bio; + btrfs_bio(bio)->file_offset = file_offset; if (bio_op(bio) == REQ_OP_ZONE_APPEND) { status = extract_ordered_extent(BTRFS_I(inode), bio, diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 11089568b287..8cd713d37ad2 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -3699,6 +3699,31 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, if (!cache) goto skip; + ASSERT(cache->start <= chunk_offset); + /* + * We are using the commit root to search for device extents, so + * that means we could have found a device extent item from a + * block group that was deleted in the current transaction. The + * logical start offset of the deleted block group, stored at + * @chunk_offset, might be part of the logical address range of + * a new block group (which uses different physical extents). + * In this case btrfs_lookup_block_group() has returned the new + * block group, and its start address is less than @chunk_offset. + * + * We skip such new block groups, because it's pointless to + * process them, as we won't find their extents because we search + * for them using the commit root of the extent tree. For a device + * replace it's also fine to skip it, we won't miss copying them + * to the target device because we have the write duplication + * setup through the regular write path (by btrfs_map_block()), + * and we have committed a transaction when we started the device + * replace, right after setting up the device replace state. + */ + if (cache->start < chunk_offset) { + btrfs_put_block_group(cache); + goto skip; + } + if (sctx->is_dev_replace && btrfs_is_zoned(fs_info)) { spin_lock(&cache->lock); if (!cache->to_copy) { @@ -3822,7 +3847,6 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, dev_replace->item_needs_writeback = 1; up_write(&dev_replace->rwsem); - ASSERT(cache->start == chunk_offset); ret = scrub_chunk(sctx, cache, scrub_dev, found_key.offset, dev_extent_len); diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 571dae8ad65e..09e4f1a04e6f 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3188,6 +3188,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, ret = btrfs_alloc_log_tree_node(trans, log_root_tree); if (ret) { mutex_unlock(&fs_info->tree_root->log_mutex); + blk_finish_plug(&plug); goto out; } } diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index bd297f23d19e..f3e28f11cfb6 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -328,6 +328,9 @@ struct btrfs_fs_devices { struct btrfs_bio { unsigned int mirror_num; + /* for direct I/O */ + u64 file_offset; + /* @device is for stripe IO submission. */ struct btrfs_device *device; u8 *csum; diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h index cbf016a7bb5d..6dee76248cb4 100644 --- a/fs/btrfs/zoned.h +++ b/fs/btrfs/zoned.h @@ -359,7 +359,7 @@ static inline void btrfs_zoned_data_reloc_lock(struct btrfs_inode *inode) struct btrfs_root *root = inode->root; if (btrfs_is_data_reloc_root(root) && btrfs_is_zoned(root->fs_info)) - btrfs_inode_lock(&inode->vfs_inode, 0); + mutex_lock(&root->fs_info->zoned_data_reloc_io_lock); } static inline void btrfs_zoned_data_reloc_unlock(struct btrfs_inode *inode) @@ -367,7 +367,7 @@ static inline void btrfs_zoned_data_reloc_unlock(struct btrfs_inode *inode) struct btrfs_root *root = inode->root; if (btrfs_is_data_reloc_root(root) && btrfs_is_zoned(root->fs_info)) - btrfs_inode_unlock(&inode->vfs_inode, 0); + mutex_unlock(&root->fs_info->zoned_data_reloc_io_lock); } #endif diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index f1ad6884d4da..5c14ef04e474 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -2274,6 +2274,8 @@ retry: list_for_each_entry(req, &ci->i_unsafe_dirops, r_unsafe_dir_item) { s = req->r_session; + if (!s) + continue; if (unlikely(s->s_mds >= max_sessions)) { spin_unlock(&ci->i_unsafe_lock); for (i = 0; i < max_sessions; i++) { @@ -2294,6 +2296,8 @@ retry: list_for_each_entry(req, &ci->i_unsafe_iops, r_unsafe_target_item) { s = req->r_session; + if (!s) + continue; if (unlikely(s->s_mds >= max_sessions)) { spin_unlock(&ci->i_unsafe_lock); for (i = 0; i < max_sessions; i++) { @@ -3870,6 +3874,7 @@ static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex, dout("handle_cap_export inode %p ci %p mds%d mseq %d target %d\n", inode, ci, mds, mseq, target); retry: + down_read(&mdsc->snap_rwsem); spin_lock(&ci->i_ceph_lock); cap = __get_cap_for_mds(ci, mds); if (!cap || cap->cap_id != le64_to_cpu(ex->cap_id)) @@ -3933,6 +3938,7 @@ retry: } spin_unlock(&ci->i_ceph_lock); + up_read(&mdsc->snap_rwsem); mutex_unlock(&session->s_mutex); /* open target session */ @@ -3958,6 +3964,7 @@ retry: out_unlock: spin_unlock(&ci->i_ceph_lock); + up_read(&mdsc->snap_rwsem); mutex_unlock(&session->s_mutex); if (tsession) { mutex_unlock(&tsession->s_mutex); diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index fa38c013126d..00c3de177dd6 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -4434,8 +4434,6 @@ static void maybe_recover_session(struct ceph_mds_client *mdsc) bool check_session_state(struct ceph_mds_session *s) { - struct ceph_fs_client *fsc = s->s_mdsc->fsc; - switch (s->s_state) { case CEPH_MDS_SESSION_OPEN: if (s->s_ttl && time_after(jiffies, s->s_ttl)) { @@ -4444,10 +4442,6 @@ bool check_session_state(struct ceph_mds_session *s) } break; case CEPH_MDS_SESSION_CLOSING: - /* Should never reach this when not force unmounting */ - WARN_ON_ONCE(s->s_ttl && - READ_ONCE(fsc->mount_state) != CEPH_MOUNT_SHUTDOWN); - fallthrough; case CEPH_MDS_SESSION_NEW: case CEPH_MDS_SESSION_RESTARTING: case CEPH_MDS_SESSION_CLOSED: diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index f5366feea82d..909085a78f9c 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -98,9 +98,9 @@ repeat: } if (unlikely(!PageUptodate(page))) { - if (page->index == sbi->metapage_eio_ofs && - sbi->metapage_eio_cnt++ == MAX_RETRY_META_PAGE_EIO) { - set_ckpt_flags(sbi, CP_ERROR_FLAG); + if (page->index == sbi->metapage_eio_ofs) { + if (sbi->metapage_eio_cnt++ == MAX_RETRY_META_PAGE_EIO) + set_ckpt_flags(sbi, CP_ERROR_FLAG); } else { sbi->metapage_eio_ofs = page->index; sbi->metapage_eio_cnt = 0; diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 8e0c2e773c8d..9a1a526f2092 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -388,11 +388,23 @@ int f2fs_target_device_index(struct f2fs_sb_info *sbi, block_t blkaddr) return 0; } -static void __attach_io_flag(struct f2fs_io_info *fio, unsigned int io_flag) +static unsigned int f2fs_io_flags(struct f2fs_io_info *fio) { unsigned int temp_mask = (1 << NR_TEMP_TYPE) - 1; - unsigned int fua_flag = io_flag & temp_mask; - unsigned int meta_flag = (io_flag >> NR_TEMP_TYPE) & temp_mask; + unsigned int fua_flag, meta_flag, io_flag; + unsigned int op_flags = 0; + + if (fio->op != REQ_OP_WRITE) + return 0; + if (fio->type == DATA) + io_flag = fio->sbi->data_io_flag; + else if (fio->type == NODE) + io_flag = fio->sbi->node_io_flag; + else + return 0; + + fua_flag = io_flag & temp_mask; + meta_flag = (io_flag >> NR_TEMP_TYPE) & temp_mask; /* * data/node io flag bits per temp: @@ -401,9 +413,10 @@ static void __attach_io_flag(struct f2fs_io_info *fio, unsigned int io_flag) * Cold | Warm | Hot | Cold | Warm | Hot | */ if ((1 << fio->temp) & meta_flag) - fio->op_flags |= REQ_META; + op_flags |= REQ_META; if ((1 << fio->temp) & fua_flag) - fio->op_flags |= REQ_FUA; + op_flags |= REQ_FUA; + return op_flags; } static struct bio *__bio_alloc(struct f2fs_io_info *fio, int npages) @@ -413,14 +426,10 @@ static struct bio *__bio_alloc(struct f2fs_io_info *fio, int npages) sector_t sector; struct bio *bio; - if (fio->type == DATA) - __attach_io_flag(fio, sbi->data_io_flag); - else if (fio->type == NODE) - __attach_io_flag(fio, sbi->node_io_flag); - bdev = f2fs_target_device(sbi, fio->new_blkaddr, §or); - bio = bio_alloc_bioset(bdev, npages, fio->op | fio->op_flags, GFP_NOIO, - &f2fs_bioset); + bio = bio_alloc_bioset(bdev, npages, + fio->op | fio->op_flags | f2fs_io_flags(fio), + GFP_NOIO, &f2fs_bioset); bio->bi_iter.bi_sector = sector; if (is_read_io(fio->op)) { bio->bi_end_io = f2fs_read_end_io; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index cd1e65bcf0b0..8c570de21ed5 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -154,7 +154,6 @@ struct f2fs_mount_info { int s_jquota_fmt; /* Format of quota to use */ #endif /* For which write hints are passed down to block layer */ - int whint_mode; int alloc_mode; /* segment allocation policy */ int fsync_mode; /* fsync policy */ int fs_mode; /* fs mode: LFS or ADAPTIVE */ @@ -1334,12 +1333,6 @@ enum { }; enum { - WHINT_MODE_OFF, /* not pass down write hints */ - WHINT_MODE_USER, /* try to pass down hints given by users */ - WHINT_MODE_FS, /* pass down hints with F2FS policy */ -}; - -enum { ALLOC_MODE_DEFAULT, /* stay default */ ALLOC_MODE_REUSE, /* reuse segments as much as possible */ }; @@ -3657,8 +3650,6 @@ void f2fs_destroy_segment_manager(struct f2fs_sb_info *sbi); int __init f2fs_create_segment_manager_caches(void); void f2fs_destroy_segment_manager_caches(void); int f2fs_rw_hint_to_seg_type(enum rw_hint hint); -enum rw_hint f2fs_io_type_to_rw_hint(struct f2fs_sb_info *sbi, - enum page_type type, enum temp_type temp); unsigned int f2fs_usable_segs_in_sec(struct f2fs_sb_info *sbi, unsigned int segno); unsigned int f2fs_usable_blks_in_seg(struct f2fs_sb_info *sbi, diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 71f232dcf3c2..83639238a1fe 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -550,7 +550,8 @@ make_now: } f2fs_set_inode_flags(inode); - if (file_should_truncate(inode)) { + if (file_should_truncate(inode) && + !is_sbi_flag_set(sbi, SBI_POR_DOING)) { ret = f2fs_truncate(inode); if (ret) goto bad_inode; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 22dfeb991529..bd9731cdec56 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -3243,101 +3243,6 @@ int f2fs_rw_hint_to_seg_type(enum rw_hint hint) } } -/* This returns write hints for each segment type. This hints will be - * passed down to block layer. There are mapping tables which depend on - * the mount option 'whint_mode'. - * - * 1) whint_mode=off. F2FS only passes down WRITE_LIFE_NOT_SET. - * - * 2) whint_mode=user-based. F2FS tries to pass down hints given by users. - * - * User F2FS Block - * ---- ---- ----- - * META WRITE_LIFE_NOT_SET - * HOT_NODE " - * WARM_NODE " - * COLD_NODE " - * ioctl(COLD) COLD_DATA WRITE_LIFE_EXTREME - * extension list " " - * - * -- buffered io - * WRITE_LIFE_EXTREME COLD_DATA WRITE_LIFE_EXTREME - * WRITE_LIFE_SHORT HOT_DATA WRITE_LIFE_SHORT - * WRITE_LIFE_NOT_SET WARM_DATA WRITE_LIFE_NOT_SET - * WRITE_LIFE_NONE " " - * WRITE_LIFE_MEDIUM " " - * WRITE_LIFE_LONG " " - * - * -- direct io - * WRITE_LIFE_EXTREME COLD_DATA WRITE_LIFE_EXTREME - * WRITE_LIFE_SHORT HOT_DATA WRITE_LIFE_SHORT - * WRITE_LIFE_NOT_SET WARM_DATA WRITE_LIFE_NOT_SET - * WRITE_LIFE_NONE " WRITE_LIFE_NONE - * WRITE_LIFE_MEDIUM " WRITE_LIFE_MEDIUM - * WRITE_LIFE_LONG " WRITE_LIFE_LONG - * - * 3) whint_mode=fs-based. F2FS passes down hints with its policy. - * - * User F2FS Block - * ---- ---- ----- - * META WRITE_LIFE_MEDIUM; - * HOT_NODE WRITE_LIFE_NOT_SET - * WARM_NODE " - * COLD_NODE WRITE_LIFE_NONE - * ioctl(COLD) COLD_DATA WRITE_LIFE_EXTREME - * extension list " " - * - * -- buffered io - * WRITE_LIFE_EXTREME COLD_DATA WRITE_LIFE_EXTREME - * WRITE_LIFE_SHORT HOT_DATA WRITE_LIFE_SHORT - * WRITE_LIFE_NOT_SET WARM_DATA WRITE_LIFE_LONG - * WRITE_LIFE_NONE " " - * WRITE_LIFE_MEDIUM " " - * WRITE_LIFE_LONG " " - * - * -- direct io - * WRITE_LIFE_EXTREME COLD_DATA WRITE_LIFE_EXTREME - * WRITE_LIFE_SHORT HOT_DATA WRITE_LIFE_SHORT - * WRITE_LIFE_NOT_SET WARM_DATA WRITE_LIFE_NOT_SET - * WRITE_LIFE_NONE " WRITE_LIFE_NONE - * WRITE_LIFE_MEDIUM " WRITE_LIFE_MEDIUM - * WRITE_LIFE_LONG " WRITE_LIFE_LONG - */ - -enum rw_hint f2fs_io_type_to_rw_hint(struct f2fs_sb_info *sbi, - enum page_type type, enum temp_type temp) -{ - if (F2FS_OPTION(sbi).whint_mode == WHINT_MODE_USER) { - if (type == DATA) { - if (temp == WARM) - return WRITE_LIFE_NOT_SET; - else if (temp == HOT) - return WRITE_LIFE_SHORT; - else if (temp == COLD) - return WRITE_LIFE_EXTREME; - } else { - return WRITE_LIFE_NOT_SET; - } - } else if (F2FS_OPTION(sbi).whint_mode == WHINT_MODE_FS) { - if (type == DATA) { - if (temp == WARM) - return WRITE_LIFE_LONG; - else if (temp == HOT) - return WRITE_LIFE_SHORT; - else if (temp == COLD) - return WRITE_LIFE_EXTREME; - } else if (type == NODE) { - if (temp == WARM || temp == HOT) - return WRITE_LIFE_NOT_SET; - else if (temp == COLD) - return WRITE_LIFE_NONE; - } else if (type == META) { - return WRITE_LIFE_MEDIUM; - } - } - return WRITE_LIFE_NOT_SET; -} - static int __get_segment_type_2(struct f2fs_io_info *fio) { if (fio->type == DATA) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index ea939db18f88..4368f90571bd 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -138,7 +138,6 @@ enum { Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, - Opt_whint, Opt_alloc, Opt_fsync, Opt_test_dummy_encryption, @@ -214,7 +213,6 @@ static match_table_t f2fs_tokens = { {Opt_jqfmt_vfsold, "jqfmt=vfsold"}, {Opt_jqfmt_vfsv0, "jqfmt=vfsv0"}, {Opt_jqfmt_vfsv1, "jqfmt=vfsv1"}, - {Opt_whint, "whint_mode=%s"}, {Opt_alloc, "alloc_mode=%s"}, {Opt_fsync, "fsync_mode=%s"}, {Opt_test_dummy_encryption, "test_dummy_encryption=%s"}, @@ -975,22 +973,6 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) f2fs_info(sbi, "quota operations not supported"); break; #endif - case Opt_whint: - name = match_strdup(&args[0]); - if (!name) - return -ENOMEM; - if (!strcmp(name, "user-based")) { - F2FS_OPTION(sbi).whint_mode = WHINT_MODE_USER; - } else if (!strcmp(name, "off")) { - F2FS_OPTION(sbi).whint_mode = WHINT_MODE_OFF; - } else if (!strcmp(name, "fs-based")) { - F2FS_OPTION(sbi).whint_mode = WHINT_MODE_FS; - } else { - kfree(name); - return -EINVAL; - } - kfree(name); - break; case Opt_alloc: name = match_strdup(&args[0]); if (!name) @@ -1328,12 +1310,6 @@ default_check: return -EINVAL; } - /* Not pass down write hints if the number of active logs is lesser - * than NR_CURSEG_PERSIST_TYPE. - */ - if (F2FS_OPTION(sbi).active_logs != NR_CURSEG_PERSIST_TYPE) - F2FS_OPTION(sbi).whint_mode = WHINT_MODE_OFF; - if (f2fs_sb_has_readonly(sbi) && !f2fs_readonly(sbi->sb)) { f2fs_err(sbi, "Allow to mount readonly mode only"); return -EROFS; @@ -1978,10 +1954,6 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) seq_puts(seq, ",prjquota"); #endif f2fs_show_quota_options(seq, sbi->sb); - if (F2FS_OPTION(sbi).whint_mode == WHINT_MODE_USER) - seq_printf(seq, ",whint_mode=%s", "user-based"); - else if (F2FS_OPTION(sbi).whint_mode == WHINT_MODE_FS) - seq_printf(seq, ",whint_mode=%s", "fs-based"); fscrypt_show_test_dummy_encryption(seq, ',', sbi->sb); @@ -2033,7 +2005,6 @@ static void default_options(struct f2fs_sb_info *sbi) F2FS_OPTION(sbi).active_logs = NR_CURSEG_PERSIST_TYPE; F2FS_OPTION(sbi).inline_xattr_size = DEFAULT_INLINE_XATTR_ADDRS; - F2FS_OPTION(sbi).whint_mode = WHINT_MODE_OFF; F2FS_OPTION(sbi).alloc_mode = ALLOC_MODE_DEFAULT; F2FS_OPTION(sbi).fsync_mode = FSYNC_MODE_POSIX; F2FS_OPTION(sbi).s_resuid = make_kuid(&init_user_ns, F2FS_DEF_RESUID); @@ -2314,8 +2285,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) need_stop_gc = true; } - if (*flags & SB_RDONLY || - F2FS_OPTION(sbi).whint_mode != org_mount_opt.whint_mode) { + if (*flags & SB_RDONLY) { sync_inodes_sb(sb); set_sbi_flag(sbi, SBI_IS_DIRTY); diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 22b41acfbbc3..48f01323c37c 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -899,10 +899,10 @@ retry: ret = gfs2_glock_nq(gh); if (ret) goto out_uninit; -retry_under_glock: /* Silently fall back to buffered I/O when writing beyond EOF */ if (iocb->ki_pos + iov_iter_count(from) > i_size_read(&ip->i_inode)) goto out; +retry_under_glock: from->nofault = true; ret = iomap_dio_rw(iocb, from, &gfs2_iomap_ops, NULL, @@ -991,8 +991,6 @@ retry_under_glock: if (leftover != window_size) { if (gfs2_holder_queued(&gh)) goto retry_under_glock; - if (written) - goto out_uninit; goto retry; } } @@ -1069,8 +1067,6 @@ retry_under_glock: from->count = min(from->count, window_size - leftover); if (gfs2_holder_queued(gh)) goto retry_under_glock; - if (read && !(iocb->ki_flags & IOCB_DIRECT)) - goto out_uninit; goto retry; } } diff --git a/fs/io_uring.c b/fs/io_uring.c index dfebbf3a272a..5a0388bac42c 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -4012,6 +4012,7 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode) if (!(kiocb->ki_flags & IOCB_DIRECT) || !file->f_op->iopoll) return -EOPNOTSUPP; + kiocb->private = NULL; kiocb->ki_flags |= IOCB_HIPRI | IOCB_ALLOC_CACHE; kiocb->ki_complete = io_complete_rw_iopoll; req->iopoll_completed = 0; @@ -5688,6 +5689,8 @@ static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; + if (unlikely(sqe->addr2 || sqe->file_index)) + return -EINVAL; sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr)); sr->len = READ_ONCE(sqe->len); @@ -5921,6 +5924,8 @@ static int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; + if (unlikely(sqe->addr2 || sqe->file_index)) + return -EINVAL; sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr)); sr->len = READ_ONCE(sqe->len); @@ -11210,7 +11215,7 @@ static int io_ringfd_unregister(struct io_ring_ctx *ctx, void __user *__arg, ret = -EFAULT; break; } - if (reg.resv || reg.offset >= IO_RINGFD_REG_MAX) { + if (reg.resv || reg.data || reg.offset >= IO_RINGFD_REG_MAX) { ret = -EINVAL; break; } diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index e1afb9e503e1..bf4e60871068 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -406,7 +406,7 @@ xfs_buf_alloc_pages( STATIC int _xfs_buf_map_pages( struct xfs_buf *bp, - uint flags) + xfs_buf_flags_t flags) { ASSERT(bp->b_flags & _XBF_PAGES); if (bp->b_page_count == 1) { @@ -868,7 +868,7 @@ xfs_buf_read_uncached( struct xfs_buftarg *target, xfs_daddr_t daddr, size_t numblks, - int flags, + xfs_buf_flags_t flags, struct xfs_buf **bpp, const struct xfs_buf_ops *ops) { @@ -903,7 +903,7 @@ int xfs_buf_get_uncached( struct xfs_buftarg *target, size_t numblks, - int flags, + xfs_buf_flags_t flags, struct xfs_buf **bpp) { int error; diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index edcb6254fa6a..1ee3056ff9cf 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -22,28 +22,28 @@ struct xfs_buf; #define XFS_BUF_DADDR_NULL ((xfs_daddr_t) (-1LL)) -#define XBF_READ (1 << 0) /* buffer intended for reading from device */ -#define XBF_WRITE (1 << 1) /* buffer intended for writing to device */ -#define XBF_READ_AHEAD (1 << 2) /* asynchronous read-ahead */ -#define XBF_NO_IOACCT (1 << 3) /* bypass I/O accounting (non-LRU bufs) */ -#define XBF_ASYNC (1 << 4) /* initiator will not wait for completion */ -#define XBF_DONE (1 << 5) /* all pages in the buffer uptodate */ -#define XBF_STALE (1 << 6) /* buffer has been staled, do not find it */ -#define XBF_WRITE_FAIL (1 << 7) /* async writes have failed on this buffer */ +#define XBF_READ (1u << 0) /* buffer intended for reading from device */ +#define XBF_WRITE (1u << 1) /* buffer intended for writing to device */ +#define XBF_READ_AHEAD (1u << 2) /* asynchronous read-ahead */ +#define XBF_NO_IOACCT (1u << 3) /* bypass I/O accounting (non-LRU bufs) */ +#define XBF_ASYNC (1u << 4) /* initiator will not wait for completion */ +#define XBF_DONE (1u << 5) /* all pages in the buffer uptodate */ +#define XBF_STALE (1u << 6) /* buffer has been staled, do not find it */ +#define XBF_WRITE_FAIL (1u << 7) /* async writes have failed on this buffer */ /* buffer type flags for write callbacks */ -#define _XBF_INODES (1 << 16)/* inode buffer */ -#define _XBF_DQUOTS (1 << 17)/* dquot buffer */ -#define _XBF_LOGRECOVERY (1 << 18)/* log recovery buffer */ +#define _XBF_INODES (1u << 16)/* inode buffer */ +#define _XBF_DQUOTS (1u << 17)/* dquot buffer */ +#define _XBF_LOGRECOVERY (1u << 18)/* log recovery buffer */ /* flags used only internally */ -#define _XBF_PAGES (1 << 20)/* backed by refcounted pages */ -#define _XBF_KMEM (1 << 21)/* backed by heap memory */ -#define _XBF_DELWRI_Q (1 << 22)/* buffer on a delwri queue */ +#define _XBF_PAGES (1u << 20)/* backed by refcounted pages */ +#define _XBF_KMEM (1u << 21)/* backed by heap memory */ +#define _XBF_DELWRI_Q (1u << 22)/* buffer on a delwri queue */ /* flags used only as arguments to access routines */ -#define XBF_TRYLOCK (1 << 30)/* lock requested, but do not wait */ -#define XBF_UNMAPPED (1 << 31)/* do not map the buffer */ +#define XBF_TRYLOCK (1u << 30)/* lock requested, but do not wait */ +#define XBF_UNMAPPED (1u << 31)/* do not map the buffer */ typedef unsigned int xfs_buf_flags_t; @@ -58,7 +58,7 @@ typedef unsigned int xfs_buf_flags_t; { XBF_WRITE_FAIL, "WRITE_FAIL" }, \ { _XBF_INODES, "INODES" }, \ { _XBF_DQUOTS, "DQUOTS" }, \ - { _XBF_LOGRECOVERY, "LOG_RECOVERY" }, \ + { _XBF_LOGRECOVERY, "LOG_RECOVERY" }, \ { _XBF_PAGES, "PAGES" }, \ { _XBF_KMEM, "KMEM" }, \ { _XBF_DELWRI_Q, "DELWRI_Q" }, \ @@ -247,11 +247,11 @@ xfs_buf_readahead( return xfs_buf_readahead_map(target, &map, 1, ops); } -int xfs_buf_get_uncached(struct xfs_buftarg *target, size_t numblks, int flags, - struct xfs_buf **bpp); +int xfs_buf_get_uncached(struct xfs_buftarg *target, size_t numblks, + xfs_buf_flags_t flags, struct xfs_buf **bpp); int xfs_buf_read_uncached(struct xfs_buftarg *target, xfs_daddr_t daddr, - size_t numblks, int flags, struct xfs_buf **bpp, - const struct xfs_buf_ops *ops); + size_t numblks, xfs_buf_flags_t flags, struct xfs_buf **bpp, + const struct xfs_buf_ops *ops); int _xfs_buf_read(struct xfs_buf *bp, xfs_buf_flags_t flags); void xfs_buf_hold(struct xfs_buf *bp); diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 9de6205fe134..39ae53efb3ab 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -2594,14 +2594,13 @@ xfs_ifree_cluster( } /* - * This is called to return an inode to the inode free list. - * The inode should already be truncated to 0 length and have - * no pages associated with it. This routine also assumes that - * the inode is already a part of the transaction. + * This is called to return an inode to the inode free list. The inode should + * already be truncated to 0 length and have no pages associated with it. This + * routine also assumes that the inode is already a part of the transaction. * - * The on-disk copy of the inode will have been added to the list - * of unlinked inodes in the AGI. We need to remove the inode from - * that list atomically with respect to freeing it here. + * The on-disk copy of the inode will have been added to the list of unlinked + * inodes in the AGI. We need to remove the inode from that list atomically with + * respect to freeing it here. */ int xfs_ifree( @@ -2623,13 +2622,16 @@ xfs_ifree( pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); /* - * Pull the on-disk inode from the AGI unlinked list. + * Free the inode first so that we guarantee that the AGI lock is going + * to be taken before we remove the inode from the unlinked list. This + * makes the AGI lock -> unlinked list modification order the same as + * used in O_TMPFILE creation. */ - error = xfs_iunlink_remove(tp, pag, ip); + error = xfs_difree(tp, pag, ip->i_ino, &xic); if (error) - goto out; + return error; - error = xfs_difree(tp, pag, ip->i_ino, &xic); + error = xfs_iunlink_remove(tp, pag, ip); if (error) goto out; diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index de177842b951..0c82673238f4 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -175,7 +175,7 @@ xfs_trans_get_buf( struct xfs_buftarg *target, xfs_daddr_t blkno, int numblks, - uint flags, + xfs_buf_flags_t flags, struct xfs_buf **bpp) { DEFINE_SINGLE_BUF_MAP(map, blkno, numblks); diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c index 3614c7834007..e20e7c841489 100644 --- a/fs/zonefs/super.c +++ b/fs/zonefs/super.c @@ -35,6 +35,17 @@ static inline int zonefs_zone_mgmt(struct inode *inode, lockdep_assert_held(&zi->i_truncate_mutex); + /* + * With ZNS drives, closing an explicitly open zone that has not been + * written will change the zone state to "closed", that is, the zone + * will remain active. Since this can then cause failure of explicit + * open operation on other zones if the drive active zone resources + * are exceeded, make sure that the zone does not remain active by + * resetting it. + */ + if (op == REQ_OP_ZONE_CLOSE && !zi->i_wpoffset) + op = REQ_OP_ZONE_RESET; + trace_zonefs_zone_mgmt(inode, op); ret = blkdev_zone_mgmt(inode->i_sb->s_bdev, op, zi->i_zsector, zi->i_zone_size >> SECTOR_SHIFT, GFP_NOFS); @@ -1142,6 +1153,7 @@ static struct inode *zonefs_alloc_inode(struct super_block *sb) inode_init_once(&zi->i_vnode); mutex_init(&zi->i_truncate_mutex); zi->i_wr_refcnt = 0; + zi->i_flags = 0; return &zi->i_vnode; } @@ -1293,12 +1305,13 @@ static void zonefs_init_dir_inode(struct inode *parent, struct inode *inode, inc_nlink(parent); } -static void zonefs_init_file_inode(struct inode *inode, struct blk_zone *zone, - enum zonefs_ztype type) +static int zonefs_init_file_inode(struct inode *inode, struct blk_zone *zone, + enum zonefs_ztype type) { struct super_block *sb = inode->i_sb; struct zonefs_sb_info *sbi = ZONEFS_SB(sb); struct zonefs_inode_info *zi = ZONEFS_I(inode); + int ret = 0; inode->i_ino = zone->start >> sbi->s_zone_sectors_shift; inode->i_mode = S_IFREG | sbi->s_perm; @@ -1323,6 +1336,22 @@ static void zonefs_init_file_inode(struct inode *inode, struct blk_zone *zone, sb->s_maxbytes = max(zi->i_max_size, sb->s_maxbytes); sbi->s_blocks += zi->i_max_size >> sb->s_blocksize_bits; sbi->s_used_blocks += zi->i_wpoffset >> sb->s_blocksize_bits; + + /* + * For sequential zones, make sure that any open zone is closed first + * to ensure that the initial number of open zones is 0, in sync with + * the open zone accounting done when the mount option + * ZONEFS_MNTOPT_EXPLICIT_OPEN is used. + */ + if (type == ZONEFS_ZTYPE_SEQ && + (zone->cond == BLK_ZONE_COND_IMP_OPEN || + zone->cond == BLK_ZONE_COND_EXP_OPEN)) { + mutex_lock(&zi->i_truncate_mutex); + ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_CLOSE); + mutex_unlock(&zi->i_truncate_mutex); + } + + return ret; } static struct dentry *zonefs_create_inode(struct dentry *parent, @@ -1332,6 +1361,7 @@ static struct dentry *zonefs_create_inode(struct dentry *parent, struct inode *dir = d_inode(parent); struct dentry *dentry; struct inode *inode; + int ret; dentry = d_alloc_name(parent, name); if (!dentry) @@ -1342,10 +1372,16 @@ static struct dentry *zonefs_create_inode(struct dentry *parent, goto dput; inode->i_ctime = inode->i_mtime = inode->i_atime = dir->i_ctime; - if (zone) - zonefs_init_file_inode(inode, zone, type); - else + if (zone) { + ret = zonefs_init_file_inode(inode, zone, type); + if (ret) { + iput(inode); + goto dput; + } + } else { zonefs_init_dir_inode(dir, inode, type); + } + d_add(dentry, inode); dir->i_size++; |