diff options
Diffstat (limited to 'libbcachefs/fs-io.c')
-rw-r--r-- | libbcachefs/fs-io.c | 367 |
1 files changed, 225 insertions, 142 deletions
diff --git a/libbcachefs/fs-io.c b/libbcachefs/fs-io.c index 3900995d..6d0a6dec 100644 --- a/libbcachefs/fs-io.c +++ b/libbcachefs/fs-io.c @@ -65,7 +65,6 @@ struct quota_res { }; struct bch_writepage_io { - struct closure cl; struct bch_inode_info *inode; /* must be last: */ @@ -73,11 +72,13 @@ struct bch_writepage_io { }; struct dio_write { - struct completion done; struct kiocb *req; + struct address_space *mapping; + struct bch_inode_info *inode; struct mm_struct *mm; unsigned loop:1, sync:1, + flush:1, free_iov:1; struct quota_res quota_res; u64 written; @@ -98,7 +99,7 @@ struct dio_read { }; /* pagecache_block must be held */ -static int write_invalidate_inode_pages_range(struct address_space *mapping, +static noinline int write_invalidate_inode_pages_range(struct address_space *mapping, loff_t start, loff_t end) { int ret; @@ -750,25 +751,25 @@ vm_fault_t bch2_page_fault(struct vm_fault *vmf) if (fdm > mapping) { struct bch_inode_info *fdm_host = to_bch_ei(fdm->host); - if (bch2_pagecache_add_tryget(&inode->ei_pagecache_lock)) + if (bch2_pagecache_add_tryget(inode)) goto got_lock; - bch2_pagecache_block_put(&fdm_host->ei_pagecache_lock); + bch2_pagecache_block_put(fdm_host); - bch2_pagecache_add_get(&inode->ei_pagecache_lock); - bch2_pagecache_add_put(&inode->ei_pagecache_lock); + bch2_pagecache_add_get(inode); + bch2_pagecache_add_put(inode); - bch2_pagecache_block_get(&fdm_host->ei_pagecache_lock); + bch2_pagecache_block_get(fdm_host); /* Signal that lock has been dropped: */ set_fdm_dropped_locks(); return VM_FAULT_SIGBUS; } - bch2_pagecache_add_get(&inode->ei_pagecache_lock); + bch2_pagecache_add_get(inode); got_lock: ret = filemap_fault(vmf); - bch2_pagecache_add_put(&inode->ei_pagecache_lock); + bch2_pagecache_add_put(inode); return ret; } @@ -796,7 +797,7 @@ vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf) * a write_invalidate_inode_pages_range() that works without dropping * page lock before invalidating page */ - bch2_pagecache_add_get(&inode->ei_pagecache_lock); + bch2_pagecache_add_get(inode); lock_page(page); isize = i_size_read(&inode->v); @@ -829,7 +830,7 @@ vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf) wait_for_stable_page(page); ret = VM_FAULT_LOCKED; out: - bch2_pagecache_add_put(&inode->ei_pagecache_lock); + bch2_pagecache_add_put(inode); sb_end_pagefault(inode->v.i_sb); return ret; @@ -1097,7 +1098,7 @@ void bch2_readahead(struct readahead_control *ractl) bch2_trans_init(&trans, c, 0, 0); - bch2_pagecache_add_get(&inode->ei_pagecache_lock); + bch2_pagecache_add_get(inode); while ((page = readpage_iter_next(&readpages_iter))) { pgoff_t index = readpages_iter.offset + readpages_iter.idx; @@ -1120,7 +1121,7 @@ void bch2_readahead(struct readahead_control *ractl) &readpages_iter); } - bch2_pagecache_add_put(&inode->ei_pagecache_lock); + bch2_pagecache_add_put(inode); bch2_trans_exit(&trans); kfree(readpages_iter.pages); @@ -1200,18 +1201,10 @@ static inline struct bch_writepage_state bch_writepage_state_init(struct bch_fs }; } -static void bch2_writepage_io_free(struct closure *cl) +static void bch2_writepage_io_done(struct bch_write_op *op) { - struct bch_writepage_io *io = container_of(cl, - struct bch_writepage_io, cl); - - bio_put(&io->op.wbio.bio); -} - -static void bch2_writepage_io_done(struct closure *cl) -{ - struct bch_writepage_io *io = container_of(cl, - struct bch_writepage_io, cl); + struct bch_writepage_io *io = + container_of(op, struct bch_writepage_io, op); struct bch_fs *c = io->op.c; struct bio *bio = &io->op.wbio.bio; struct bvec_iter_all iter; @@ -1273,7 +1266,7 @@ static void bch2_writepage_io_done(struct closure *cl) end_page_writeback(bvec->bv_page); } - closure_return_with_destructor(&io->cl, bch2_writepage_io_free); + bio_put(&io->op.wbio.bio); } static void bch2_writepage_do_io(struct bch_writepage_state *w) @@ -1281,8 +1274,7 @@ static void bch2_writepage_do_io(struct bch_writepage_state *w) struct bch_writepage_io *io = w->io; w->io = NULL; - closure_call(&io->op.cl, bch2_write, NULL, &io->cl); - continue_at(&io->cl, bch2_writepage_io_done, NULL); + closure_call(&io->op.cl, bch2_write, NULL, NULL); } /* @@ -1304,9 +1296,7 @@ static void bch2_writepage_io_alloc(struct bch_fs *c, &c->writepage_bioset), struct bch_writepage_io, op.wbio.bio); - closure_init(&w->io->cl, NULL); w->io->inode = inode; - op = &w->io->op; bch2_write_op_init(op, c, w->opts); op->target = w->opts.foreground_target; @@ -1315,6 +1305,7 @@ static void bch2_writepage_io_alloc(struct bch_fs *c, op->write_point = writepoint_hashed(inode->ei_last_dirtied); op->subvol = inode->ei_subvol; op->pos = POS(inode->v.i_ino, sector); + op->end_io = bch2_writepage_io_done; op->wbio.bio.bi_iter.bi_sector = sector; op->wbio.bio.bi_opf = wbc_to_write_flags(wbc); } @@ -1438,7 +1429,8 @@ do_io: /* Check for writing past i_size: */ WARN_ON_ONCE((bio_end_sector(&w->io->op.wbio.bio) << 9) > - round_up(i_size, block_bytes(c))); + round_up(i_size, block_bytes(c)) && + !test_bit(BCH_FS_EMERGENCY_RO, &c->flags)); w->io->op.res.sectors += reserved_sectors; w->io->op.i_sectors_delta -= dirty_sectors; @@ -1490,7 +1482,7 @@ int bch2_write_begin(struct file *file, struct address_space *mapping, bch2_page_reservation_init(c, inode, res); *fsdata = res; - bch2_pagecache_add_get(&inode->ei_pagecache_lock); + bch2_pagecache_add_get(inode); page = grab_cache_page_write_begin(mapping, index); if (!page) @@ -1547,7 +1539,7 @@ err: put_page(page); *pagep = NULL; err_unlock: - bch2_pagecache_add_put(&inode->ei_pagecache_lock); + bch2_pagecache_add_put(inode); kfree(res); *fsdata = NULL; return bch2_err_class(ret); @@ -1591,7 +1583,7 @@ int bch2_write_end(struct file *file, struct address_space *mapping, unlock_page(page); put_page(page); - bch2_pagecache_add_put(&inode->ei_pagecache_lock); + bch2_pagecache_add_put(inode); bch2_page_reservation_put(c, inode, res); kfree(res); @@ -1760,7 +1752,7 @@ static ssize_t bch2_buffered_write(struct kiocb *iocb, struct iov_iter *iter) ssize_t written = 0; int ret = 0; - bch2_pagecache_add_get(&inode->ei_pagecache_lock); + bch2_pagecache_add_get(inode); do { unsigned offset = pos & (PAGE_SIZE - 1); @@ -1818,7 +1810,7 @@ again: balance_dirty_pages_ratelimited(mapping); } while (iov_iter_count(iter)); - bch2_pagecache_add_put(&inode->ei_pagecache_lock); + bch2_pagecache_add_put(inode); return written ? written : ret; } @@ -1981,11 +1973,13 @@ ssize_t bch2_read_iter(struct kiocb *iocb, struct iov_iter *iter) if (iocb->ki_flags & IOCB_DIRECT) { struct blk_plug plug; - ret = filemap_write_and_wait_range(mapping, - iocb->ki_pos, - iocb->ki_pos + count - 1); - if (ret < 0) - goto out; + if (unlikely(mapping->nrpages)) { + ret = filemap_write_and_wait_range(mapping, + iocb->ki_pos, + iocb->ki_pos + count - 1); + if (ret < 0) + goto out; + } file_accessed(file); @@ -1996,9 +1990,9 @@ ssize_t bch2_read_iter(struct kiocb *iocb, struct iov_iter *iter) if (ret >= 0) iocb->ki_pos += ret; } else { - bch2_pagecache_add_get(&inode->ei_pagecache_lock); + bch2_pagecache_add_get(inode); ret = generic_file_read_iter(iocb, iter); - bch2_pagecache_add_put(&inode->ei_pagecache_lock); + bch2_pagecache_add_put(inode); } out: return bch2_err_class(ret); @@ -2050,31 +2044,154 @@ err: return err ? false : ret; } +static noinline bool bch2_dio_write_check_allocated(struct dio_write *dio) +{ + struct bch_fs *c = dio->op.c; + struct bch_inode_info *inode = dio->inode; + struct bio *bio = &dio->op.wbio.bio; + + return bch2_check_range_allocated(c, inode_inum(inode), + dio->op.pos.offset, bio_sectors(bio), + dio->op.opts.data_replicas, + dio->op.opts.compression != 0); +} + static void bch2_dio_write_loop_async(struct bch_write_op *); +static __always_inline long bch2_dio_write_done(struct dio_write *dio); -static long bch2_dio_write_loop(struct dio_write *dio) +static noinline int bch2_dio_write_copy_iov(struct dio_write *dio) +{ + struct iovec *iov = dio->inline_vecs; + + if (dio->iter.nr_segs > ARRAY_SIZE(dio->inline_vecs)) { + iov = kmalloc_array(dio->iter.nr_segs, sizeof(*iov), + GFP_KERNEL); + if (unlikely(!iov)) + return -ENOMEM; + + dio->free_iov = true; + } + + memcpy(iov, dio->iter.iov, dio->iter.nr_segs * sizeof(*iov)); + dio->iter.iov = iov; + return 0; +} + +static void bch2_dio_write_flush_done(struct closure *cl) +{ + struct dio_write *dio = container_of(cl, struct dio_write, op.cl); + struct bch_fs *c = dio->op.c; + + closure_debug_destroy(cl); + + dio->op.error = bch2_journal_error(&c->journal); + + bch2_dio_write_done(dio); +} + +static noinline void bch2_dio_write_flush(struct dio_write *dio) { - bool kthread = (current->flags & PF_KTHREAD) != 0; + struct bch_fs *c = dio->op.c; + struct bch_inode_unpacked inode; + int ret; + + dio->flush = 0; + + closure_init(&dio->op.cl, NULL); + + if (!dio->op.error) { + ret = bch2_inode_find_by_inum(c, inode_inum(dio->inode), &inode); + if (ret) + dio->op.error = ret; + else + bch2_journal_flush_seq_async(&c->journal, inode.bi_journal_seq, &dio->op.cl); + } + + if (dio->sync) { + closure_sync(&dio->op.cl); + closure_debug_destroy(&dio->op.cl); + } else { + continue_at(&dio->op.cl, bch2_dio_write_flush_done, NULL); + } +} + +static __always_inline long bch2_dio_write_done(struct dio_write *dio) +{ + struct bch_fs *c = dio->op.c; struct kiocb *req = dio->req; - struct address_space *mapping = req->ki_filp->f_mapping; - struct bch_inode_info *inode = file_bch_inode(req->ki_filp); - struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct bch_inode_info *inode = dio->inode; + bool sync = dio->sync; + long ret; + + if (unlikely(dio->flush)) { + bch2_dio_write_flush(dio); + if (!sync) + return -EIOCBQUEUED; + } + + bch2_pagecache_block_put(inode); + bch2_quota_reservation_put(c, inode, &dio->quota_res); + + if (dio->free_iov) + kfree(dio->iter.iov); + + ret = dio->op.error ?: ((long) dio->written << 9); + bio_put(&dio->op.wbio.bio); + + /* inode->i_dio_count is our ref on inode and thus bch_fs */ + inode_dio_end(&inode->v); + + if (ret < 0) + ret = bch2_err_class(ret); + + if (!sync) { + req->ki_complete(req, ret); + ret = -EIOCBQUEUED; + } + return ret; +} + +static __always_inline void bch2_dio_write_end(struct dio_write *dio) +{ + struct bch_fs *c = dio->op.c; + struct kiocb *req = dio->req; + struct bch_inode_info *inode = dio->inode; struct bio *bio = &dio->op.wbio.bio; struct bvec_iter_all iter; struct bio_vec *bv; + + i_sectors_acct(c, inode, &dio->quota_res, dio->op.i_sectors_delta); + req->ki_pos += (u64) dio->op.written << 9; + dio->written += dio->op.written; + + spin_lock(&inode->v.i_lock); + if (req->ki_pos > inode->v.i_size) + i_size_write(&inode->v, req->ki_pos); + spin_unlock(&inode->v.i_lock); + + if (likely(!bio_flagged(bio, BIO_NO_PAGE_REF))) + bio_for_each_segment_all(bv, bio, iter) + put_page(bv->bv_page); + + if (unlikely(dio->op.error)) + set_bit(EI_INODE_ERROR, &inode->ei_flags); +} + +static long bch2_dio_write_loop(struct dio_write *dio) +{ + struct bch_fs *c = dio->op.c; + struct kiocb *req = dio->req; + struct address_space *mapping = dio->mapping; + struct bch_inode_info *inode = dio->inode; + struct bio *bio = &dio->op.wbio.bio; unsigned unaligned, iter_count; bool sync = dio->sync, dropped_locks; long ret; - if (dio->loop) - goto loop; - while (1) { iter_count = dio->iter.count; - if (kthread && dio->mm) - kthread_use_mm(dio->mm); - BUG_ON(current->faults_disabled_mapping); + EBUG_ON(current->faults_disabled_mapping); current->faults_disabled_mapping = mapping; ret = bio_iov_iter_get_pages(bio, &dio->iter); @@ -2082,8 +2199,6 @@ static long bch2_dio_write_loop(struct dio_write *dio) dropped_locks = fdm_dropped_locks(); current->faults_disabled_mapping = NULL; - if (kthread && dio->mm) - kthread_unuse_mm(dio->mm); /* * If the fault handler returned an error but also signalled @@ -2121,116 +2236,80 @@ static long bch2_dio_write_loop(struct dio_write *dio) } bch2_write_op_init(&dio->op, c, io_opts(c, &inode->ei_inode)); - dio->op.end_io = bch2_dio_write_loop_async; + dio->op.end_io = sync + ? NULL + : bch2_dio_write_loop_async; dio->op.target = dio->op.opts.foreground_target; dio->op.write_point = writepoint_hashed((unsigned long) current); dio->op.nr_replicas = dio->op.opts.data_replicas; dio->op.subvol = inode->ei_subvol; dio->op.pos = POS(inode->v.i_ino, (u64) req->ki_pos >> 9); - if ((req->ki_flags & IOCB_DSYNC) && - !c->opts.journal_flush_disabled) - dio->op.flags |= BCH_WRITE_FLUSH; + if (sync) + dio->op.flags |= BCH_WRITE_SYNC; dio->op.flags |= BCH_WRITE_CHECK_ENOSPC; ret = bch2_disk_reservation_get(c, &dio->op.res, bio_sectors(bio), dio->op.opts.data_replicas, 0); if (unlikely(ret) && - !bch2_check_range_allocated(c, inode_inum(inode), - dio->op.pos.offset, bio_sectors(bio), - dio->op.opts.data_replicas, - dio->op.opts.compression != 0)) + !bch2_dio_write_check_allocated(dio)) goto err; task_io_account_write(bio->bi_iter.bi_size); - if (!dio->sync && !dio->loop && dio->iter.count) { - struct iovec *iov = dio->inline_vecs; - - if (dio->iter.nr_segs > ARRAY_SIZE(dio->inline_vecs)) { - iov = kmalloc_array(dio->iter.nr_segs, sizeof(*iov), - GFP_KERNEL); - if (unlikely(!iov)) { - dio->sync = sync = true; - goto do_io; - } + if (unlikely(dio->iter.count) && + !dio->sync && + !dio->loop && + bch2_dio_write_copy_iov(dio)) + dio->sync = sync = true; - dio->free_iov = true; - } - - memcpy(iov, dio->iter.iov, dio->iter.nr_segs * sizeof(*iov)); - dio->iter.iov = iov; - } -do_io: dio->loop = true; closure_call(&dio->op.cl, bch2_write, NULL, NULL); - if (sync) - wait_for_completion(&dio->done); - else + if (!sync) return -EIOCBQUEUED; -loop: - i_sectors_acct(c, inode, &dio->quota_res, - dio->op.i_sectors_delta); - req->ki_pos += (u64) dio->op.written << 9; - dio->written += dio->op.written; - spin_lock(&inode->v.i_lock); - if (req->ki_pos > inode->v.i_size) - i_size_write(&inode->v, req->ki_pos); - spin_unlock(&inode->v.i_lock); + bch2_dio_write_end(dio); - if (likely(!bio_flagged(bio, BIO_NO_PAGE_REF))) - bio_for_each_segment_all(bv, bio, iter) - put_page(bv->bv_page); - bio->bi_vcnt = 0; - - if (dio->op.error) { - set_bit(EI_INODE_ERROR, &inode->ei_flags); - break; - } - - if (!dio->iter.count) + if (likely(!dio->iter.count) || dio->op.error) break; bio_reset(bio, NULL, REQ_OP_WRITE); - reinit_completion(&dio->done); } - - ret = dio->op.error ?: ((long) dio->written << 9); +out: + return bch2_dio_write_done(dio); err: - bch2_pagecache_block_put(&inode->ei_pagecache_lock); - bch2_quota_reservation_put(c, inode, &dio->quota_res); + dio->op.error = ret; - if (dio->free_iov) - kfree(dio->iter.iov); + if (!bio_flagged(bio, BIO_NO_PAGE_REF)) { + struct bvec_iter_all iter; + struct bio_vec *bv; - if (likely(!bio_flagged(bio, BIO_NO_PAGE_REF))) bio_for_each_segment_all(bv, bio, iter) put_page(bv->bv_page); - bio_put(bio); - - /* inode->i_dio_count is our ref on inode and thus bch_fs */ - inode_dio_end(&inode->v); - - if (ret < 0) - ret = bch2_err_class(ret); - - if (!sync) { - req->ki_complete(req, ret); - ret = -EIOCBQUEUED; } - return ret; + goto out; } static void bch2_dio_write_loop_async(struct bch_write_op *op) { struct dio_write *dio = container_of(op, struct dio_write, op); + struct mm_struct *mm = dio->mm; - if (dio->sync) - complete(&dio->done); - else - bch2_dio_write_loop(dio); + bch2_dio_write_end(dio); + + if (likely(!dio->iter.count) || dio->op.error) { + bch2_dio_write_done(dio); + return; + } + + bio_reset(&dio->op.wbio.bio, NULL, REQ_OP_WRITE); + + if (mm) + kthread_use_mm(mm); + bch2_dio_write_loop(dio); + if (mm) + kthread_unuse_mm(mm); } static noinline @@ -2268,7 +2347,7 @@ ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter) goto err; inode_dio_begin(&inode->v); - bch2_pagecache_block_get(&inode->ei_pagecache_lock); + bch2_pagecache_block_get(inode); extending = req->ki_pos + iter->count > inode->v.i_size; if (!extending) { @@ -2282,26 +2361,31 @@ ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter) GFP_KERNEL, &c->dio_write_bioset); dio = container_of(bio, struct dio_write, op.wbio.bio); - init_completion(&dio->done); dio->req = req; + dio->mapping = mapping; + dio->inode = inode; dio->mm = current->mm; dio->loop = false; dio->sync = is_sync_kiocb(req) || extending; + dio->flush = iocb_is_dsync(req) && !c->opts.journal_flush_disabled; dio->free_iov = false; dio->quota_res.sectors = 0; dio->written = 0; dio->iter = *iter; + dio->op.c = c; ret = bch2_quota_reservation_add(c, inode, &dio->quota_res, iter->count >> 9, true); if (unlikely(ret)) goto err_put_bio; - ret = write_invalidate_inode_pages_range(mapping, - req->ki_pos, - req->ki_pos + iter->count - 1); - if (unlikely(ret)) - goto err_put_bio; + if (unlikely(mapping->nrpages)) { + ret = write_invalidate_inode_pages_range(mapping, + req->ki_pos, + req->ki_pos + iter->count - 1); + if (unlikely(ret)) + goto err_put_bio; + } ret = bch2_dio_write_loop(dio); err: @@ -2309,7 +2393,7 @@ err: inode_unlock(&inode->v); return ret; err_put_bio: - bch2_pagecache_block_put(&inode->ei_pagecache_lock); + bch2_pagecache_block_put(inode); bch2_quota_reservation_put(c, inode, &dio->quota_res); bio_put(bio); inode_dio_end(&inode->v); @@ -2613,7 +2697,7 @@ int bch2_truncate(struct user_namespace *mnt_userns, } inode_dio_wait(&inode->v); - bch2_pagecache_block_get(&inode->ei_pagecache_lock); + bch2_pagecache_block_get(inode); ret = bch2_inode_find_by_inum(c, inode_inum(inode), &inode_u); if (ret) @@ -2692,7 +2776,7 @@ int bch2_truncate(struct user_namespace *mnt_userns, ret = bch2_setattr_nonsize(mnt_userns, inode, iattr); err: - bch2_pagecache_block_put(&inode->ei_pagecache_lock); + bch2_pagecache_block_put(inode); return bch2_err_class(ret); } @@ -3005,8 +3089,7 @@ static int __bchfs_fallocate(struct bch_inode_info *inode, int mode, } ret = bch2_extent_update(&trans, inode_inum(inode), &iter, - &reservation.k_i, - &disk_res, NULL, + &reservation.k_i, &disk_res, 0, &i_sectors_delta, true); if (ret) goto bkey_err; @@ -3105,7 +3188,7 @@ long bch2_fallocate_dispatch(struct file *file, int mode, inode_lock(&inode->v); inode_dio_wait(&inode->v); - bch2_pagecache_block_get(&inode->ei_pagecache_lock); + bch2_pagecache_block_get(inode); ret = file_modified(file); if (ret) @@ -3122,7 +3205,7 @@ long bch2_fallocate_dispatch(struct file *file, int mode, else ret = -EOPNOTSUPP; err: - bch2_pagecache_block_put(&inode->ei_pagecache_lock); + bch2_pagecache_block_put(inode); inode_unlock(&inode->v); percpu_ref_put(&c->writes); |