Diffstat (limited to 'libbcachefs')
-rw-r--r--  libbcachefs/data_update.c    | 139
-rw-r--r--  libbcachefs/data_update.h    |   5
-rw-r--r--  libbcachefs/fs-io-buffered.c | 128
-rw-r--r--  libbcachefs/fs-io-buffered.h |   5
-rw-r--r--  libbcachefs/fs.c             |   7
-rw-r--r--  libbcachefs/io_read.c        |   1
-rw-r--r--  libbcachefs/journal.c        |  10
-rw-r--r--  libbcachefs/migrate.c        |   8
-rw-r--r--  libbcachefs/move.c           | 186
-rw-r--r--  libbcachefs/move.h           |   2
-rw-r--r--  libbcachefs/super.c          |   8
11 files changed, 172 insertions, 327 deletions
diff --git a/libbcachefs/data_update.c b/libbcachefs/data_update.c
index f23951a1..155c1ad4 100644
--- a/libbcachefs/data_update.c
+++ b/libbcachefs/data_update.c
@@ -522,6 +522,15 @@ void bch2_data_update_exit(struct data_update *update)
 	struct bch_fs *c = update->op.c;
 	struct bkey_s_c k = bkey_i_to_s_c(update->k.k);
 
+	if (update->b)
+		atomic_dec(&update->b->count);
+
+	if (update->ctxt) {
+		scoped_guard(mutex, &update->ctxt->lock)
+			list_del(&update->io_list);
+		wake_up(&update->ctxt->wait);
+	}
+
 	bch2_bio_free_pages_pool(c, &update->op.wbio.bio);
 	kfree(update->bvecs);
 	update->bvecs = NULL;
@@ -866,8 +875,11 @@ int bch2_data_update_init(struct btree_trans *trans,
 		: BCH_DATA_UPDATE_rebalance;
 	m->btree_id	= btree_id;
 	m->data_opts	= data_opts;
+	m->ctxt		= ctxt;
 	m->stats	= ctxt ? ctxt->stats : NULL;
 
+	INIT_LIST_HEAD(&m->read_list);
+	INIT_LIST_HEAD(&m->io_list);
 
 	bch2_write_op_init(&m->op, c, *io_opts);
 	m->op.pos	= bkey_start_pos(k.k);
@@ -927,74 +939,81 @@ int bch2_data_update_init(struct btree_trans *trans,
 		ptr_bit <<= 1;
 	}
 
-	unsigned durability_required = max(0, (int) (io_opts->data_replicas - durability_have));
+	if (!data_opts.scrub) {
+		unsigned durability_required = max(0, (int) (io_opts->data_replicas - durability_have));
 
-	/*
-	 * If current extent durability is less than io_opts.data_replicas,
-	 * we're not trying to rereplicate the extent up to data_replicas here -
-	 * unless extra_replicas was specified
-	 *
-	 * Increasing replication is an explicit operation triggered by
-	 * rereplicate, currently, so that users don't get an unexpected -ENOSPC
-	 */
-	m->op.nr_replicas = min(durability_removing, durability_required) +
-		m->data_opts.extra_replicas;
-
-	/*
-	 * If device(s) were set to durability=0 after data was written to them
-	 * we can end up with a duribilty=0 extent, and the normal algorithm
-	 * that tries not to increase durability doesn't work:
-	 */
-	if (!(durability_have + durability_removing))
-		m->op.nr_replicas = max((unsigned) m->op.nr_replicas, 1);
+		/*
+		 * If current extent durability is less than io_opts.data_replicas,
+		 * we're not trying to rereplicate the extent up to data_replicas here -
+		 * unless extra_replicas was specified
+		 *
+		 * Increasing replication is an explicit operation triggered by
+		 * rereplicate, currently, so that users don't get an unexpected -ENOSPC
+		 */
+		m->op.nr_replicas = min(durability_removing, durability_required) +
+			m->data_opts.extra_replicas;
 
-	m->op.nr_replicas_required = m->op.nr_replicas;
+		/*
+		 * If device(s) were set to durability=0 after data was written to them
+		 * we can end up with a duribilty=0 extent, and the normal algorithm
+		 * that tries not to increase durability doesn't work:
+		 */
+		if (!(durability_have + durability_removing))
+			m->op.nr_replicas = max((unsigned) m->op.nr_replicas, 1);
 
-	/*
-	 * It might turn out that we don't need any new replicas, if the
-	 * replicas or durability settings have been changed since the extent
-	 * was written:
-	 */
-	if (!m->op.nr_replicas) {
-		m->data_opts.kill_ptrs |= m->data_opts.rewrite_ptrs;
-		m->data_opts.rewrite_ptrs = 0;
-		/* if iter == NULL, it's just a promote */
-		if (iter)
-			ret = bch2_extent_drop_ptrs(trans, iter, k, io_opts, &m->data_opts);
-		if (!ret)
-			ret = bch_err_throw(c, data_update_done_no_writes_needed);
-		goto out_bkey_buf_exit;
-	}
+		m->op.nr_replicas_required = m->op.nr_replicas;
 
-	/*
-	 * Check if the allocation will succeed, to avoid getting an error later
-	 * in bch2_write() -> bch2_alloc_sectors_start() and doing a useless
-	 * read:
-	 *
-	 * This guards against
-	 * - BCH_WRITE_alloc_nowait allocations failing (promotes)
-	 * - Destination target full
-	 * - Device(s) in destination target offline
-	 * - Insufficient durability available in destination target
-	 *   (i.e. trying to move a durability=2 replica to a target with a
-	 *   single durability=2 device)
-	 */
-	ret = can_write_extent(c, m);
-	if (ret)
-		goto out_bkey_buf_exit;
+		/*
+		 * It might turn out that we don't need any new replicas, if the
+		 * replicas or durability settings have been changed since the extent
+		 * was written:
+		 */
+		if (!m->op.nr_replicas) {
+			m->data_opts.kill_ptrs |= m->data_opts.rewrite_ptrs;
+			m->data_opts.rewrite_ptrs = 0;
+			/* if iter == NULL, it's just a promote */
+			if (iter)
+				ret = bch2_extent_drop_ptrs(trans, iter, k, io_opts, &m->data_opts);
+			if (!ret)
+				ret = bch_err_throw(c, data_update_done_no_writes_needed);
+			goto out;
+		}
 
-	if (reserve_sectors) {
-		ret = bch2_disk_reservation_add(c, &m->op.res, reserve_sectors,
-				m->data_opts.extra_replicas
-				? 0
-				: BCH_DISK_RESERVATION_NOFAIL);
+		/*
+		 * Check if the allocation will succeed, to avoid getting an error later
+		 * in bch2_write() -> bch2_alloc_sectors_start() and doing a useless
+		 * read:
+		 *
+		 * This guards against
+		 * - BCH_WRITE_alloc_nowait allocations failing (promotes)
+		 * - Destination target full
+		 * - Device(s) in destination target offline
+		 * - Insufficient durability available in destination target
+		 *   (i.e. trying to move a durability=2 replica to a target with a
+		 *   single durability=2 device)
+		 */
+		ret = can_write_extent(c, m);
 		if (ret)
-			goto out_bkey_buf_exit;
+			goto out;
+
+		if (reserve_sectors) {
+			ret = bch2_disk_reservation_add(c, &m->op.res, reserve_sectors,
+					m->data_opts.extra_replicas
+					? 0
+					: BCH_DISK_RESERVATION_NOFAIL);
+			if (ret)
+				goto out;
+		}
+	} else {
+		if (unwritten) {
+			ret = bch_err_throw(c, data_update_done_unwritten);
+			goto out;
+		}
 	}
 
 	if (!bkey_get_dev_refs(c, k)) {
 		ret = bch_err_throw(c, data_update_done_no_dev_refs);
-		goto out_put_disk_res;
+		goto out;
 	}
 
 	if (c->opts.nocow_enabled &&
@@ -1021,10 +1040,8 @@ out_nocow_unlock:
 	bkey_nocow_unlock(c, k);
 out_put_dev_refs:
 	bkey_put_dev_refs(c, k);
-out_put_disk_res:
+out:
 	bch2_disk_reservation_put(c, &m->op.res);
-out_bkey_buf_exit:
-	bch2_bkey_buf_exit(&m->k, c);
 	return ret;
 }
diff --git a/libbcachefs/data_update.h b/libbcachefs/data_update.h
index 3b0ba6f6..0e93b518 100644
--- a/libbcachefs/data_update.h
+++ b/libbcachefs/data_update.h
@@ -43,6 +43,11 @@ struct data_update {
 	enum btree_id		btree_id;
 	struct bkey_buf		k;
 	struct data_update_opts	data_opts;
+
+	/* associated with @ctxt */
+	struct list_head	read_list;
+	struct list_head	io_list;
+	struct move_bucket	*b;
 	struct moving_context	*ctxt;
 	struct bch_move_stats	*stats;
 
diff --git a/libbcachefs/fs-io-buffered.c b/libbcachefs/fs-io-buffered.c
index fe684adc..bfa1307b 100644
--- a/libbcachefs/fs-io-buffered.c
+++ b/libbcachefs/fs-io-buffered.c
@@ -729,134 +729,6 @@ int bch2_writepages(struct address_space *mapping, struct writeback_control *wbc
 
 /* buffered writes: */
 
-int bch2_write_begin(const struct kiocb *iocb, struct address_space *mapping,
-		     loff_t pos, unsigned len,
-		     struct folio **foliop, void **fsdata)
-{
-	struct bch_inode_info *inode = to_bch_ei(mapping->host);
-	struct bch_fs *c = inode->v.i_sb->s_fs_info;
-	struct bch2_folio_reservation *res;
-	struct folio *folio;
-	unsigned offset;
-	int ret = -ENOMEM;
-
-	res = kmalloc(sizeof(*res), GFP_KERNEL);
-	if (!res)
-		return -ENOMEM;
-
-	bch2_folio_reservation_init(c, inode, res);
-	*fsdata = res;
-
-	bch2_pagecache_add_get(inode);
-
-	folio = __filemap_get_folio(mapping, pos >> PAGE_SHIFT,
-				    FGP_WRITEBEGIN | fgf_set_order(len),
-				    mapping_gfp_mask(mapping));
-	if (IS_ERR(folio))
-		goto err_unlock;
-
-	offset = pos - folio_pos(folio);
-	len = min_t(size_t, len, folio_end_pos(folio) - pos);
-
-	if (folio_test_uptodate(folio))
-		goto out;
-
-	/* If we're writing entire folio, don't need to read it in first: */
-	if (!offset && len == folio_size(folio))
-		goto out;
-
-	if (!offset && pos + len >= inode->v.i_size) {
-		folio_zero_segment(folio, len, folio_size(folio));
-		flush_dcache_folio(folio);
-		goto out;
-	}
-
-	if (folio_pos(folio) >= inode->v.i_size) {
-		folio_zero_segments(folio, 0, offset, offset + len, folio_size(folio));
-		flush_dcache_folio(folio);
-		goto out;
-	}
-readpage:
-	ret = bch2_read_single_folio(folio, mapping);
-	if (ret)
-		goto err;
-out:
-	ret = bch2_folio_set(c, inode_inum(inode), &folio, 1);
-	if (ret)
-		goto err;
-
-	ret = bch2_folio_reservation_get(c, inode, folio, res, offset, len);
-	if (ret) {
-		if (!folio_test_uptodate(folio)) {
-			/*
-			 * If the folio hasn't been read in, we won't know if we
-			 * actually need a reservation - we don't actually need
-			 * to read here, we just need to check if the folio is
-			 * fully backed by uncompressed data:
-			 */
-			goto readpage;
-		}
-
-		goto err;
-	}
-
-	*foliop = folio;
-	return 0;
-err:
-	folio_unlock(folio);
-	folio_put(folio);
-err_unlock:
-	bch2_pagecache_add_put(inode);
-	kfree(res);
-	*fsdata = NULL;
-	return bch2_err_class(ret);
-}
-
-int bch2_write_end(const struct kiocb *iocb, struct address_space *mapping,
-		   loff_t pos, unsigned len, unsigned copied,
-		   struct folio *folio, void *fsdata)
-{
-	struct bch_inode_info *inode = to_bch_ei(mapping->host);
-	struct bch_fs *c = inode->v.i_sb->s_fs_info;
-	struct bch2_folio_reservation *res = fsdata;
-	unsigned offset = pos - folio_pos(folio);
-
-	BUG_ON(offset + copied > folio_size(folio));
-
-	if (unlikely(copied < len && !folio_test_uptodate(folio))) {
-		/*
-		 * The folio needs to be read in, but that would destroy
-		 * our partial write - simplest thing is to just force
-		 * userspace to redo the write:
-		 */
-		folio_zero_range(folio, 0, folio_size(folio));
-		flush_dcache_folio(folio);
-		copied = 0;
-	}
-
-	scoped_guard(spinlock, &inode->v.i_lock)
-		if (pos + copied > inode->v.i_size)
-			i_size_write(&inode->v, pos + copied);
-
-	if (copied) {
-		if (!folio_test_uptodate(folio))
-			folio_mark_uptodate(folio);
-
-		bch2_set_folio_dirty(c, inode, folio, res, offset, copied);
-
-		inode->ei_last_dirtied = (unsigned long) current;
-	}
-
-	folio_unlock(folio);
-	folio_put(folio);
-	bch2_pagecache_add_put(inode);
-
-	bch2_folio_reservation_put(c, inode, res);
-	kfree(res);
-
-	return copied;
-}
-
 static noinline void folios_trunc(folios *fs, struct folio **fi)
 {
 	while (fs->data + fs->nr > fi) {
diff --git a/libbcachefs/fs-io-buffered.h b/libbcachefs/fs-io-buffered.h
index 14de91c2..df59398b 100644
--- a/libbcachefs/fs-io-buffered.h
+++ b/libbcachefs/fs-io-buffered.h
@@ -10,11 +10,6 @@ int bch2_read_folio(struct file *, struct folio *);
 int bch2_writepages(struct address_space *, struct writeback_control *);
 void bch2_readahead(struct readahead_control *);
 
-int bch2_write_begin(const struct kiocb *, struct address_space *, loff_t pos,
-		     unsigned len, struct folio **, void **);
-int bch2_write_end(const struct kiocb *, struct address_space *, loff_t,
-		   unsigned len, unsigned copied, struct folio *, void *);
-
 ssize_t bch2_write_iter(struct kiocb *, struct iov_iter *);
 
 void bch2_fs_fs_io_buffered_exit(struct bch_fs *);
diff --git a/libbcachefs/fs.c b/libbcachefs/fs.c
index d6a2031e..9b309ea6 100644
--- a/libbcachefs/fs.c
+++ b/libbcachefs/fs.c
@@ -44,6 +44,7 @@
 #include <linux/siphash.h>
 #include <linux/statfs.h>
 #include <linux/string.h>
+#include <linux/version.h>
 #include <linux/xattr.h>
 
 static struct kmem_cache *bch2_inode_cache;
@@ -1585,6 +1586,10 @@ static const __maybe_unused unsigned bch_flags_to_xflags[] = {
 	[__BCH_INODE_noatime]	= FS_XFLAG_NOATIME,
 };
 
+#if LINUX_VERSION_CODE < KERNEL_VERSION(6,17,0)
+#define file_kattr fileattr
+#endif
+
 static int bch2_fileattr_get(struct dentry *dentry,
 			     struct file_kattr *fa)
 {
@@ -1803,8 +1808,6 @@ static const struct address_space_operations bch_address_space_operations = {
 	.writepages	= bch2_writepages,
 	.readahead	= bch2_readahead,
 	.dirty_folio	= filemap_dirty_folio,
-	.write_begin	= bch2_write_begin,
-	.write_end	= bch2_write_end,
 	.invalidate_folio = bch2_invalidate_folio,
 	.release_folio	= bch2_release_folio,
 #ifdef CONFIG_MIGRATION
diff --git a/libbcachefs/io_read.c b/libbcachefs/io_read.c
index 8b4cda1d..ca480b8f 100644
--- a/libbcachefs/io_read.c
+++ b/libbcachefs/io_read.c
@@ -327,6 +327,7 @@ static struct bch_read_bio *__promote_alloc(struct btree_trans *trans,
 
 	return &op->write.rbio;
 err_remove_list:
+	bch2_bkey_buf_exit(&op->write.k, c);
 	async_object_list_del(c, promote, op->list_idx);
 err_remove_hash:
 	BUG_ON(rhashtable_remove_fast(&c->promote_table, &op->hash,
diff --git a/libbcachefs/journal.c b/libbcachefs/journal.c
index 6505c79f..9058df47 100644
--- a/libbcachefs/journal.c
+++ b/libbcachefs/journal.c
@@ -1126,6 +1126,12 @@ static int bch2_set_nr_journal_buckets_iter(struct bch_dev *ca, unsigned nr,
 		ob[nr_got] = bch2_bucket_alloc(c, ca, watermark,
 					       BCH_DATA_journal, cl);
 		ret = PTR_ERR_OR_ZERO(ob[nr_got]);
+
+		if (ret == -BCH_ERR_bucket_alloc_blocked)
+			ret = bch_err_throw(c, freelist_empty);
+		if (ret == -BCH_ERR_freelist_empty)	/* don't if we're actually out of buckets */
+			closure_wake_up(&c->freelist_wait);
+
 		if (ret)
 			break;
@@ -1258,9 +1264,7 @@ static int bch2_set_nr_journal_buckets_loop(struct bch_fs *c, struct bch_dev *ca
 		}
 
 		ret = bch2_set_nr_journal_buckets_iter(ca, nr, new_fs, &cl);
-
-		if (ret == -BCH_ERR_bucket_alloc_blocked ||
-		    ret == -BCH_ERR_open_buckets_empty)
+		if (ret == -BCH_ERR_open_buckets_empty)
 			ret = 0; /* wait and retry */
 
 		bch2_disk_reservation_put(c, &disk_res);
diff --git a/libbcachefs/migrate.c b/libbcachefs/migrate.c
index 92edff50..139a6587 100644
--- a/libbcachefs/migrate.c
+++ b/libbcachefs/migrate.c
@@ -19,6 +19,7 @@
 #include "migrate.h"
 #include "move.h"
 #include "progress.h"
+#include "rebalance.h"
 #include "replicas.h"
 #include "super-io.h"
 
@@ -79,7 +80,12 @@ static int bch2_dev_usrdata_drop_key(struct btree_trans *trans,
 	if (ret)
 		return ret;
 
-	ret = drop_dev_ptrs(c, bkey_i_to_s(n), dev_idx, flags, err, false);
+	enum set_needs_rebalance_ctx ctx = SET_NEEDS_REBALANCE_opt_change;
+	struct bch_inode_opts opts;
+
+	ret =   bch2_extent_get_apply_io_opts_one(trans, &opts, iter, k, ctx) ?:
+		bch2_bkey_set_needs_rebalance(c, &opts, n, ctx, 0) ?:
+		drop_dev_ptrs(c, bkey_i_to_s(n), dev_idx, flags, err, false);
 	if (ret)
 		return ret;
 
diff --git a/libbcachefs/move.c b/libbcachefs/move.c
index 9a440d3f..63c8f57b 100644
--- a/libbcachefs/move.c
+++ b/libbcachefs/move.c
@@ -105,46 +105,11 @@ trace_io_move_evacuate_bucket2(struct bch_fs *c, struct bpos bucket, int gen)
 	printbuf_exit(&buf);
 }
 
-struct moving_io {
-	struct list_head	read_list;
-	struct list_head	io_list;
-	struct move_bucket	*b;
-	struct closure		cl;
-	bool			read_completed;
-
-	unsigned		read_sectors;
-	unsigned		write_sectors;
-
-	struct data_update	write;
-};
-
-static void move_free(struct moving_io *io)
-{
-	struct moving_context *ctxt = io->write.ctxt;
-	struct bch_fs *c = io->write.op.c;
-
-	if (io->b)
-		atomic_dec(&io->b->count);
-
-	scoped_guard(mutex, &ctxt->lock)
-		list_del(&io->io_list);
-	wake_up(&ctxt->wait);
-
-	if (!io->write.data_opts.scrub) {
-		bch2_data_update_exit(&io->write);
-	} else {
-		bch2_bio_free_pages_pool(c, &io->write.op.wbio.bio);
-		kfree(io->write.bvecs);
-		bch2_bkey_buf_exit(&io->write.k, c);
-	}
-	kfree(io);
-}
-
 static void move_write_done(struct bch_write_op *op)
 {
-	struct moving_io *io = container_of(op, struct moving_io, write.op);
+	struct data_update *u = container_of(op, struct data_update, op);
 	struct bch_fs *c = op->c;
-	struct moving_context *ctxt = io->write.ctxt;
+	struct moving_context *ctxt = u->ctxt;
 
 	if (op->error) {
 		if (trace_io_move_write_fail_enabled()) {
@@ -157,24 +122,25 @@ static void move_write_done(struct bch_write_op *op)
 		ctxt->write_error = true;
 	}
 
-	atomic_sub(io->write_sectors, &ctxt->write_sectors);
+	atomic_sub(u->k.k->k.size, &ctxt->write_sectors);
 	atomic_dec(&ctxt->write_ios);
-	move_free(io);
+	bch2_data_update_exit(u);
+	kfree(u);
 	closure_put(&ctxt->cl);
 }
 
-static void move_write(struct moving_io *io)
+static void move_write(struct data_update *u)
 {
-	struct bch_fs *c = io->write.op.c;
-	struct moving_context *ctxt = io->write.ctxt;
-	struct bch_read_bio *rbio = &io->write.rbio;
+	struct bch_fs *c = u->op.c;
+	struct moving_context *ctxt = u->ctxt;
+	struct bch_read_bio *rbio = &u->rbio;
 
 	if (ctxt->stats) {
 		if (rbio->bio.bi_status)
-			atomic64_add(io->write.rbio.bvec_iter.bi_size >> 9,
+			atomic64_add(u->rbio.bvec_iter.bi_size >> 9,
 				     &ctxt->stats->sectors_error_uncorrected);
 		else if (rbio->saw_error)
-			atomic64_add(io->write.rbio.bvec_iter.bi_size >> 9,
+			atomic64_add(u->rbio.bvec_iter.bi_size >> 9,
 				     &ctxt->stats->sectors_error_corrected);
 	}
 
@@ -184,7 +150,7 @@ static void move_write(struct moving_io *io)
 	 * that userspace still gets the appropriate error.
 	 */
 	if (unlikely(rbio->ret == -BCH_ERR_data_read_csum_err &&
-		     (bch2_bkey_extent_flags(bkey_i_to_s_c(io->write.k.k)) & BIT_ULL(BCH_EXTENT_FLAG_poisoned)))) {
+		     (bch2_bkey_extent_flags(bkey_i_to_s_c(u->k.k)) & BIT_ULL(BCH_EXTENT_FLAG_poisoned)))) {
 		struct bch_extent_crc_unpacked crc = rbio->pick.crc;
 		struct nonce nonce = extent_nonce(rbio->version, crc);
 
@@ -193,40 +159,41 @@ static void move_write(struct moving_io *io)
 		rbio->ret = 0;
 	}
 
-	if (unlikely(rbio->ret || io->write.data_opts.scrub)) {
-		move_free(io);
+	if (unlikely(rbio->ret || u->data_opts.scrub)) {
+		bch2_data_update_exit(u);
+		kfree(u);
 		return;
 	}
 
 	if (trace_io_move_write_enabled()) {
 		CLASS(printbuf, buf)();
-		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(io->write.k.k));
+		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(u->k.k));
 		trace_io_move_write(c, buf.buf);
 	}
 
-	closure_get(&io->write.ctxt->cl);
-	atomic_add(io->write_sectors, &io->write.ctxt->write_sectors);
-	atomic_inc(&io->write.ctxt->write_ios);
+	closure_get(&ctxt->cl);
+	atomic_add(u->k.k->k.size, &ctxt->write_sectors);
+	atomic_inc(&ctxt->write_ios);
 
-	bch2_data_update_read_done(&io->write);
+	bch2_data_update_read_done(u);
 }
 
-struct moving_io *bch2_moving_ctxt_next_pending_write(struct moving_context *ctxt)
+struct data_update *bch2_moving_ctxt_next_pending_write(struct moving_context *ctxt)
 {
-	struct moving_io *io =
-		list_first_entry_or_null(&ctxt->reads, struct moving_io, read_list);
+	struct data_update *u =
+		list_first_entry_or_null(&ctxt->reads, struct data_update, read_list);
 
-	return io && io->read_completed ? io : NULL;
+	return u && u->read_done ? u : NULL;
 }
 
 static void move_read_endio(struct bio *bio)
 {
-	struct moving_io *io = container_of(bio, struct moving_io, write.rbio.bio);
-	struct moving_context *ctxt = io->write.ctxt;
+	struct data_update *u = container_of(bio, struct data_update, rbio.bio);
+	struct moving_context *ctxt = u->ctxt;
 
-	atomic_sub(io->read_sectors, &ctxt->read_sectors);
+	atomic_sub(u->k.k->k.size, &ctxt->read_sectors);
 	atomic_dec(&ctxt->read_ios);
-	io->read_completed = true;
+	u->read_done = true;
 
 	wake_up(&ctxt->wait);
 	closure_put(&ctxt->cl);
@@ -234,12 +201,12 @@ static void move_read_endio(struct bio *bio)
 
 void bch2_moving_ctxt_do_pending_writes(struct moving_context *ctxt)
 {
-	struct moving_io *io;
+	struct data_update *u;
 
-	while ((io = bch2_moving_ctxt_next_pending_write(ctxt))) {
+	while ((u = bch2_moving_ctxt_next_pending_write(ctxt))) {
 		bch2_trans_unlock_long(ctxt->trans);
-		list_del(&io->read_list);
-		move_write(io);
+		list_del(&u->read_list);
+		move_write(u);
 	}
 }
 
@@ -355,64 +322,44 @@ int bch2_move_extent(struct moving_context *ctxt,
 		}
 	}
 
-	struct moving_io *io = allocate_dropping_locks(trans, ret,
-				kzalloc(sizeof(struct moving_io), _gfp));
-	if (!io && !ret)
+	struct data_update *u = allocate_dropping_locks(trans, ret,
+				kzalloc(sizeof(struct data_update), _gfp));
+	if (!u && !ret)
 		ret = bch_err_throw(c, ENOMEM_move_extent);
 
 	if (ret)
 		goto err;
 
-	INIT_LIST_HEAD(&io->io_list);
-	io->write.ctxt		= ctxt;
-	io->read_sectors	= k.k->size;
-	io->write_sectors	= k.k->size;
-
-	if (!data_opts.scrub) {
-		ret = bch2_data_update_init(trans, iter, ctxt, &io->write, ctxt->wp,
-					    &io_opts, data_opts, iter->btree_id, k);
-		if (ret)
-			goto err;
-
-		io->write.op.end_io = move_write_done;
-	} else {
-		bch2_bkey_buf_init(&io->write.k);
-		bch2_bkey_buf_reassemble(&io->write.k, c, k);
-
-		io->write.op.c = c;
-		io->write.data_opts = data_opts;
-
-		bch2_trans_unlock(trans);
-
-		ret = bch2_data_update_bios_init(&io->write, c, &io_opts);
-		if (ret)
-			goto err;
-	}
+	ret = bch2_data_update_init(trans, iter, ctxt, u, ctxt->wp,
+				    &io_opts, data_opts, iter->btree_id, k);
+	if (ret)
+		goto err;
 
-	io->write.rbio.bio.bi_end_io = move_read_endio;
-	io->write.rbio.bio.bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0);
+	u->op.end_io		= move_write_done;
+	u->rbio.bio.bi_end_io	= move_read_endio;
+	u->rbio.bio.bi_ioprio	= IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0);
 
 	if (ctxt->rate)
 		bch2_ratelimit_increment(ctxt->rate, k.k->size);
 
 	if (ctxt->stats) {
 		atomic64_inc(&ctxt->stats->keys_moved);
-		atomic64_add(k.k->size, &ctxt->stats->sectors_moved);
+		atomic64_add(u->k.k->k.size, &ctxt->stats->sectors_moved);
 	}
 
 	if (bucket_in_flight) {
-		io->b = bucket_in_flight;
-		atomic_inc(&io->b->count);
+		u->b = bucket_in_flight;
+		atomic_inc(&u->b->count);
 	}
 
 	if (trace_io_move_read_enabled())
 		trace_io_move_read2(c, k);
 
 	scoped_guard(mutex, &ctxt->lock) {
-		atomic_add(io->read_sectors, &ctxt->read_sectors);
+		atomic_add(u->k.k->k.size, &ctxt->read_sectors);
 		atomic_inc(&ctxt->read_ios);
-		list_add_tail(&io->read_list, &ctxt->reads);
-		list_add_tail(&io->io_list, &ctxt->ios);
+		list_add_tail(&u->read_list, &ctxt->reads);
+		list_add_tail(&u->io_list, &ctxt->ios);
 	}
 
 	/*
@@ -420,8 +367,8 @@ int bch2_move_extent(struct moving_context *ctxt,
 	 * ctxt when doing wakeup
 	 */
 	closure_get(&ctxt->cl);
-	__bch2_read_extent(trans, &io->write.rbio,
-			   io->write.rbio.bio.bi_iter,
+	__bch2_read_extent(trans, &u->rbio,
+			   u->rbio.bio.bi_iter,
 			   bkey_start_pos(k.k),
 			   iter->btree_id, k, 0,
 			   NULL,
@@ -429,23 +376,22 @@ int bch2_move_extent(struct moving_context *ctxt,
 			   data_opts.scrub ?  data_opts.read_dev : -1);
 	return 0;
 err:
-	bch2_bkey_buf_exit(&io->write.k, c);
-	kfree(io);
-
-	if (bch2_err_matches(ret, EROFS) ||
-	    bch2_err_matches(ret, BCH_ERR_transaction_restart))
-		return ret;
+	if (!bch2_err_matches(ret, EROFS) &&
+	    !bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
+		count_event(c, io_move_start_fail);
 
-	count_event(c, io_move_start_fail);
-
-	if (trace_io_move_start_fail_enabled()) {
-		CLASS(printbuf, buf)();
-		bch2_bkey_val_to_text(&buf, c, k);
-		prt_str(&buf, ": ");
-		prt_str(&buf, bch2_err_str(ret));
-		trace_io_move_start_fail(c, buf.buf);
+		if (trace_io_move_start_fail_enabled()) {
+			CLASS(printbuf, buf)();
+			bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(u->k.k));
+			prt_str(&buf, ": ");
+			prt_str(&buf, bch2_err_str(ret));
+			trace_io_move_start_fail(c, buf.buf);
+		}
 	}
+
+	bch2_bkey_buf_exit(&u->k, c);
+	kfree(u);
+
 	if (bch2_err_matches(ret, BCH_ERR_data_update_done))
 		return 0;
 	return ret;
@@ -1301,9 +1247,9 @@ static void bch2_moving_ctxt_to_text(struct printbuf *out, struct bch_fs *c, str
 	guard(printbuf_indent)(out);
 
 	scoped_guard(mutex, &ctxt->lock) {
-		struct moving_io *io;
-		list_for_each_entry(io, &ctxt->ios, io_list)
-			bch2_data_update_inflight_to_text(out, &io->write);
+		struct data_update *u;
+		list_for_each_entry(u, &ctxt->ios, io_list)
+			bch2_data_update_inflight_to_text(out, u);
 	}
 }
 
diff --git a/libbcachefs/move.h b/libbcachefs/move.h
index 754b0ad4..62831014 100644
--- a/libbcachefs/move.h
+++ b/libbcachefs/move.h
@@ -81,7 +81,7 @@ void bch2_moving_ctxt_exit(struct moving_context *);
 void bch2_moving_ctxt_init(struct moving_context *, struct bch_fs *,
 			   struct bch_ratelimit *, struct bch_move_stats *,
 			   struct write_point_specifier, bool);
-struct moving_io *bch2_moving_ctxt_next_pending_write(struct moving_context *);
+struct data_update *bch2_moving_ctxt_next_pending_write(struct moving_context *);
 void bch2_moving_ctxt_do_pending_writes(struct moving_context *);
 void bch2_moving_ctxt_flush_all(struct moving_context *);
 void bch2_move_ctxt_wait_for_io(struct moving_context *);
diff --git a/libbcachefs/super.c b/libbcachefs/super.c
index 4b873694..5cd308a6 100644
--- a/libbcachefs/super.c
+++ b/libbcachefs/super.c
@@ -2012,13 +2012,9 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags,
 	 */
 	bch2_dev_put(ca);
 
-	if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_failed, flags, NULL)) {
-		prt_printf(err, "Cannot remove without losing data\n");
-		ret = bch_err_throw(c, device_state_not_allowed);
+	ret = __bch2_dev_set_state(c, ca, BCH_MEMBER_STATE_failed, flags, err);
+	if (ret)
 		goto err;
-	}
-
-	__bch2_dev_read_only(c, ca);
 
 	ret = fast_device_removal
 		? bch2_dev_data_drop_by_backpointers(c, ca->dev_idx, flags, err)