-rw-r--r--  fs/bcachefs/bcachefs.h    2
-rw-r--r--  fs/bcachefs/btree_io.c   43
-rw-r--r--  fs/bcachefs/fs.c         20
-rw-r--r--  fs/bcachefs/journal_io.c 52
-rw-r--r--  fs/bcachefs/journal_io.h  8
-rw-r--r--  fs/bcachefs/super.c      15
-rw-r--r--  fs/bcachefs/util.c        5
7 files changed, 101 insertions, 44 deletions
diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h
index 41c6d8865a74..fb3156ed7f0b 100644
--- a/fs/bcachefs/bcachefs.h
+++ b/fs/bcachefs/bcachefs.h
@@ -1288,7 +1288,7 @@ static inline int bch2_fs_casefold_enabled(struct bch_fs *c)
{
if (!IS_ENABLED(CONFIG_UNICODE))
return bch_err_throw(c, no_casefolding_without_utf8);
- if (!c->opts.casefold_disabled)
+ if (c->opts.casefold_disabled)
return bch_err_throw(c, casefolding_disabled);
return 0;
}
diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c
index 19fd951495ac..84e302afc8fc 100644
--- a/fs/bcachefs/btree_io.c
+++ b/fs/bcachefs/btree_io.c
@@ -1337,15 +1337,42 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
btree_node_reset_sib_u64s(b);
- scoped_guard(rcu)
- bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&b->key)), ptr) {
- struct bch_dev *ca2 = bch2_dev_rcu(c, ptr->dev);
-
- if (!ca2 || ca2->mi.state != BCH_MEMBER_STATE_rw) {
- set_btree_node_need_rewrite(b);
- set_btree_node_need_rewrite_degraded(b);
+ /*
+ * XXX:
+ *
+ * We deadlock if too many btree updates require node rewrites while
+ * we're still in journal replay.
+ *
+ * This is because btree node rewrites generate more updates for the
+ * interior update path (alloc, backpointers), and if those updates
+ * touch new nodes and generate more rewrites - well, you see the problem.
+ *
+ * The biggest cause is that we don't use the btree write buffer for
+ * the backpointer updates - fixing that needs some real thought on
+ * locking.
+ *
+ * The problem with this workaround (not doing the rewrite for degraded
+ * nodes in journal replay) is that those degraded nodes persist, and we
+ * don't want that (this is a real bug when a btree node write completes
+ * with fewer replicas than we wanted and leaves a degraded node due to
+ * device _removal_, i.e. the device went away mid write).
+ *
+ * It's less of a bug here, but still a problem because we don't yet
+ * have a way of tracking degraded data - we need another index (all
+ * extents/btree nodes, by replicas entry) in order to fix this
+ * properly (re-replicate degraded data at the earliest possible time).
+ */
+ if (c->recovery.passes_complete & BIT_ULL(BCH_RECOVERY_PASS_journal_replay)) {
+ scoped_guard(rcu)
+ bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&b->key)), ptr) {
+ struct bch_dev *ca2 = bch2_dev_rcu(c, ptr->dev);
+
+ if (!ca2 || ca2->mi.state != BCH_MEMBER_STATE_rw) {
+ set_btree_node_need_rewrite(b);
+ set_btree_node_need_rewrite_degraded(b);
+ }
}
- }
+ }
if (!ptr_written) {
set_btree_node_need_rewrite(b);
diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c
index f9bc99eb2d02..3b0783f117ae 100644
--- a/fs/bcachefs/fs.c
+++ b/fs/bcachefs/fs.c
@@ -1692,11 +1692,15 @@ static int bch2_fileattr_set(struct mnt_idmap *idmap,
s.mask = map_defined(bch_flags_to_xflags);
s.flags |= map_flags_rev(bch_flags_to_xflags, fa->fsx_xflags);
- if (fa->fsx_xflags)
- return bch_err_throw(c, unsupported_fsx_flag);
+ if (fa->fsx_xflags) {
+ ret = bch_err_throw(c, unsupported_fsx_flag);
+ goto err;
+ }
- if (fa->fsx_projid >= U32_MAX)
- return bch_err_throw(c, projid_too_big);
+ if (fa->fsx_projid >= U32_MAX) {
+ ret = bch_err_throw(c, projid_too_big);
+ goto err;
+ }
/*
* inode fields accessible via the xattr interface are stored with a +1
@@ -1718,8 +1722,10 @@ static int bch2_fileattr_set(struct mnt_idmap *idmap,
fa->flags &= ~FS_CASEFOLD_FL;
s.flags |= map_flags_rev(bch_flags_to_uflags, fa->flags);
- if (fa->flags)
- return bch_err_throw(c, unsupported_fa_flag);
+ if (fa->flags) {
+ ret = bch_err_throw(c, unsupported_fa_flag);
+ goto err;
+ }
}
mutex_lock(&inode->ei_update_lock);
@@ -1730,7 +1736,7 @@ static int bch2_fileattr_set(struct mnt_idmap *idmap,
bch2_write_inode(c, inode, fssetxattr_inode_update_fn, &s,
ATTR_CTIME);
mutex_unlock(&inode->ei_update_lock);
-
+err:
return bch2_err_class(ret);
}
diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c
index dd3f3434c1b0..f3cf48193398 100644
--- a/fs/bcachefs/journal_io.c
+++ b/fs/bcachefs/journal_io.c
@@ -1272,6 +1272,36 @@ static void bch2_journal_print_checksum_error(struct bch_fs *c, struct journal_r
printbuf_exit(&buf);
}
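+/*
+ * Entries [start, end) are absent from the journal: return the first run not
+ * covered by the blacklist, as an inclusive range, or zeroes if none.
+ */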
+struct u64_range bch2_journal_entry_missing_range(struct bch_fs *c, u64 start, u64 end)
+{
+ BUG_ON(start > end);
+
+ if (start == end)
+ return (struct u64_range) {};
+
+ while (start < end &&
+ bch2_journal_seq_is_blacklisted(c, start, false))
+ start++;
+
+ if (start == end)
+ return (struct u64_range) {};
+
+ struct u64_range missing = { .start = start };
+
+ while (start < end &&
+ !bch2_journal_seq_is_blacklisted(c, start, false))
+ start++;
+
+ /* inclusive range: a single-entry gap has missing.start == missing.end */
+ missing.end = start - 1;
+
+ return missing;
+}
+
noinline_for_stack
static int bch2_journal_check_for_missing(struct bch_fs *c, u64 start_seq, u64 end_seq)
{
@@ -1290,25 +1320,13 @@ static int bch2_journal_check_for_missing(struct bch_fs *c, u64 start_seq, u64 e
BUG_ON(seq > le64_to_cpu(i->j.seq));
- while (seq < le64_to_cpu(i->j.seq)) {
- while (seq < le64_to_cpu(i->j.seq) &&
- bch2_journal_seq_is_blacklisted(c, seq, false))
- seq++;
-
- if (seq == le64_to_cpu(i->j.seq))
- break;
-
- u64 missing_start = seq;
-
- while (seq < le64_to_cpu(i->j.seq) &&
- !bch2_journal_seq_is_blacklisted(c, seq, false))
- seq++;
-
- u64 missing_end = seq - 1;
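+ /* report each run of genuinely missing, non-blacklisted entries: */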
+ struct u64_range missing;
+ while ((missing = bch2_journal_entry_missing_range(c, seq, le64_to_cpu(i->j.seq))).start) {
printbuf_reset(&buf);
prt_printf(&buf, "journal entries %llu-%llu missing! (replaying %llu-%llu)",
- missing_start, missing_end,
+ missing.start, missing.end,
start_seq, end_seq);
prt_printf(&buf, "\nprev at ");
@@ -1323,6 +1341,8 @@ static int bch2_journal_check_for_missing(struct bch_fs *c, u64 start_seq, u64 e
prt_printf(&buf, ", continue?");
fsck_err(c, journal_entries_missing, "%s", buf.buf);
+
+ seq = missing.end + 1;
}
prev = i;
diff --git a/fs/bcachefs/journal_io.h b/fs/bcachefs/journal_io.h
index 6fa82c4050fe..f53c5c81d137 100644
--- a/fs/bcachefs/journal_io.h
+++ b/fs/bcachefs/journal_io.h
@@ -71,6 +71,14 @@ void bch2_journal_entry_to_text(struct printbuf *, struct bch_fs *,
void bch2_journal_ptrs_to_text(struct printbuf *, struct bch_fs *,
struct journal_replay *);
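+/* inclusive [start, end] range; start == 0 denotes an empty range */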
+struct u64_range {
+ u64 start;
+ u64 end;
+};
+
+struct u64_range bch2_journal_entry_missing_range(struct bch_fs *, u64, u64);
+
int bch2_journal_read(struct bch_fs *, u64 *, u64 *, u64 *);
CLOSURE_CALLBACK(bch2_journal_write);
diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c
index 7340c1118579..6980cd5b0ca8 100644
--- a/fs/bcachefs/super.c
+++ b/fs/bcachefs/super.c
@@ -1024,13 +1024,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts *opts,
goto err;
}
-#if !IS_ENABLED(CONFIG_UNICODE)
- if (c->sb.features & BIT_ULL(BCH_FEATURE_casefolding)) {
- printk(KERN_ERR "Cannot mount a filesystem with casefolding on a kernel without CONFIG_UNICODE\n");
- ret = -EINVAL;
- goto err;
- }
-#endif
+#if IS_ENABLED(CONFIG_UNICODE)
if (!bch2_fs_casefold_enabled(c)) {
/* Default encoding until we can potentially have more as an option. */
c->cf_encoding = utf8_load(BCH_FS_DEFAULT_UTF8_ENCODING);
@@ -1043,6 +1037,13 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts *opts,
goto err;
}
}
+#else
+ if (c->sb.features & BIT_ULL(BCH_FEATURE_casefolding)) {
+ printk(KERN_ERR "Cannot mount a filesystem with casefolding on a kernel without CONFIG_UNICODE\n");
+ ret = -EINVAL;
+ goto err;
+ }
+#endif
for (i = 0; i < c->sb.nr_devices; i++) {
if (!bch2_member_exists(c->disk_sb.sb, i))
diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c
index 05b40debf211..7a4436fd4441 100644
--- a/fs/bcachefs/util.c
+++ b/fs/bcachefs/util.c
@@ -299,17 +299,12 @@ int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *task, unsigne
if (ret)
return ret;
- if (!down_read_trylock(&task->signal->exec_update_lock))
- return -1;
-
do {
nr_entries = stack_trace_save_tsk(task, stack->data, stack->size, skipnr + 1);
} while (nr_entries == stack->size &&
!(ret = darray_make_room_gfp(stack, stack->size * 2, gfp)));
stack->nr = nr_entries;
- up_read(&task->signal->exec_update_lock);
-
return ret;
#else
return 0;