Diffstat (limited to 'libbcachefs/buckets.c')
-rw-r--r-- | libbcachefs/buckets.c | 350
1 file changed, 190 insertions, 160 deletions
diff --git a/libbcachefs/buckets.c b/libbcachefs/buckets.c
index 6fdbb464..b73002de 100644
--- a/libbcachefs/buckets.c
+++ b/libbcachefs/buckets.c
@@ -101,9 +101,41 @@ static void bch2_fs_stats_verify(struct bch_fs *c)
 			stats.online_reserved);
 }
 
+static void bch2_dev_stats_verify(struct bch_dev *ca)
+{
+	struct bch_dev_usage stats =
+		__bch2_dev_usage_read(ca);
+	u64 n = ca->mi.nbuckets - ca->mi.first_bucket;
+
+	BUG_ON(stats.buckets[S_META]		> n);
+	BUG_ON(stats.buckets[S_DIRTY]		> n);
+	BUG_ON(stats.buckets_cached		> n);
+	BUG_ON(stats.buckets_alloc		> n);
+	BUG_ON(stats.buckets_unavailable	> n);
+}
+
+static void bch2_disk_reservations_verify(struct bch_fs *c, int flags)
+{
+	if (!(flags & BCH_DISK_RESERVATION_NOFAIL)) {
+		u64 used = __bch2_fs_sectors_used(c);
+		u64 cached = 0;
+		u64 avail = atomic64_read(&c->sectors_available);
+		int cpu;
+
+		for_each_possible_cpu(cpu)
+			cached += per_cpu_ptr(c->usage_percpu, cpu)->available_cache;
+
+		if (used + avail + cached > c->capacity)
+			panic("used %llu avail %llu cached %llu capacity %llu\n",
+			      used, avail, cached, c->capacity);
+	}
+}
+
 #else
 
 static void bch2_fs_stats_verify(struct bch_fs *c) {}
+static void bch2_dev_stats_verify(struct bch_dev *ca) {}
+static void bch2_disk_reservations_verify(struct bch_fs *c, int flags) {}
 
 #endif
@@ -171,11 +203,9 @@ struct bch_dev_usage __bch2_dev_usage_read(struct bch_dev *ca)
 	return bch2_usage_read_raw(ca->usage_percpu);
 }
 
-struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *ca)
+struct bch_dev_usage bch2_dev_usage_read(struct bch_fs *c, struct bch_dev *ca)
 {
-	return bch2_usage_read_cached(ca->fs,
-				      ca->usage_cached,
-				      ca->usage_percpu);
+	return bch2_usage_read_cached(c, ca->usage_cached, ca->usage_percpu);
 }
 
 struct bch_fs_usage
@@ -208,6 +238,11 @@ static inline int is_cached_bucket(struct bucket_mark m)
 		!m.dirty_sectors && !!m.cached_sectors;
 }
 
+static inline int is_unavailable_bucket(struct bucket_mark m)
+{
+	return !is_available_bucket(m);
+}
+
 static inline enum s_alloc bucket_type(struct bucket_mark m)
 {
 	return is_meta_bucket(m) ? S_META : S_DIRTY;
@@ -256,12 +291,15 @@ void bch2_fs_usage_apply(struct bch_fs *c,
 	memset(stats, 0, sizeof(*stats));
 }
 
-static void bch2_dev_usage_update(struct bch_dev *ca,
-				  struct bucket_mark old, struct bucket_mark new)
+static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
+				  struct bucket *g, struct bucket_mark old,
+				  struct bucket_mark new)
 {
-	struct bch_fs *c = ca->fs;
 	struct bch_dev_usage *dev_usage;
 
+	BUG_ON((g - ca->buckets) < ca->mi.first_bucket ||
+	       (g - ca->buckets) >= ca->mi.nbuckets);
+
 	bch2_fs_inconsistent_on(old.data_type && new.data_type &&
 				old.data_type != new.data_type, c,
 		"different types of metadata in same bucket: %u, %u",
@@ -270,38 +308,44 @@ static void bch2_dev_usage_update(struct bch_dev *ca,
 	preempt_disable();
 	dev_usage = this_cpu_ptr(ca->usage_percpu);
 
-	dev_usage->sectors_cached +=
-		(int) new.cached_sectors - (int) old.cached_sectors;
-
-	dev_usage->sectors[bucket_type(old)] -= old.dirty_sectors;
-	dev_usage->sectors[bucket_type(new)] += new.dirty_sectors;
-
+	dev_usage->buckets[S_META] +=
+		is_meta_bucket(new) - is_meta_bucket(old);
+	dev_usage->buckets[S_DIRTY] +=
+		is_dirty_bucket(new) - is_dirty_bucket(old);
+	dev_usage->buckets_cached +=
+		is_cached_bucket(new) - is_cached_bucket(old);
 	dev_usage->buckets_alloc +=
 		(int) new.owned_by_allocator - (int) old.owned_by_allocator;
+	dev_usage->buckets_unavailable +=
+		is_unavailable_bucket(new) - is_unavailable_bucket(old);
 
-	dev_usage->buckets[S_META] += is_meta_bucket(new) - is_meta_bucket(old);
-	dev_usage->buckets[S_DIRTY] += is_dirty_bucket(new) - is_dirty_bucket(old);
-	dev_usage->buckets_cached += is_cached_bucket(new) - is_cached_bucket(old);
+	dev_usage->sectors[bucket_type(old)] -= old.dirty_sectors;
+	dev_usage->sectors[bucket_type(new)] += new.dirty_sectors;
+	dev_usage->sectors_cached +=
+		(int) new.cached_sectors - (int) old.cached_sectors;
 	preempt_enable();
 
 	if (!is_available_bucket(old) && is_available_bucket(new))
 		bch2_wake_allocator(ca);
+
+	bch2_dev_stats_verify(ca);
 }
 
-#define bucket_data_cmpxchg(ca, g, new, expr)				\
+#define bucket_data_cmpxchg(c, ca, g, new, expr)			\
 ({									\
 	struct bucket_mark _old = bucket_cmpxchg(g, new, expr);		\
 									\
-	bch2_dev_usage_update(ca, _old, new);				\
+	bch2_dev_usage_update(c, ca, g, _old, new);			\
 	_old;								\
 })
 
-bool bch2_invalidate_bucket(struct bch_dev *ca, struct bucket *g,
-			    struct bucket_mark *old)
+bool bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca,
+			    struct bucket *g, struct bucket_mark *old)
 {
 	struct bucket_mark new;
 
-	*old = bucket_data_cmpxchg(ca, g, new, ({
+	lg_local_lock(&c->usage_lock);
+	*old = bucket_data_cmpxchg(c, ca, g, new, ({
 		if (!is_available_bucket(new))
 			return false;
 
@@ -312,6 +356,7 @@ bool bch2_invalidate_bucket(struct bch_dev *ca, struct bucket *g,
 		new.dirty_sectors = 0;
 		new.gen++;
 	}));
+	lg_local_unlock(&c->usage_lock);
 
 	if (!old->owned_by_allocator && old->cached_sectors)
 		trace_invalidate(ca, bucket_to_sector(ca, g - ca->buckets),
@@ -319,11 +364,13 @@
 	return true;
 }
 
-bool bch2_mark_alloc_bucket_startup(struct bch_dev *ca, struct bucket *g)
+bool bch2_mark_alloc_bucket_startup(struct bch_fs *c, struct bch_dev *ca,
+				    struct bucket *g)
 {
 	struct bucket_mark new, old;
 
-	old = bucket_data_cmpxchg(ca, g, new, ({
+	lg_local_lock(&c->usage_lock);
+	old = bucket_data_cmpxchg(c, ca, g, new, ({
 		if (new.touched_this_mount ||
 		    !is_available_bucket(new))
			return false;
@@ -331,37 +378,32 @@ bool bch2_mark_alloc_bucket_startup(struct bch_dev *ca, struct bucket *g)
 		new.owned_by_allocator = 1;
 		new.touched_this_mount = 1;
 	}));
+	lg_local_unlock(&c->usage_lock);
 
 	return true;
 }
 
-void bch2_mark_free_bucket(struct bch_dev *ca, struct bucket *g)
+void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
+			    struct bucket *g, bool owned_by_allocator,
+			    struct gc_pos pos, unsigned flags)
 {
 	struct bucket_mark old, new;
 
-	old = bucket_data_cmpxchg(ca, g, new, ({
-		new.touched_this_mount	= 1;
-		new.owned_by_allocator	= 0;
-		new.data_type		= 0;
-		new.cached_sectors	= 0;
-		new.dirty_sectors	= 0;
-	}));
-
-	BUG_ON(bucket_became_unavailable(ca->fs, old, new));
-}
-
-void bch2_mark_alloc_bucket(struct bch_dev *ca, struct bucket *g,
-			    bool owned_by_allocator)
-{
-	struct bucket_mark old, new;
+	lg_local_lock(&c->usage_lock);
+	if (!(flags & BCH_BUCKET_MARK_GC_LOCK_HELD) &&
+	    gc_will_visit(c, pos)) {
+		lg_local_unlock(&c->usage_lock);
+		return;
+	}
 
-	old = bucket_data_cmpxchg(ca, g, new, ({
+	old = bucket_data_cmpxchg(c, ca, g, new, ({
 		new.touched_this_mount = 1;
 		new.owned_by_allocator = owned_by_allocator;
 	}));
+	lg_local_unlock(&c->usage_lock);
 
 	BUG_ON(!owned_by_allocator && !old.owned_by_allocator &&
-	       ca->fs->gc_pos.phase == GC_PHASE_DONE);
+	       c->gc_pos.phase == GC_PHASE_DONE);
 }
 
 #define saturated_add(ca, dst, src, max)			\
@@ -377,41 +419,49 @@ do {							\
 	}							\
 } while (0)
 
-void bch2_mark_metadata_bucket(struct bch_dev *ca, struct bucket *g,
-			       enum bucket_data_type type,
-			       bool may_make_unavailable)
+void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
+			       struct bucket *g, enum bucket_data_type type,
+			       struct gc_pos pos, unsigned flags)
 {
 	struct bucket_mark old, new;
 
 	BUG_ON(!type);
 
-	old = bucket_data_cmpxchg(ca, g, new, ({
+	lg_local_lock(&c->usage_lock);
+	if (!(flags & BCH_BUCKET_MARK_GC_LOCK_HELD) &&
+	    gc_will_visit(c, pos)) {
+		lg_local_unlock(&c->usage_lock);
+		return;
+	}
+
+	old = bucket_data_cmpxchg(c, ca, g, new, ({
 		saturated_add(ca, new.dirty_sectors, ca->mi.bucket_size,
 			      GC_MAX_SECTORS_USED);
 		new.data_type		= type;
 		new.touched_this_mount	= 1;
 	}));
+	lg_local_unlock(&c->usage_lock);
 
 	if (old.data_type != type &&
 	    (old.data_type ||
 	     old.cached_sectors ||
 	     old.dirty_sectors))
-		bch_err(ca->fs, "bucket %zu has multiple types of data (%u, %u)",
+		bch_err(c, "bucket %zu has multiple types of data (%u, %u)",
 			g - ca->buckets, old.data_type, new.data_type);
 
-	BUG_ON(!may_make_unavailable &&
-	       bucket_became_unavailable(ca->fs, old, new));
+	BUG_ON(!(flags & BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE) &&
+	       bucket_became_unavailable(c, old, new));
 }
 
 /* Reverting this until the copygc + compression issue is fixed: */
-static int __disk_sectors(const union bch_extent_crc *crc, unsigned sectors)
+static int __disk_sectors(struct bch_extent_crc_unpacked crc, unsigned sectors)
 {
 	if (!sectors)
 		return 0;
 
-	return max(1U, DIV_ROUND_UP(sectors * crc_compressed_size(NULL, crc),
-				    crc_uncompressed_size(NULL, crc)));
+	return max(1U, DIV_ROUND_UP(sectors * crc.compressed_size,
+				    crc.uncompressed_size));
 }
 
 /*
@@ -420,12 +470,12 @@ static int __disk_sectors(const union bch_extent_crc *crc, unsigned sectors)
  * that with the gc pos seqlock held.
  */
 static void bch2_mark_pointer(struct bch_fs *c,
-			      struct bkey_s_c_extent e,
-			      const union bch_extent_crc *crc,
-			      const struct bch_extent_ptr *ptr,
-			      s64 sectors, enum s_alloc type,
-			      struct bch_fs_usage *stats,
-			      u64 journal_seq, unsigned flags)
+			      struct bkey_s_c_extent e,
+			      const struct bch_extent_ptr *ptr,
+			      struct bch_extent_crc_unpacked crc,
+			      s64 sectors, enum s_alloc type,
+			      struct bch_fs_usage *stats,
+			      u64 journal_seq, unsigned flags)
 {
 	struct bucket_mark old, new;
 	unsigned saturated;
@@ -435,7 +485,7 @@ static void bch2_mark_pointer(struct bch_fs *c,
 		? BUCKET_BTREE : BUCKET_DATA;
 	u64 v;
 
-	if (crc_compression_type(crc)) {
+	if (crc.compression_type) {
 		unsigned old_sectors, new_sectors;
 
 		if (sectors > 0) {
@@ -512,13 +562,13 @@ static void bch2_mark_pointer(struct bch_fs *c,
 			      old.counter,
 			      new.counter)) != old.counter);
 
-	bch2_dev_usage_update(ca, old, new);
+	bch2_dev_usage_update(c, ca, g, old, new);
 
 	if (old.data_type != data_type &&
 	    (old.data_type ||
 	     old.cached_sectors ||
 	     old.dirty_sectors))
-		bch_err(ca->fs, "bucket %zu has multiple types of data (%u, %u)",
+		bch_err(c, "bucket %zu has multiple types of data (%u, %u)",
 			g - ca->buckets, old.data_type, new.data_type);
 
 	BUG_ON(!(flags & BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE) &&
@@ -535,71 +585,12 @@ static void bch2_mark_pointer(struct bch_fs *c,
 	}
 }
 
-static void bch2_mark_extent(struct bch_fs *c, struct bkey_s_c_extent e,
-			     s64 sectors, bool metadata,
-			     struct bch_fs_usage *stats,
-			     u64 journal_seq, unsigned flags)
-{
-	const struct bch_extent_ptr *ptr;
-	const union bch_extent_crc *crc;
-	enum s_alloc type = metadata ? S_META : S_DIRTY;
-	unsigned replicas = 0;
-
-	BUG_ON(metadata && bkey_extent_is_cached(e.k));
-	BUG_ON(!sectors);
-
-	extent_for_each_ptr_crc(e, ptr, crc) {
-		bch2_mark_pointer(c, e, crc, ptr, sectors, type,
-				  stats, journal_seq, flags);
-		replicas += !ptr->cached;
-	}
-
-	BUG_ON(replicas >= BCH_REPLICAS_MAX);
-
-	if (replicas)
-		stats->s[replicas - 1].data[type] += sectors;
-}
-
-void __bch2_mark_key(struct bch_fs *c, struct bkey_s_c k,
-		     s64 sectors, bool metadata,
-		     struct bch_fs_usage *stats,
-		     u64 journal_seq, unsigned flags)
-{
-	switch (k.k->type) {
-	case BCH_EXTENT:
-	case BCH_EXTENT_CACHED:
-		bch2_mark_extent(c, bkey_s_c_to_extent(k), sectors, metadata,
-				 stats, journal_seq, flags);
-		break;
-	case BCH_RESERVATION: {
-		struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k);
-
-		if (r.v->nr_replicas)
-			stats->s[r.v->nr_replicas - 1].persistent_reserved += sectors;
-		break;
-	}
-	}
-}
-
-void bch2_gc_mark_key(struct bch_fs *c, struct bkey_s_c k,
-		      s64 sectors, bool metadata, unsigned flags)
-{
-	struct bch_fs_usage stats = { 0 };
-
-	__bch2_mark_key(c, k, sectors, metadata, &stats, 0,
-			flags|BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE);
-
-	preempt_disable();
-	bch2_usage_add(this_cpu_ptr(c->usage_percpu), &stats);
-	preempt_enable();
-}
-
 void bch2_mark_key(struct bch_fs *c, struct bkey_s_c k,
-		   s64 sectors, bool metadata, struct gc_pos gc_pos,
-		   struct bch_fs_usage *stats, u64 journal_seq)
+		   s64 sectors, bool metadata,
+		   struct gc_pos pos,
+		   struct bch_fs_usage *stats,
+		   u64 journal_seq, unsigned flags)
 {
-	unsigned flags = gc_will_visit(c, gc_pos)
-		? BCH_BUCKET_MARK_GC_WILL_VISIT : 0;
 	/*
 	 * synchronization w.r.t. GC:
 	 *
@@ -614,69 +605,104 @@ void bch2_mark_key(struct bch_fs *c, struct bkey_s_c k,
 	 * To know whether we should mark a given reference (GC either isn't
 	 * running, or has already marked references at this position) we
 	 * construct a total order for everything GC walks. Then, we can simply
-	 * compare the position of the reference we're marking - @gc_pos - with
+	 * compare the position of the reference we're marking - @pos - with
 	 * GC's current position. If GC is going to mark this reference, GC's
-	 * current position will be less than @gc_pos; if GC's current position
-	 * is greater than @gc_pos GC has either already walked this position,
-	 * or isn't running.
+	 * current position will be less than @pos; if GC's current position is
+	 * greater than @pos GC has either already walked this position, or
+	 * isn't running.
 	 *
 	 * To avoid racing with GC's position changing, we have to deal with
 	 *  - GC's position being set to GC_POS_MIN when GC starts:
 	 *    usage_lock guards against this
-	 *  - GC's position overtaking @gc_pos: we guard against this with
+	 *  - GC's position overtaking @pos: we guard against this with
 	 *    whatever lock protects the data structure the reference lives in
 	 *    (e.g. the btree node lock, or the relevant allocator lock).
 	 */
+	lg_local_lock(&c->usage_lock);
 
-	__bch2_mark_key(c, k, sectors, metadata, stats, journal_seq, flags);
-	bch2_fs_stats_verify(c);
+	if (!(flags & BCH_BUCKET_MARK_GC_LOCK_HELD) &&
+	    gc_will_visit(c, pos))
+		flags |= BCH_BUCKET_MARK_GC_WILL_VISIT;
+
+	switch (k.k->type) {
+	case BCH_EXTENT:
+	case BCH_EXTENT_CACHED: {
+		struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
+		const struct bch_extent_ptr *ptr;
+		struct bch_extent_crc_unpacked crc;
+		enum s_alloc type = metadata ? S_META : S_DIRTY;
+		unsigned replicas = 0;
+
+		BUG_ON(metadata && bkey_extent_is_cached(e.k));
+		BUG_ON(!sectors);
+
+		extent_for_each_ptr_crc(e, ptr, crc) {
+			bch2_mark_pointer(c, e, ptr, crc, sectors, type,
+					  stats, journal_seq, flags);
+			replicas += !ptr->cached;
+		}
+
+		BUG_ON(replicas >= BCH_REPLICAS_MAX);
+
+		if (replicas)
+			stats->s[replicas - 1].data[type] += sectors;
+		break;
+	}
+	case BCH_RESERVATION: {
+		struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k);
+
+		if (r.v->nr_replicas)
+			stats->s[r.v->nr_replicas - 1].persistent_reserved += sectors;
+		break;
+	}
+	}
 	lg_local_unlock(&c->usage_lock);
 }
 
-static u64 __recalc_sectors_available(struct bch_fs *c)
-{
-	return c->capacity - bch2_fs_sectors_used(c);
-}
+/* Disk reservations: */
 
-/* Used by gc when it's starting: */
-void bch2_recalc_sectors_available(struct bch_fs *c)
+static u64 __recalc_sectors_available(struct bch_fs *c)
 {
+	u64 avail;
 	int cpu;
 
-	lg_global_lock(&c->usage_lock);
-
 	for_each_possible_cpu(cpu)
 		per_cpu_ptr(c->usage_percpu, cpu)->available_cache = 0;
 
-	atomic64_set(&c->sectors_available,
-		     __recalc_sectors_available(c));
+	avail = c->capacity - bch2_fs_sectors_used(c);
+	avail <<= RESERVE_FACTOR;
+	avail /= (1 << RESERVE_FACTOR) + 1;
+	return avail;
+}
+
+/* Used by gc when it's starting: */
+void bch2_recalc_sectors_available(struct bch_fs *c)
+{
+	lg_global_lock(&c->usage_lock);
+	atomic64_set(&c->sectors_available, __recalc_sectors_available(c));
 	lg_global_unlock(&c->usage_lock);
 }
 
-void bch2_disk_reservation_put(struct bch_fs *c,
-			       struct disk_reservation *res)
+void __bch2_disk_reservation_put(struct bch_fs *c, struct disk_reservation *res)
 {
-	if (res->sectors) {
-		lg_local_lock(&c->usage_lock);
-		this_cpu_sub(c->usage_percpu->online_reserved,
-			     res->sectors);
+	lg_local_lock(&c->usage_lock);
+	this_cpu_sub(c->usage_percpu->online_reserved,
		     res->sectors);
 
-		bch2_fs_stats_verify(c);
-		lg_local_unlock(&c->usage_lock);
+	bch2_fs_stats_verify(c);
+	lg_local_unlock(&c->usage_lock);
 
-		res->sectors = 0;
-	}
+	res->sectors = 0;
 }
 
 #define SECTORS_CACHE	1024
 
-int bch2_disk_reservation_add(struct bch_fs *c,
-			      struct disk_reservation *res,
-			      unsigned sectors, int flags)
+int bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res,
+			      unsigned sectors, int flags)
 {
 	struct bch_fs_usage *stats;
-	u64 old, new, v;
+	u64 old, v, get;
 	s64 sectors_available;
 	int ret;
 
@@ -685,27 +711,29 @@ int bch2_disk_reservation_add(struct bch_fs *c,
 	lg_local_lock(&c->usage_lock);
 	stats = this_cpu_ptr(c->usage_percpu);
 
-	if (sectors >= stats->available_cache)
+	if (sectors <= stats->available_cache)
 		goto out;
 
 	v = atomic64_read(&c->sectors_available);
 	do {
 		old = v;
-		if (old < sectors) {
+		get = min((u64) sectors + SECTORS_CACHE, old);
+
+		if (get < sectors) {
 			lg_local_unlock(&c->usage_lock);
 			goto recalculate;
 		}
-
-		new = max_t(s64, 0, old - sectors - SECTORS_CACHE);
 	} while ((v = atomic64_cmpxchg(&c->sectors_available,
-				       old, new)) != old);
+				       old, old - get)) != old);
+
+	stats->available_cache	+= get;
 
-	stats->available_cache	+= old - new;
 out:
 	stats->available_cache	-= sectors;
 	stats->online_reserved	+= sectors;
 	res->sectors		+= sectors;
 
+	bch2_disk_reservations_verify(c, flags);
 	bch2_fs_stats_verify(c);
 	lg_local_unlock(&c->usage_lock);
 	return 0;
@@ -738,6 +766,8 @@ recalculate:
 		stats->online_reserved	+= sectors;
 		res->sectors		+= sectors;
 		ret			= 0;
+
+		bch2_disk_reservations_verify(c, flags);
 	} else {
 		atomic64_set(&c->sectors_available, sectors_available);
 		ret = -ENOSPC;
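
A note on the reworked accounting in bch2_dev_usage_update() above: each per-device counter is adjusted by the difference of an indicator function evaluated on the old and new bucket marks, so every possible state transition updates the counts consistently. The following is a small self-contained sketch of that delta pattern; the struct layout, field names and classification helpers are illustrative only, not the real bcachefs definitions.

/* Sketch (not bcachefs code) of delta-based bucket accounting: classify the
 * bucket before and after a change and add the difference of the indicators. */
#include <stdbool.h>
#include <stdio.h>

struct mark  { unsigned dirty_sectors, cached_sectors; bool owned_by_allocator; };
struct usage { long buckets_dirty, buckets_cached, buckets_unavailable; };

static int is_dirty(struct mark m)   { return m.dirty_sectors != 0; }
static int is_cached(struct mark m)  { return !m.dirty_sectors && m.cached_sectors; }
static int is_unavail(struct mark m) { return m.owned_by_allocator || m.dirty_sectors; }

static void usage_update(struct usage *u, struct mark old, struct mark new)
{
	u->buckets_dirty	+= is_dirty(new)   - is_dirty(old);
	u->buckets_cached	+= is_cached(new)  - is_cached(old);
	u->buckets_unavailable	+= is_unavail(new) - is_unavail(old);
}

int main(void)
{
	struct usage u = { 0 };
	struct mark old = { .cached_sectors = 8 };
	struct mark new = { .dirty_sectors = 128 };

	usage_update(&u, old, new);	/* a cached bucket became dirty */
	printf("dirty %ld cached %ld unavailable %ld\n",
	       u.buckets_dirty, u.buckets_cached, u.buckets_unavailable);
	return 0;
}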
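The long comment in bch2_mark_key() describes when a reference can be skipped because mark-and-sweep GC will account it itself. Below is a minimal sketch of that position comparison with the GC position reduced to a single integer; in bcachefs the position is a struct gc_pos compared under c->usage_lock, so this only illustrates the ordering argument, not the real API.

/* Sketch of the "will GC visit this position?" check: references at positions
 * GC has not yet reached are left for GC to mark when it gets there. */
#include <stdbool.h>
#include <stdio.h>

struct gc_state { bool running; unsigned pos; };

/* GC will visit @pos iff it is running and its current position is behind @pos. */
static bool gc_will_visit(const struct gc_state *gc, unsigned pos)
{
	return gc->running && gc->pos < pos;
}

static void mark_reference(const struct gc_state *gc, unsigned pos)
{
	if (gc_will_visit(gc, pos)) {
		/* skip: GC will account this reference itself */
		printf("pos %u: deferred to gc\n", pos);
		return;
	}
	printf("pos %u: marked now\n", pos);
}

int main(void)
{
	struct gc_state gc = { .running = true, .pos = 5 };

	mark_reference(&gc, 3);		/* gc already walked this: mark now */
	mark_reference(&gc, 9);		/* gc will get here later: defer */
	return 0;
}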
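__recalc_sectors_available() now scales raw free space by 2^RESERVE_FACTOR / (2^RESERVE_FACTOR + 1), i.e. it withholds roughly 1/(2^RESERVE_FACTOR + 1) of free space from reservations. RESERVE_FACTOR is defined outside this diff; the arithmetic sketch below assumes a value of 6 purely for illustration.

/* Sketch of the scaling in __recalc_sectors_available(); the factor of 6 is an
 * assumption for the example, not a value taken from this patch. */
#include <stdio.h>

int main(void)
{
	unsigned reserve_factor = 6;		/* assumed, not from this diff */
	unsigned long long free_sectors = 1000000;
	unsigned long long avail;

	avail = free_sectors << reserve_factor;
	avail /= (1ULL << reserve_factor) + 1;

	/* prints 984615: about 1/(2^6 + 1) ~ 1.5% of free space is held back */
	printf("reservable: %llu of %llu sectors\n", avail, free_sectors);
	return 0;
}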
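The rewritten bch2_disk_reservation_add() fast path drains c->sectors_available in SECTORS_CACHE-sized batches into a per-cpu cache, so most reservations never touch shared state; only when the batch grab fails does it fall back to the recalculate path. A minimal userspace sketch of the same pattern using C11 atomics and a per-thread cache (names are illustrative, and the slow path and locking are omitted):

/* Minimal userspace sketch (not bcachefs code) of the reservation fast path:
 * a global atomic pool is drained in SECTORS_CACHE-sized batches into a
 * per-thread cache, so repeated small reservations stay thread-local. */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define SECTORS_CACHE	1024

static _Atomic uint64_t sectors_available = 1 << 20;	/* global pool */
static _Thread_local uint64_t available_cache;		/* per-thread cache */

static bool reserve(uint64_t sectors)
{
	uint64_t old, get;

	if (sectors <= available_cache)
		goto out;

	old = atomic_load(&sectors_available);
	do {
		/* take what we need plus a batch for the cache */
		get = old < sectors + SECTORS_CACHE
			? old : sectors + SECTORS_CACHE;
		if (get < sectors)
			return false;	/* caller would fall back to a slow path */
	} while (!atomic_compare_exchange_weak(&sectors_available, &old, old - get));

	available_cache += get;
out:
	available_cache -= sectors;
	return true;
}

int main(void)
{
	printf("first:  %d (cache now %llu)\n", reserve(16),
	       (unsigned long long) available_cache);
	printf("second: %d (served from the cache, no atomic access)\n", reserve(16));
	return 0;
}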