#include "bcachefs.h"
#include "btree_cache.h"
#include "btree_io.h"
#include "btree_iter.h"
#include "btree_locking.h"
#include "debug.h"
#include "extents.h"

#include <linux/prefetch.h>
#include <trace/events/bcachefs.h>

#define DEF_BTREE_ID(kwd, val, name) name,

const char * const bch2_btree_ids[] = {
	DEFINE_BCH_BTREE_IDS()
	NULL
};

#undef DEF_BTREE_ID

void bch2_recalc_btree_reserve(struct bch_fs *c)
{
	unsigned i, reserve = 16;

	if (!c->btree_roots[0].b)
		reserve += 8;

	for (i = 0; i < BTREE_ID_NR; i++)
		if (c->btree_roots[i].b)
			reserve += min_t(unsigned, 1,
					 c->btree_roots[i].b->level) * 8;

	c->btree_cache_reserve = reserve;
}

#define mca_can_free(c)						\
	max_t(int, 0, c->btree_cache_used - c->btree_cache_reserve)

static void __mca_data_free(struct bch_fs *c, struct btree *b)
{
	EBUG_ON(btree_node_write_in_flight(b));

	free_pages((unsigned long) b->data, btree_page_order(c));
	b->data = NULL;
	bch2_btree_keys_free(b);
}

static void mca_data_free(struct bch_fs *c, struct btree *b)
{
	__mca_data_free(c, b);
	c->btree_cache_used--;
	list_move(&b->list, &c->btree_cache_freed);
}

#define PTR_HASH(_k)	(bkey_i_to_extent_c(_k)->v._data[0])

static const struct rhashtable_params bch_btree_cache_params = {
	.head_offset	= offsetof(struct btree, hash),
	.key_offset	= offsetof(struct btree, key.v),
	.key_len	= sizeof(struct bch_extent_ptr),
};

static void mca_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
{
	unsigned order = ilog2(btree_pages(c));

	b->data = (void *) __get_free_pages(gfp, order);
	if (!b->data)
		goto err;

	if (bch2_btree_keys_alloc(b, order, gfp))
		goto err;

	c->btree_cache_used++;
	list_move(&b->list, &c->btree_cache_freeable);
	return;
err:
	free_pages((unsigned long) b->data, order);
	b->data = NULL;
	list_move(&b->list, &c->btree_cache_freed);
}

static struct btree *mca_bucket_alloc(struct bch_fs *c, gfp_t gfp)
{
	struct btree *b = kzalloc(sizeof(struct btree), gfp);
	if (!b)
		return NULL;

	bkey_extent_init(&b->key);
	six_lock_init(&b->lock);
	INIT_LIST_HEAD(&b->list);
	INIT_LIST_HEAD(&b->write_blocked);
	INIT_LIST_HEAD(&b->reachable);

	mca_data_alloc(c, b, gfp);
	return b->data ? b : NULL;
}

/* Btree in memory cache - hash table */

void bch2_btree_node_hash_remove(struct bch_fs *c, struct btree *b)
{
	BUG_ON(btree_node_dirty(b));

	b->nsets = 0;

	rhashtable_remove_fast(&c->btree_cache_table, &b->hash,
			       bch_btree_cache_params);

	/* Cause future lookups for this node to fail: */
	bkey_i_to_extent(&b->key)->v._data[0] = 0;
}

int bch2_btree_node_hash_insert(struct bch_fs *c, struct btree *b,
		    unsigned level, enum btree_id id)
{
	int ret;
	b->level	= level;
	b->btree_id	= id;

	ret = rhashtable_lookup_insert_fast(&c->btree_cache_table, &b->hash,
					    bch_btree_cache_params);
	if (ret)
		return ret;

	mutex_lock(&c->btree_cache_lock);
	list_add(&b->list, &c->btree_cache);
	mutex_unlock(&c->btree_cache_lock);

	return 0;
}

__flatten
static inline struct btree *mca_find(struct bch_fs *c,
				     const struct bkey_i *k)
{
	return rhashtable_lookup_fast(&c->btree_cache_table, &PTR_HASH(k),
				      bch_btree_cache_params);
}

/*
 * this version is for btree nodes that have already been freed (we're not
 * reaping a real btree node)
 */
static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush)
{
	int ret = 0;

	lockdep_assert_held(&c->btree_cache_lock);

	if (!six_trylock_intent(&b->lock))
		return -ENOMEM;

	if (!six_trylock_write(&b->lock))
		goto out_unlock_intent;

	if (btree_node_write_error(b) ||
	    btree_node_noevict(b))
		goto out_unlock;

	if (!btree_node_may_write(b))
		goto out_unlock;

	if (btree_node_dirty(b) ||
	    btree_node_write_in_flight(b)) {
		if (!flush)
			goto out_unlock;

		/*
		 * Using the underscore version because we don't want to compact
		 * bsets after the write, since this node is about to be evicted
		 * - unless btree verify mode is enabled, since it runs out of
		 * the post write cleanup:
		 */
		if (verify_btree_ondisk(c))
			bch2_btree_node_write(c, b, NULL, SIX_LOCK_intent);
		else
			__bch2_btree_node_write(c, b, NULL, SIX_LOCK_read);

		/* wait for any in flight btree write */
		btree_node_wait_on_io(b);
	}
out:
	if (PTR_HASH(&b->key))
		trace_btree_node_reap(c, b, ret);
	return ret;
out_unlock:
	six_unlock_write(&b->lock);
out_unlock_intent:
	six_unlock_intent(&b->lock);
	ret = -ENOMEM;
	goto out;
}

static int btree_node_reclaim(struct bch_fs *c, struct btree *b)
{
	return __btree_node_reclaim(c, b, false);
}

static int btree_node_write_and_reclaim(struct bch_fs *c, struct btree *b)
{
	return __btree_node_reclaim(c, b, true);
}

static unsigned long bch2_mca_scan(struct shrinker *shrink,
				   struct shrink_control *sc)
{
	struct bch_fs *c = container_of(shrink, struct bch_fs,
					   btree_cache_shrink);
	struct btree *b, *t;
	unsigned long nr = sc->nr_to_scan;
	unsigned long can_free;
	unsigned long touched = 0;
	unsigned long freed = 0;
	unsigned i;

	if (btree_shrinker_disabled(c))
		return SHRINK_STOP;

	if (c->btree_cache_alloc_lock)
		return SHRINK_STOP;

	/* Return -1 if we can't do anything right now */
	if (sc->gfp_mask & __GFP_IO)
		mutex_lock(&c->btree_cache_lock);
	else if (!mutex_trylock(&c->btree_cache_lock))
		return -1;

	/*
	 * It's _really_ critical that we don't free too many btree nodes - we
	 * have to always leave ourselves a reserve. The reserve is how we
	 * guarantee that allocating memory for a new btree node can always
	 * succeed, so that inserting keys into the btree can always succeed and
	 * IO can always make forward progress:
	 */
	nr /= btree_pages(c);
	can_free = mca_can_free(c);
	nr = min_t(unsigned long, nr, can_free);

	i = 0;
	list_for_each_entry_safe(b, t, &c->btree_cache_freeable, list) {
		touched++;

		if (freed >= nr)
			break;

		if (++i > 3 &&
		    !btree_node_reclaim(c, b)) {
			mca_data_free(c, b);
			six_unlock_write(&b->lock);
			six_unlock_intent(&b->lock);
			freed++;
		}
	}
restart:
	list_for_each_entry_safe(b, t, &c->btree_cache, list) {
		touched++;

		if (freed >= nr) {
			/* Save position */
			if (&t->list != &c->btree_cache)
				list_move_tail(&c->btree_cache, &t->list);
			break;
		}

		if (!btree_node_accessed(b) &&
		    !btree_node_reclaim(c, b)) {
			/* can't call bch2_btree_node_hash_remove under btree_cache_lock  */
			freed++;
			if (&t->list != &c->btree_cache)
				list_move_tail(&c->btree_cache, &t->list);

			mca_data_free(c, b);
			mutex_unlock(&c->btree_cache_lock);

			bch2_btree_node_hash_remove(c, b);
			six_unlock_write(&b->lock);
			six_unlock_intent(&b->lock);

			if (freed >= nr)
				goto out;

			if (sc->gfp_mask & __GFP_IO)
				mutex_lock(&c->btree_cache_lock);
			else if (!mutex_trylock(&c->btree_cache_lock))
				goto out;
			goto restart;
		} else
			clear_btree_node_accessed(b);
	}

	mutex_unlock(&c->btree_cache_lock);
out:
	return (unsigned long) freed * btree_pages(c);
}

static unsigned long bch2_mca_count(struct shrinker *shrink,
				    struct shrink_control *sc)
{
	struct bch_fs *c = container_of(shrink, struct bch_fs,
					   btree_cache_shrink);

	if (btree_shrinker_disabled(c))
		return 0;

	if (c->btree_cache_alloc_lock)
		return 0;

	return mca_can_free(c) * btree_pages(c);
}

void bch2_fs_btree_exit(struct bch_fs *c)
{
	struct btree *b;
	unsigned i;

	if (c->btree_cache_shrink.list.next)
		unregister_shrinker(&c->btree_cache_shrink);

	mutex_lock(&c->btree_cache_lock);

#ifdef CONFIG_BCACHEFS_DEBUG
	if (c->verify_data)
		list_move(&c->verify_data->list, &c->btree_cache);

	free_pages((unsigned long) c->verify_ondisk, ilog2(btree_pages(c)));
#endif

	for (i = 0; i < BTREE_ID_NR; i++)
		if (c->btree_roots[i].b)
			list_add(&c->btree_roots[i].b->list, &c->btree_cache);

	list_splice(&c->btree_cache_freeable,
		    &c->btree_cache);

	while (!list_empty(&c->btree_cache)) {
		b = list_first_entry(&c->btree_cache, struct btree, list);

		if (btree_node_dirty(b))
			bch2_btree_complete_write(c, b, btree_current_write(b));
		clear_btree_node_dirty(b);

		mca_data_free(c, b);
	}

	while (!list_empty(&c->btree_cache_freed)) {
		b = list_first_entry(&c->btree_cache_freed,
				     struct btree, list);
		list_del(&b->list);
		kfree(b);
	}

	mutex_unlock(&c->btree_cache_lock);

	if (c->btree_cache_table_init_done)
		rhashtable_destroy(&c->btree_cache_table);
}

int bch2_fs_btree_init(struct bch_fs *c)
{
	unsigned i;
	int ret;

	ret = rhashtable_init(&c->btree_cache_table, &bch_btree_cache_params);
	if (ret)
		return ret;

	c->btree_cache_table_init_done = true;

	bch2_recalc_btree_reserve(c);

	for (i = 0; i < c->btree_cache_reserve; i++)
		if (!mca_bucket_alloc(c, GFP_KERNEL))
			return -ENOMEM;

	list_splice_init(&c->btree_cache,
			 &c->btree_cache_freeable);

#ifdef CONFIG_BCACHEFS_DEBUG
	mutex_init(&c->verify_lock);

	c->verify_ondisk = (void *)
		__get_free_pages(GFP_KERNEL, ilog2(btree_pages(c)));
	if (!c->verify_ondisk)
		return -ENOMEM;

	c->verify_data = mca_bucket_alloc(c, GFP_KERNEL);
	if (!c->verify_data)
		return -ENOMEM;

	list_del_init(&c->verify_data->list);
#endif

	c->btree_cache_shrink.count_objects = bch2_mca_count;
	c->btree_cache_shrink.scan_objects = bch2_mca_scan;
	c->btree_cache_shrink.seeks = 4;
	c->btree_cache_shrink.batch = btree_pages(c) * 2;
	register_shrinker(&c->btree_cache_shrink);

	return 0;
}

/*
 * We can only have one thread cannibalizing other cached btree nodes at a time,
 * or we'll deadlock. We use an open coded mutex to ensure that, which a
 * cannibalize_bucket() will take. This means every time we unlock the root of
 * the btree, we need to release this lock if we have it held.
 */
void bch2_btree_node_cannibalize_unlock(struct bch_fs *c)
{
	if (c->btree_cache_alloc_lock == current) {
		trace_btree_node_cannibalize_unlock(c);
		c->btree_cache_alloc_lock = NULL;
		closure_wake_up(&c->mca_wait);
	}
}

int bch2_btree_node_cannibalize_lock(struct bch_fs *c, struct closure *cl)
{
	struct task_struct *old;

	old = cmpxchg(&c->btree_cache_alloc_lock, NULL, current);
	if (old == NULL || old == current)
		goto success;

	if (!cl) {
		trace_btree_node_cannibalize_lock_fail(c);
		return -ENOMEM;
	}

	closure_wait(&c->mca_wait, cl);

	/* Try again, after adding ourselves to waitlist */
	old = cmpxchg(&c->btree_cache_alloc_lock, NULL, current);
	if (old == NULL || old == current) {
		/* We raced */
		closure_wake_up(&c->mca_wait);
		goto success;
	}

	trace_btree_node_cannibalize_lock_fail(c);
	return -EAGAIN;

success:
	trace_btree_node_cannibalize_lock(c);
	return 0;
}

static struct btree *mca_cannibalize(struct bch_fs *c)
{
	struct btree *b;

	list_for_each_entry_reverse(b, &c->btree_cache, list)
		if (!btree_node_reclaim(c, b))
			return b;

	while (1) {
		list_for_each_entry_reverse(b, &c->btree_cache, list)
			if (!btree_node_write_and_reclaim(c, b))
				return b;

		/*
		 * Rare case: all nodes were intent-locked.
		 * Just busy-wait.
		 */
		WARN_ONCE(1, "btree cache cannibalize failed\n");
		cond_resched();
	}
}

struct btree *bch2_btree_node_mem_alloc(struct bch_fs *c)
{
	struct btree *b;
	u64 start_time = local_clock();

	mutex_lock(&c->btree_cache_lock);

	/*
	 * btree_free() doesn't free memory; it sticks the node on the end of
	 * the list. Check if there's any freed nodes there:
	 */
	list_for_each_entry(b, &c->btree_cache_freeable, list)
		if (!btree_node_reclaim(c, b))
			goto out_unlock;

	/*
	 * We never free struct btree itself, just the memory that holds the on
	 * disk node. Check the freed list before allocating a new one:
	 */
	list_for_each_entry(b, &c->btree_cache_freed, list)
		if (!btree_node_reclaim(c, b)) {
			mca_data_alloc(c, b, __GFP_NOWARN|GFP_NOIO);
			if (b->data)
				goto out_unlock;

			six_unlock_write(&b->lock);
			six_unlock_intent(&b->lock);
			goto err;
		}

	b = mca_bucket_alloc(c, __GFP_NOWARN|GFP_NOIO);
	if (!b)
		goto err;

	BUG_ON(!six_trylock_intent(&b->lock));
	BUG_ON(!six_trylock_write(&b->lock));
out_unlock:
	BUG_ON(bkey_extent_is_data(&b->key.k) && PTR_HASH(&b->key));
	BUG_ON(btree_node_write_in_flight(b));

	list_del_init(&b->list);
	mutex_unlock(&c->btree_cache_lock);
out:
	b->flags		= 0;
	b->written		= 0;
	b->nsets		= 0;
	b->sib_u64s[0]		= 0;
	b->sib_u64s[1]		= 0;
	b->whiteout_u64s	= 0;
	b->uncompacted_whiteout_u64s = 0;
	bch2_btree_keys_init(b, &c->expensive_debug_checks);

	bch2_time_stats_update(&c->btree_node_mem_alloc_time, start_time);

	return b;
err:
	/* Try to cannibalize another cached btree node: */
	if (c->btree_cache_alloc_lock == current) {
		b = mca_cannibalize(c);
		list_del_init(&b->list);
		mutex_unlock(&c->btree_cache_lock);

		bch2_btree_node_hash_remove(c, b);

		trace_btree_node_cannibalize(c);
		goto out;
	}

	mutex_unlock(&c->btree_cache_lock);
	return ERR_PTR(-ENOMEM);
}

/* Slowpath, don't want it inlined into btree_iter_traverse() */
static noinline struct btree *bch2_btree_node_fill(struct btree_iter *iter,
						   const struct bkey_i *k,
						   unsigned level,
						   enum six_lock_type lock_type)
{
	struct bch_fs *c = iter->c;
	struct btree *b;

	b = bch2_btree_node_mem_alloc(c);
	if (IS_ERR(b))
		return b;

	bkey_copy(&b->key, k);
	if (bch2_btree_node_hash_insert(c, b, level, iter->btree_id)) {
		/* raced with another fill: */

		/* mark as unhashed... */
		bkey_i_to_extent(&b->key)->v._data[0] = 0;

		mutex_lock(&c->btree_cache_lock);
		list_add(&b->list, &c->btree_cache_freeable);
		mutex_unlock(&c->btree_cache_lock);

		six_unlock_write(&b->lock);
		six_unlock_intent(&b->lock);
		return NULL;
	}

	/*
	 * If the btree node wasn't cached, we can't drop our lock on
	 * the parent until after it's added to the cache - because
	 * otherwise we could race with a btree_split() freeing the node
	 * we're trying to lock.
	 *
	 * But the deadlock described below doesn't exist in this case,
	 * so it's safe to not drop the parent lock until here:
	 */
	if (btree_node_read_locked(iter, level + 1))
		btree_node_unlock(iter, level + 1);

	bch2_btree_node_read(c, b);
	six_unlock_write(&b->lock);

	if (lock_type == SIX_LOCK_read)
		six_lock_downgrade(&b->lock);

	return b;
}

/**
 * bch_btree_node_get - find a btree node in the cache and lock it, reading it
 * in from disk if necessary.
 *
 * If IO is necessary and running under generic_make_request, returns -EAGAIN.
 *
 * The btree node will have either a read or a write lock held, depending on
 * the @write parameter.
 */
struct btree *bch2_btree_node_get(struct btree_iter *iter,
				  const struct bkey_i *k, unsigned level,
				  enum six_lock_type lock_type)
{
	struct btree *b;
	struct bset_tree *t;

	BUG_ON(level >= BTREE_MAX_DEPTH);
retry:
	rcu_read_lock();
	b = mca_find(iter->c, k);
	rcu_read_unlock();

	if (unlikely(!b)) {
		/*
		 * We must have the parent locked to call bch2_btree_node_fill(),
		 * else we could read in a btree node from disk that's been
		 * freed:
		 */
		b = bch2_btree_node_fill(iter, k, level, lock_type);

		/* We raced and found the btree node in the cache */
		if (!b)
			goto retry;

		if (IS_ERR(b))
			return b;
	} else {
		/*
		 * There's a potential deadlock with splits and insertions into
		 * interior nodes we have to avoid:
		 *
		 * The other thread might be holding an intent lock on the node
		 * we want, and they want to update its parent node so they're
		 * going to upgrade their intent lock on the parent node to a
		 * write lock.
		 *
		 * But if we're holding a read lock on the parent, and we're
		 * trying to get the intent lock they're holding, we deadlock.
		 *
		 * So to avoid this we drop the read locks on parent nodes when
		 * we're starting to take intent locks - and handle the race.
		 *
		 * The race is that they might be about to free the node we
		 * want, and dropping our read lock on the parent node lets them
		 * update the parent marking the node we want as freed, and then
		 * free it:
		 *
		 * To guard against this, btree nodes are evicted from the cache
		 * when they're freed - and PTR_HASH() is zeroed out, which we
		 * check for after we lock the node.
		 *
		 * Then, bch2_btree_node_relock() on the parent will fail - because
		 * the parent was modified, when the pointer to the node we want
		 * was removed - and we'll bail out:
		 */
		if (btree_node_read_locked(iter, level + 1))
			btree_node_unlock(iter, level + 1);

		if (!btree_node_lock(b, k->k.p, level, iter, lock_type))
			return ERR_PTR(-EINTR);

		if (unlikely(PTR_HASH(&b->key) != PTR_HASH(k) ||
			     b->level != level ||
			     race_fault())) {
			six_unlock_type(&b->lock, lock_type);
			if (bch2_btree_node_relock(iter, level + 1))
				goto retry;

			return ERR_PTR(-EINTR);
		}
	}

	prefetch(b->aux_data);

	for_each_bset(b, t) {
		void *p = (u64 *) b->aux_data + t->aux_data_offset;

		prefetch(p + L1_CACHE_BYTES * 0);
		prefetch(p + L1_CACHE_BYTES * 1);
		prefetch(p + L1_CACHE_BYTES * 2);
	}

	/* avoid atomic set bit if it's not needed: */
	if (btree_node_accessed(b))
		set_btree_node_accessed(b);

	if (unlikely(btree_node_read_error(b))) {
		six_unlock_type(&b->lock, lock_type);
		return ERR_PTR(-EIO);
	}

	EBUG_ON(!b->written);
	EBUG_ON(b->btree_id != iter->btree_id ||
		BTREE_NODE_LEVEL(b->data) != level ||
		bkey_cmp(b->data->max_key, k->k.p));

	return b;
}

int bch2_print_btree_node(struct bch_fs *c, struct btree *b,
			  char *buf, size_t len)
{
	const struct bkey_format *f = &b->format;
	struct bset_stats stats;
	char ptrs[100];

	memset(&stats, 0, sizeof(stats));

	bch2_val_to_text(c, BKEY_TYPE_BTREE, ptrs, sizeof(ptrs),
			bkey_i_to_s_c(&b->key));
	bch2_btree_keys_stats(b, &stats);

	return scnprintf(buf, len,
			 "l %u %llu:%llu - %llu:%llu:\n"
			 "    ptrs: %s\n"
			 "    format: u64s %u fields %u %u %u %u %u\n"
			 "    unpack fn len: %u\n"
			 "    bytes used %zu/%zu (%zu%% full)\n"
			 "    sib u64s: %u, %u (merge threshold %zu)\n"
			 "    nr packed keys %u\n"
			 "    nr unpacked keys %u\n"
			 "    floats %zu\n"
			 "    failed unpacked %zu\n"
			 "    failed prev %zu\n"
			 "    failed overflow %zu\n",
			 b->level,
			 b->data->min_key.inode,
			 b->data->min_key.offset,
			 b->data->max_key.inode,
			 b->data->max_key.offset,
			 ptrs,
			 f->key_u64s,
			 f->bits_per_field[0],
			 f->bits_per_field[1],
			 f->bits_per_field[2],
			 f->bits_per_field[3],
			 f->bits_per_field[4],
			 b->unpack_fn_len,
			 b->nr.live_u64s * sizeof(u64),
			 btree_bytes(c) - sizeof(struct btree_node),
			 b->nr.live_u64s * 100 / btree_max_u64s(c),
			 b->sib_u64s[0],
			 b->sib_u64s[1],
			 BTREE_FOREGROUND_MERGE_THRESHOLD(c),
			 b->nr.packed_keys,
			 b->nr.unpacked_keys,
			 stats.floats,
			 stats.failed_unpacked,
			 stats.failed_prev,
			 stats.failed_overflow);
}