author    | Kent Overstreet <kent.overstreet@gmail.com> | 2017-03-19 15:56:34 -0800
committer | Kent Overstreet <kent.overstreet@gmail.com> | 2017-03-19 17:31:47 -0800
commit    | 5ec39af8eaba49aee7bafa44c661da39e2f40dc3 (patch)
tree      | 1fb1a981602cbf22c7d2b2dba1168c715d7cecb5 /include
parent    | bb1941de5378a7b8122d3575dcbc7d0aeb6326f0 (diff)
Rename from bcache-tools to bcachefs-tools
Diffstat (limited to 'include')

-rw-r--r-- | include/linux/bcache-ioctl.h | 104
-rw-r--r-- | include/linux/bcache.h | 1449
-rw-r--r-- | include/linux/blkdev.h | 6
-rw-r--r-- | include/linux/closure.h | 385
-rw-r--r-- | include/trace/events/bcachefs.h (renamed from include/trace/events/bcache.h) | 672

5 files changed, 518 insertions(+), 2098 deletions(-)
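Most of what follows deletes the old shared headers. The largest, include/linux/bcache.h, opens with the LE32_BITMASK()/LE64_BITMASK() macros, which stamp out an endian-safe getter/setter pair for a bit range [offset, end) within a little-endian on-disk field. A standalone sketch of the pattern these macros generate — plain uint64_t stands in for __le64, so the byte-swapping helpers are omitted, and the struct/field names are illustrative:

```c
#include <stdint.h>
#include <stdio.h>

struct bch_member { uint64_t flags; };  /* stand-in for the real struct */

#define BITMASK64(name, type, field, offset, end)                       \
static inline uint64_t name(const type *k)                              \
{                                                                       \
        return (k->field >> offset) & ~(~0ULL << (end - offset));       \
}                                                                       \
                                                                        \
static inline void SET_##name(type *k, uint64_t v)                      \
{                                                                       \
        k->field &= ~(~(~0ULL << (end - offset)) << offset);            \
        k->field |= (v & ~(~0ULL << (end - offset))) << offset;         \
}

/* Mirrors BCH_MEMBER_STATE: bits [0, 4) of the member flags word */
BITMASK64(MEMBER_STATE, struct bch_member, flags, 0, 4)

int main(void)
{
        struct bch_member m = { 0 };

        SET_MEMBER_STATE(&m, 2);        /* e.g. BCH_MEMBER_STATE_FAILED */
        printf("state = %llu\n", (unsigned long long) MEMBER_STATE(&m));
        return 0;
}
```

Packing many small on-disk fields into one word this way keeps the format dense, while the generated accessors keep call sites readable.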
diff --git a/include/linux/bcache-ioctl.h b/include/linux/bcache-ioctl.h deleted file mode 100644 index ca769369..00000000 --- a/include/linux/bcache-ioctl.h +++ /dev/null @@ -1,104 +0,0 @@ -#ifndef _LINUX_BCACHE_IOCTL_H -#define _LINUX_BCACHE_IOCTL_H - -#include <linux/bcache.h> -#include <linux/uuid.h> - -#ifdef __cplusplus -extern "C" { -#endif - -#define BCH_FORCE_IF_DATA_LOST (1 << 0) -#define BCH_FORCE_IF_METADATA_LOST (1 << 1) -#define BCH_FORCE_IF_DATA_DEGRADED (1 << 2) -#define BCH_FORCE_IF_METADATA_DEGRADED (1 << 3) - -#define BCH_FORCE_IF_DEGRADED \ - (BCH_FORCE_IF_DATA_DEGRADED| \ - BCH_FORCE_IF_METADATA_DEGRADED) - -#define BCH_BY_UUID (1 << 4) - -/* global control dev: */ - -#define BCH_IOCTL_ASSEMBLE _IOW(0xbc, 1, struct bch_ioctl_assemble) -#define BCH_IOCTL_INCREMENTAL _IOW(0xbc, 2, struct bch_ioctl_incremental) - -struct bch_ioctl_assemble { - __u32 flags; - __u32 nr_devs; - __u64 pad; - __u64 devs[]; -}; - -struct bch_ioctl_incremental { - __u32 flags; - __u64 pad; - __u64 dev; -}; - -/* filesystem ioctls: */ - -#define BCH_IOCTL_QUERY_UUID _IOR(0xbc, 1, struct bch_ioctl_query_uuid) -#define BCH_IOCTL_START _IOW(0xbc, 2, struct bch_ioctl_start) -#define BCH_IOCTL_STOP _IO(0xbc, 3) -#define BCH_IOCTL_DISK_ADD _IOW(0xbc, 4, struct bch_ioctl_disk) -#define BCH_IOCTL_DISK_REMOVE _IOW(0xbc, 5, struct bch_ioctl_disk) -#define BCH_IOCTL_DISK_ONLINE _IOW(0xbc, 6, struct bch_ioctl_disk) -#define BCH_IOCTL_DISK_OFFLINE _IOW(0xbc, 7, struct bch_ioctl_disk) -#define BCH_IOCTL_DISK_SET_STATE _IOW(0xbc, 8, struct bch_ioctl_disk_set_state) -#define BCH_IOCTL_DISK_EVACUATE _IOW(0xbc, 9, struct bch_ioctl_disk) -#define BCH_IOCTL_DATA _IOW(0xbc, 10, struct bch_ioctl_data) - -struct bch_ioctl_query_uuid { - uuid_le uuid; -}; - -struct bch_ioctl_start { - __u32 flags; - __u32 pad; -}; - -struct bch_ioctl_disk { - __u32 flags; - __u32 pad; - __u64 dev; -}; - -struct bch_ioctl_disk_set_state { - __u32 flags; - __u8 new_state; - __u8 pad[3]; - __u64 dev; -}; - -#define BCH_REWRITE_INCREASE_REPLICAS (1 << 0) -#define BCH_REWRITE_DECREASE_REPLICAS (1 << 1) - -#define BCH_REWRITE_RECOMPRESS (1 << 0) -#define BCH_REWRITE_DECREASE_REPLICAS (1 << 1) - -enum bch_data_ops { - BCH_DATA_SCRUB, -}; - -struct bch_data_op { - __u8 type; -}; - -struct bch_ioctl_data { - __u32 flags; - __u32 pad; - - __u64 start_inode; - __u64 start_offset; - - __u64 end_inode; - __u64 end_offset; -}; - -#ifdef __cplusplus -} -#endif - -#endif /* _LINUX_BCACHE_IOCTL_H */ diff --git a/include/linux/bcache.h b/include/linux/bcache.h deleted file mode 100644 index c221747b..00000000 --- a/include/linux/bcache.h +++ /dev/null @@ -1,1449 +0,0 @@ -#ifndef _LINUX_BCACHE_H -#define _LINUX_BCACHE_H - -/* - * Bcache on disk data structures - */ - -#ifdef __cplusplus -typedef bool _Bool; -extern "C" { -#endif - -#include <asm/types.h> -#include <asm/byteorder.h> -#include <linux/uuid.h> - -#define LE32_BITMASK(name, type, field, offset, end) \ -static const unsigned name##_OFFSET = offset; \ -static const unsigned name##_BITS = (end - offset); \ -static const __u64 name##_MAX = (1ULL << (end - offset)) - 1; \ - \ -static inline __u64 name(const type *k) \ -{ \ - return (__le32_to_cpu(k->field) >> offset) & \ - ~(~0ULL << (end - offset)); \ -} \ - \ -static inline void SET_##name(type *k, __u64 v) \ -{ \ - __u64 new = __le32_to_cpu(k->field); \ - \ - new &= ~(~(~0ULL << (end - offset)) << offset); \ - new |= (v & ~(~0ULL << (end - offset))) << offset; \ - k->field = __cpu_to_le32(new); \ -} - -#define LE64_BITMASK(name, type, field, 
offset, end) \ -static const unsigned name##_OFFSET = offset; \ -static const unsigned name##_BITS = (end - offset); \ -static const __u64 name##_MAX = (1ULL << (end - offset)) - 1; \ - \ -static inline __u64 name(const type *k) \ -{ \ - return (__le64_to_cpu(k->field) >> offset) & \ - ~(~0ULL << (end - offset)); \ -} \ - \ -static inline void SET_##name(type *k, __u64 v) \ -{ \ - __u64 new = __le64_to_cpu(k->field); \ - \ - new &= ~(~(~0ULL << (end - offset)) << offset); \ - new |= (v & ~(~0ULL << (end - offset))) << offset; \ - k->field = __cpu_to_le64(new); \ -} - -struct bkey_format { - __u8 key_u64s; - __u8 nr_fields; - /* One unused slot for now: */ - __u8 bits_per_field[6]; - __le64 field_offset[6]; -}; - -/* Btree keys - all units are in sectors */ - -struct bpos { - /* Word order matches machine byte order */ -#if defined(__LITTLE_ENDIAN) - __u32 snapshot; - __u64 offset; - __u64 inode; -#elif defined(__BIG_ENDIAN) - __u64 inode; - __u64 offset; /* Points to end of extent - sectors */ - __u32 snapshot; -#else -#error edit for your odd byteorder. -#endif -} __attribute__((packed, aligned(4))); - -#define KEY_INODE_MAX ((__u64)~0ULL) -#define KEY_OFFSET_MAX ((__u64)~0ULL) -#define KEY_SNAPSHOT_MAX ((__u32)~0U) - -static inline struct bpos POS(__u64 inode, __u64 offset) -{ - struct bpos ret; - - ret.inode = inode; - ret.offset = offset; - ret.snapshot = 0; - - return ret; -} - -#define POS_MIN POS(0, 0) -#define POS_MAX POS(KEY_INODE_MAX, KEY_OFFSET_MAX) - -/* Empty placeholder struct, for container_of() */ -struct bch_val { - __u64 __nothing[0]; -}; - -struct bversion { -#if defined(__LITTLE_ENDIAN) - __u64 lo; - __u32 hi; -#elif defined(__BIG_ENDIAN) - __u32 hi; - __u64 lo; -#endif -} __attribute__((packed, aligned(4))); - -struct bkey { - /* Size of combined key and value, in u64s */ - __u8 u64s; - - /* Format of key (0 for format local to btree node) */ -#if defined(__LITTLE_ENDIAN_BITFIELD) - __u8 format:7, - needs_whiteout:1; -#elif defined (__BIG_ENDIAN_BITFIELD) - __u8 needs_whiteout:1, - format:7; -#else -#error edit for your odd byteorder. -#endif - - /* Type of the value */ - __u8 type; - -#if defined(__LITTLE_ENDIAN) - __u8 pad[1]; - - struct bversion version; - __u32 size; /* extent size, in sectors */ - struct bpos p; -#elif defined(__BIG_ENDIAN) - struct bpos p; - __u32 size; /* extent size, in sectors */ - struct bversion version; - - __u8 pad[1]; -#endif -} __attribute__((packed, aligned(8))); - -struct bkey_packed { - __u64 _data[0]; - - /* Size of combined key and value, in u64s */ - __u8 u64s; - - /* Format of key (0 for format local to btree node) */ - - /* - * XXX: next incompat on disk format change, switch format and - * needs_whiteout - bkey_packed() will be cheaper if format is the high - * bits of the bitfield - */ -#if defined(__LITTLE_ENDIAN_BITFIELD) - __u8 format:7, - needs_whiteout:1; -#elif defined (__BIG_ENDIAN_BITFIELD) - __u8 needs_whiteout:1, - format:7; -#endif - - /* Type of the value */ - __u8 type; - __u8 key_start[0]; - - /* - * We copy bkeys with struct assignment in various places, and while - * that shouldn't be done with packed bkeys we can't disallow it in C, - * and it's legal to cast a bkey to a bkey_packed - so padding it out - * to the same size as struct bkey should hopefully be safest. 
- */ - __u8 pad[sizeof(struct bkey) - 3]; -} __attribute__((packed, aligned(8))); - -#define BKEY_U64s (sizeof(struct bkey) / sizeof(__u64)) -#define KEY_PACKED_BITS_START 24 - -#define KEY_SIZE_MAX ((__u32)~0U) - -#define KEY_FORMAT_LOCAL_BTREE 0 -#define KEY_FORMAT_CURRENT 1 - -enum bch_bkey_fields { - BKEY_FIELD_INODE, - BKEY_FIELD_OFFSET, - BKEY_FIELD_SNAPSHOT, - BKEY_FIELD_SIZE, - BKEY_FIELD_VERSION_HI, - BKEY_FIELD_VERSION_LO, - BKEY_NR_FIELDS, -}; - -#define bkey_format_field(name, field) \ - [BKEY_FIELD_##name] = (sizeof(((struct bkey *) NULL)->field) * 8) - -#define BKEY_FORMAT_CURRENT \ -((struct bkey_format) { \ - .key_u64s = BKEY_U64s, \ - .nr_fields = BKEY_NR_FIELDS, \ - .bits_per_field = { \ - bkey_format_field(INODE, p.inode), \ - bkey_format_field(OFFSET, p.offset), \ - bkey_format_field(SNAPSHOT, p.snapshot), \ - bkey_format_field(SIZE, size), \ - bkey_format_field(VERSION_HI, version.hi), \ - bkey_format_field(VERSION_LO, version.lo), \ - }, \ -}) - -/* bkey with inline value */ -struct bkey_i { - __u64 _data[0]; - - union { - struct { - /* Size of combined key and value, in u64s */ - __u8 u64s; - }; - struct { - struct bkey k; - struct bch_val v; - }; - }; -}; - -#ifndef __cplusplus - -#define KEY(_inode, _offset, _size) \ -((struct bkey) { \ - .u64s = BKEY_U64s, \ - .format = KEY_FORMAT_CURRENT, \ - .p = POS(_inode, _offset), \ - .size = _size, \ -}) - -#else - -static inline struct bkey KEY(__u64 inode, __u64 offset, __u64 size) -{ - struct bkey ret; - - memset(&ret, 0, sizeof(ret)); - ret.u64s = BKEY_U64s; - ret.format = KEY_FORMAT_CURRENT; - ret.p.inode = inode; - ret.p.offset = offset; - ret.size = size; - - return ret; -} - -#endif - -static inline void bkey_init(struct bkey *k) -{ - *k = KEY(0, 0, 0); -} - -#define bkey_bytes(_k) ((_k)->u64s * sizeof(__u64)) - -#define __BKEY_PADDED(key, pad) \ - struct { struct bkey_i key; __u64 key ## _pad[pad]; } - -#define BKEY_VAL_TYPE(name, nr) \ -struct bkey_i_##name { \ - union { \ - struct bkey k; \ - struct bkey_i k_i; \ - }; \ - struct bch_##name v; \ -} - -/* - * - DELETED keys are used internally to mark keys that should be ignored but - * override keys in composition order. Their version number is ignored. - * - * - DISCARDED keys indicate that the data is all 0s because it has been - * discarded. DISCARDs may have a version; if the version is nonzero the key - * will be persistent, otherwise the key will be dropped whenever the btree - * node is rewritten (like DELETED keys). - * - * - ERROR: any read of the data returns a read error, as the data was lost due - * to a failing device. Like DISCARDED keys, they can be removed (overridden) - * by new writes or cluster-wide GC. Node repair can also overwrite them with - * the same or a more recent version number, but not with an older version - * number. -*/ -#define KEY_TYPE_DELETED 0 -#define KEY_TYPE_DISCARD 1 -#define KEY_TYPE_ERROR 2 -#define KEY_TYPE_COOKIE 3 -#define KEY_TYPE_PERSISTENT_DISCARD 4 -#define KEY_TYPE_GENERIC_NR 128 - -struct bch_cookie { - struct bch_val v; - __le64 cookie; -}; -BKEY_VAL_TYPE(cookie, KEY_TYPE_COOKIE); - -/* Extents */ - -/* - * In extent bkeys, the value is a list of pointers (bch_extent_ptr), optionally - * preceded by checksum/compression information (bch_extent_crc32 or - * bch_extent_crc64). 
- * - * One major determining factor in the format of extents is how we handle and - * represent extents that have been partially overwritten and thus trimmed: - * - * If an extent is not checksummed or compressed, when the extent is trimmed we - * don't have to remember the extent we originally allocated and wrote: we can - * merely adjust ptr->offset to point to the start of the start of the data that - * is currently live. The size field in struct bkey records the current (live) - * size of the extent, and is also used to mean "size of region on disk that we - * point to" in this case. - * - * Thus an extent that is not checksummed or compressed will consist only of a - * list of bch_extent_ptrs, with none of the fields in - * bch_extent_crc32/bch_extent_crc64. - * - * When an extent is checksummed or compressed, it's not possible to read only - * the data that is currently live: we have to read the entire extent that was - * originally written, and then return only the part of the extent that is - * currently live. - * - * Thus, in addition to the current size of the extent in struct bkey, we need - * to store the size of the originally allocated space - this is the - * compressed_size and uncompressed_size fields in bch_extent_crc32/64. Also, - * when the extent is trimmed, instead of modifying the offset field of the - * pointer, we keep a second smaller offset field - "offset into the original - * extent of the currently live region". - * - * The other major determining factor is replication and data migration: - * - * Each pointer may have its own bch_extent_crc32/64. When doing a replicated - * write, we will initially write all the replicas in the same format, with the - * same checksum type and compression format - however, when copygc runs later (or - * tiering/cache promotion, anything that moves data), it is not in general - * going to rewrite all the pointers at once - one of the replicas may be in a - * bucket on one device that has very little fragmentation while another lives - * in a bucket that has become heavily fragmented, and thus is being rewritten - * sooner than the rest. - * - * Thus it will only move a subset of the pointers (or in the case of - * tiering/cache promotion perhaps add a single pointer without dropping any - * current pointers), and if the extent has been partially overwritten it must - * write only the currently live portion (or copygc would not be able to reduce - * fragmentation!) - which necessitates a different bch_extent_crc format for - * the new pointer. - * - * But in the interests of space efficiency, we don't want to store one - * bch_extent_crc for each pointer if we don't have to. - * - * Thus, a bch_extent consists of bch_extent_crc32s, bch_extent_crc64s, and - * bch_extent_ptrs appended arbitrarily one after the other. We determine the - * type of a given entry with a scheme similar to utf8 (except we're encoding a - * type, not a size), encoding the type in the position of the first set bit: - * - * bch_extent_crc32 - 0b1 - * bch_extent_ptr - 0b10 - * bch_extent_crc64 - 0b100 - * - * We do it this way because bch_extent_crc32 is _very_ constrained on bits (and - * bch_extent_crc64 is the least constrained). - * - * Then, each bch_extent_crc32/64 applies to the pointers that follow after it, - * until the next bch_extent_crc32/64. - * - * If there are no bch_extent_crcs preceding a bch_extent_ptr, then that pointer - * is neither checksummed nor compressed. 
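The tag scheme described in the comment above can be decoded with a single count-trailing-zeros. One wrinkle worth flagging: the example bit patterns in the comment (crc32 = 0b1, ptr = 0b10) do not match the ordering of enum bch_extent_entry_type defined a little further down (ptr = 0, crc32 = 1, ...); the sketch below follows the enum, under which an entry's first word carries 1 << type in its low bits. Illustrative only — the real iteration code lives in the filesystem, not in this header:

```c
#include <stdint.h>

enum bch_extent_entry_type {
        BCH_EXTENT_ENTRY_ptr    = 0,
        BCH_EXTENT_ENTRY_crc32  = 1,
        BCH_EXTENT_ENTRY_crc64  = 2,
        BCH_EXTENT_ENTRY_crc128 = 3,
};

/* Entry type is the index of the lowest set bit of the tag word. */
static inline enum bch_extent_entry_type
extent_entry_type(uint64_t tag_word)
{
        /* GCC/Clang builtin; tag_word must be nonzero */
        return (enum bch_extent_entry_type) __builtin_ctzll(tag_word);
}

/* Entry size follows from the type, letting a walker advance: */
static inline unsigned extent_entry_u64s(enum bch_extent_entry_type t)
{
        switch (t) {
        case BCH_EXTENT_ENTRY_ptr:    return 1;  /* bch_extent_ptr */
        case BCH_EXTENT_ENTRY_crc32:  return 1;  /* bch_extent_crc32 */
        case BCH_EXTENT_ENTRY_crc64:  return 2;  /* bch_extent_crc64 */
        case BCH_EXTENT_ENTRY_crc128: return 3;  /* bch_extent_crc128 */
        }
        return 0;
}
```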
- */ - -/* 128 bits, sufficient for cryptographic MACs: */ -struct bch_csum { - __le64 lo; - __le64 hi; -} __attribute__((packed, aligned(8))); - -#define BCH_CSUM_NONE 0U -#define BCH_CSUM_CRC32C 1U -#define BCH_CSUM_CRC64 2U -#define BCH_CSUM_CHACHA20_POLY1305_80 3U -#define BCH_CSUM_CHACHA20_POLY1305_128 4U -#define BCH_CSUM_NR 5U - -static inline _Bool bch_csum_type_is_encryption(unsigned type) -{ - switch (type) { - case BCH_CSUM_CHACHA20_POLY1305_80: - case BCH_CSUM_CHACHA20_POLY1305_128: - return true; - default: - return false; - } -} - -enum bch_extent_entry_type { - BCH_EXTENT_ENTRY_ptr = 0, - BCH_EXTENT_ENTRY_crc32 = 1, - BCH_EXTENT_ENTRY_crc64 = 2, - BCH_EXTENT_ENTRY_crc128 = 3, -}; - -#define BCH_EXTENT_ENTRY_MAX 4 - -/* Compressed/uncompressed size are stored biased by 1: */ -struct bch_extent_crc32 { -#if defined(__LITTLE_ENDIAN_BITFIELD) - __u32 type:2, - _compressed_size:7, - _uncompressed_size:7, - offset:7, - _unused:1, - csum_type:4, - compression_type:4; - __u32 csum; -#elif defined (__BIG_ENDIAN_BITFIELD) - __u32 csum; - __u32 compression_type:4, - csum_type:4, - _unused:1, - offset:7, - _uncompressed_size:7, - _compressed_size:7, - type:2; -#endif -} __attribute__((packed, aligned(8))); - -#define CRC32_SIZE_MAX (1U << 7) -#define CRC32_NONCE_MAX 0 - -struct bch_extent_crc64 { -#if defined(__LITTLE_ENDIAN_BITFIELD) - __u64 type:3, - _compressed_size:9, - _uncompressed_size:9, - offset:9, - nonce:10, - csum_type:4, - compression_type:4, - csum_hi:16; -#elif defined (__BIG_ENDIAN_BITFIELD) - __u64 csum_hi:16, - compression_type:4, - csum_type:4, - nonce:10, - offset:9, - _uncompressed_size:9, - _compressed_size:9, - type:3; -#endif - __u64 csum_lo; -} __attribute__((packed, aligned(8))); - -#define CRC64_SIZE_MAX (1U << 9) -#define CRC64_NONCE_MAX ((1U << 10) - 1) - -struct bch_extent_crc128 { -#if defined(__LITTLE_ENDIAN_BITFIELD) - __u64 type:4, - _compressed_size:13, - _uncompressed_size:13, - offset:13, - nonce:13, - csum_type:4, - compression_type:4; -#elif defined (__BIG_ENDIAN_BITFIELD) - __u64 compression_type:4, - csum_type:4, - nonce:14, - offset:13, - _uncompressed_size:13, - _compressed_size:13, - type:3; -#endif - struct bch_csum csum; -} __attribute__((packed, aligned(8))); - -#define CRC128_SIZE_MAX (1U << 13) -#define CRC128_NONCE_MAX ((1U << 13) - 1) - -/* - * Max size of an extent that may require bouncing to read or write - * (checksummed, compressed): 64k - */ -#define BCH_ENCODED_EXTENT_MAX 128U - -/* - * @reservation - pointer hasn't been written to, just reserved - */ -struct bch_extent_ptr { -#if defined(__LITTLE_ENDIAN_BITFIELD) - __u64 type:1, - cached:1, - erasure_coded:1, - reservation:1, - offset:44, /* 8 petabytes */ - dev:8, - gen:8; -#elif defined (__BIG_ENDIAN_BITFIELD) - __u64 gen:8, - dev:8, - offset:44, - reservation:1, - erasure_coded:1, - cached:1, - type:1; -#endif -} __attribute__((packed, aligned(8))); - -struct bch_extent_reservation { -#if defined(__LITTLE_ENDIAN_BITFIELD) - __u64 type:5, - unused:23, - replicas:4, - generation:32; -#elif defined (__BIG_ENDIAN_BITFIELD) - __u64 generation:32, - replicas:4, - unused:23, - type:5; -#endif -}; - -union bch_extent_entry { -#if defined(__LITTLE_ENDIAN) || __BITS_PER_LONG == 64 - unsigned long type; -#elif __BITS_PER_LONG == 32 - struct { - unsigned long pad; - unsigned long type; - }; -#else -#error edit for your odd byteorder. 
-#endif - struct bch_extent_crc32 crc32; - struct bch_extent_crc64 crc64; - struct bch_extent_crc128 crc128; - struct bch_extent_ptr ptr; -}; - -enum { - BCH_EXTENT = 128, - - /* - * This is kind of a hack, we're overloading the type for a boolean that - * really should be part of the value - BCH_EXTENT and BCH_EXTENT_CACHED - * have the same value type: - */ - BCH_EXTENT_CACHED = 129, - - /* - * Persistent reservation: - */ - BCH_RESERVATION = 130, -}; - -struct bch_extent { - struct bch_val v; - - union bch_extent_entry start[0]; - __u64 _data[0]; -} __attribute__((packed, aligned(8))); -BKEY_VAL_TYPE(extent, BCH_EXTENT); - -struct bch_reservation { - struct bch_val v; - - __le32 generation; - __u8 nr_replicas; - __u8 pad[3]; -} __attribute__((packed, aligned(8))); -BKEY_VAL_TYPE(reservation, BCH_RESERVATION); - -/* Maximum size (in u64s) a single pointer could be: */ -#define BKEY_EXTENT_PTR_U64s_MAX\ - ((sizeof(struct bch_extent_crc128) + \ - sizeof(struct bch_extent_ptr)) / sizeof(u64)) - -/* Maximum possible size of an entire extent value: */ -/* There's a hack in the keylist code that needs to be fixed.. */ -#define BKEY_EXTENT_VAL_U64s_MAX \ - (BKEY_EXTENT_PTR_U64s_MAX * BCH_REPLICAS_MAX) - -/* * Maximum possible size of an entire extent, key + value: */ -#define BKEY_EXTENT_U64s_MAX (BKEY_U64s + BKEY_EXTENT_VAL_U64s_MAX) - -/* Btree pointers don't carry around checksums: */ -#define BKEY_BTREE_PTR_VAL_U64s_MAX \ - ((sizeof(struct bch_extent_ptr)) / sizeof(u64) * BCH_REPLICAS_MAX) -#define BKEY_BTREE_PTR_U64s_MAX \ - (BKEY_U64s + BKEY_BTREE_PTR_VAL_U64s_MAX) - -/* Inodes */ - -#define BLOCKDEV_INODE_MAX 4096 - -#define BCACHE_ROOT_INO 4096 - -enum bch_inode_types { - BCH_INODE_FS = 128, - BCH_INODE_BLOCKDEV = 129, -}; - -struct bch_inode { - struct bch_val v; - - __le64 i_hash_seed; - __le32 i_flags; - __le16 i_mode; - __u8 fields[0]; -} __attribute__((packed)); -BKEY_VAL_TYPE(inode, BCH_INODE_FS); - -#define BCH_INODE_FIELDS() \ - BCH_INODE_FIELD(i_atime, 64) \ - BCH_INODE_FIELD(i_ctime, 64) \ - BCH_INODE_FIELD(i_mtime, 64) \ - BCH_INODE_FIELD(i_otime, 64) \ - BCH_INODE_FIELD(i_size, 64) \ - BCH_INODE_FIELD(i_sectors, 64) \ - BCH_INODE_FIELD(i_uid, 32) \ - BCH_INODE_FIELD(i_gid, 32) \ - BCH_INODE_FIELD(i_nlink, 32) \ - BCH_INODE_FIELD(i_generation, 32) \ - BCH_INODE_FIELD(i_dev, 32) - -enum { - /* - * User flags (get/settable with FS_IOC_*FLAGS, correspond to FS_*_FL - * flags) - */ - __BCH_INODE_SYNC = 0, - __BCH_INODE_IMMUTABLE = 1, - __BCH_INODE_APPEND = 2, - __BCH_INODE_NODUMP = 3, - __BCH_INODE_NOATIME = 4, - - __BCH_INODE_I_SIZE_DIRTY= 5, - __BCH_INODE_I_SECTORS_DIRTY= 6, - - /* not implemented yet: */ - __BCH_INODE_HAS_XATTRS = 7, /* has xattrs in xattr btree */ - - /* bits 20+ reserved for packed fields below: */ -}; - -#define BCH_INODE_SYNC (1 << __BCH_INODE_SYNC) -#define BCH_INODE_IMMUTABLE (1 << __BCH_INODE_IMMUTABLE) -#define BCH_INODE_APPEND (1 << __BCH_INODE_APPEND) -#define BCH_INODE_NODUMP (1 << __BCH_INODE_NODUMP) -#define BCH_INODE_NOATIME (1 << __BCH_INODE_NOATIME) -#define BCH_INODE_I_SIZE_DIRTY (1 << __BCH_INODE_I_SIZE_DIRTY) -#define BCH_INODE_I_SECTORS_DIRTY (1 << __BCH_INODE_I_SECTORS_DIRTY) -#define BCH_INODE_HAS_XATTRS (1 << __BCH_INODE_HAS_XATTRS) - -LE32_BITMASK(INODE_STR_HASH, struct bch_inode, i_flags, 20, 24); -LE32_BITMASK(INODE_NR_FIELDS, struct bch_inode, i_flags, 24, 32); - -struct bch_inode_blockdev { - struct bch_val v; - - __le64 i_size; - __le64 i_flags; - - /* Seconds: */ - __le64 i_ctime; - __le64 i_mtime; - - uuid_le i_uuid; - __u8 
i_label[32]; -} __attribute__((packed, aligned(8))); -BKEY_VAL_TYPE(inode_blockdev, BCH_INODE_BLOCKDEV); - -/* Thin provisioned volume, or cache for another block device? */ -LE64_BITMASK(CACHED_DEV, struct bch_inode_blockdev, i_flags, 0, 1) - -/* Dirents */ - -/* - * Dirents (and xattrs) have to implement string lookups; since our b-tree - * doesn't support arbitrary length strings for the key, we instead index by a - * 64 bit hash (currently truncated sha1) of the string, stored in the offset - * field of the key - using linear probing to resolve hash collisions. This also - * provides us with the readdir cookie posix requires. - * - * Linear probing requires us to use whiteouts for deletions, in the event of a - * collision: - */ - -enum { - BCH_DIRENT = 128, - BCH_DIRENT_WHITEOUT = 129, -}; - -struct bch_dirent { - struct bch_val v; - - /* Target inode number: */ - __le64 d_inum; - - /* - * Copy of mode bits 12-15 from the target inode - so userspace can get - * the filetype without having to do a stat() - */ - __u8 d_type; - - __u8 d_name[]; -} __attribute__((packed)); -BKEY_VAL_TYPE(dirent, BCH_DIRENT); - -/* Xattrs */ - -enum { - BCH_XATTR = 128, - BCH_XATTR_WHITEOUT = 129, -}; - -#define BCH_XATTR_INDEX_USER 0 -#define BCH_XATTR_INDEX_POSIX_ACL_ACCESS 1 -#define BCH_XATTR_INDEX_POSIX_ACL_DEFAULT 2 -#define BCH_XATTR_INDEX_TRUSTED 3 -#define BCH_XATTR_INDEX_SECURITY 4 - -struct bch_xattr { - struct bch_val v; - __u8 x_type; - __u8 x_name_len; - __le16 x_val_len; - __u8 x_name[]; -} __attribute__((packed)); -BKEY_VAL_TYPE(xattr, BCH_XATTR); - -/* Superblock */ - -/* Version 0: Cache device - * Version 1: Backing device - * Version 2: Seed pointer into btree node checksum - * Version 3: Cache device with new UUID format - * Version 4: Backing device with data offset - * Version 5: All the incompat changes - * Version 6: Cache device UUIDs all in superblock, another incompat bset change - * Version 7: Encryption (expanded checksum fields), other random things - */ -#define BCACHE_SB_VERSION_CDEV_V0 0 -#define BCACHE_SB_VERSION_BDEV 1 -#define BCACHE_SB_VERSION_CDEV_WITH_UUID 3 -#define BCACHE_SB_VERSION_BDEV_WITH_OFFSET 4 -#define BCACHE_SB_VERSION_CDEV_V2 5 -#define BCACHE_SB_VERSION_CDEV_V3 6 -#define BCACHE_SB_VERSION_CDEV_V4 7 -#define BCACHE_SB_VERSION_CDEV 7 -#define BCACHE_SB_MAX_VERSION 7 - -#define BCH_SB_SECTOR 8 -#define BCH_SB_LABEL_SIZE 32 -#define BCH_SB_MEMBERS_MAX 64 /* XXX kill */ - -struct bch_member { - uuid_le uuid; - __le64 nbuckets; /* device size */ - __le16 first_bucket; /* index of first bucket used */ - __le16 bucket_size; /* sectors */ - __le32 pad; - __le64 last_mount; /* time_t */ - - __le64 flags[2]; -}; - -LE64_BITMASK(BCH_MEMBER_STATE, struct bch_member, flags[0], 0, 4) -LE64_BITMASK(BCH_MEMBER_TIER, struct bch_member, flags[0], 4, 8) -LE64_BITMASK(BCH_MEMBER_HAS_METADATA, struct bch_member, flags[0], 8, 9) -LE64_BITMASK(BCH_MEMBER_HAS_DATA, struct bch_member, flags[0], 9, 10) -LE64_BITMASK(BCH_MEMBER_REPLACEMENT, struct bch_member, flags[0], 10, 14) -LE64_BITMASK(BCH_MEMBER_DISCARD, struct bch_member, flags[0], 14, 15); - -#if 0 -LE64_BITMASK(BCH_MEMBER_NR_READ_ERRORS, struct bch_member, flags[1], 0, 20); -LE64_BITMASK(BCH_MEMBER_NR_WRITE_ERRORS,struct bch_member, flags[1], 20, 40); -#endif - -enum bch_member_state { - BCH_MEMBER_STATE_RW = 0, - BCH_MEMBER_STATE_RO = 1, - BCH_MEMBER_STATE_FAILED = 2, - BCH_MEMBER_STATE_SPARE = 3, - BCH_MEMBER_STATE_NR = 4, -}; - -#define BCH_TIER_MAX 4U - -enum cache_replacement { - CACHE_REPLACEMENT_LRU = 0, - 
CACHE_REPLACEMENT_FIFO = 1, - CACHE_REPLACEMENT_RANDOM = 2, - CACHE_REPLACEMENT_NR = 3, -}; - -struct bch_sb_layout { - uuid_le magic; /* bcache superblock UUID */ - __u8 layout_type; - __u8 sb_max_size_bits; /* base 2 of 512 byte sectors */ - __u8 nr_superblocks; - __u8 pad[5]; - __u64 sb_offset[61]; -} __attribute__((packed)); - -#define BCH_SB_LAYOUT_SECTOR 7 - -struct bch_sb_field { - __u64 _data[0]; - __le32 u64s; - __le32 type; -}; - -enum bch_sb_field_type { - BCH_SB_FIELD_journal = 0, - BCH_SB_FIELD_members = 1, - BCH_SB_FIELD_crypt = 2, - BCH_SB_FIELD_NR = 3, -}; - -struct bch_sb_field_journal { - struct bch_sb_field field; - __le64 buckets[0]; -}; - -struct bch_sb_field_members { - struct bch_sb_field field; - struct bch_member members[0]; -}; - -/* Crypto: */ - -struct nonce { - __le32 d[4]; -}; - -struct bch_key { - __le64 key[4]; -}; - -#define BCH_KEY_MAGIC \ - (((u64) 'b' << 0)|((u64) 'c' << 8)| \ - ((u64) 'h' << 16)|((u64) '*' << 24)| \ - ((u64) '*' << 32)|((u64) 'k' << 40)| \ - ((u64) 'e' << 48)|((u64) 'y' << 56)) - -struct bch_encrypted_key { - __le64 magic; - struct bch_key key; -}; - -/* - * If this field is present in the superblock, it stores an encryption key which - * is used encrypt all other data/metadata. The key will normally be encrypted - * with the key userspace provides, but if encryption has been turned off we'll - * just store the master key unencrypted in the superblock so we can access the - * previously encrypted data. - */ -struct bch_sb_field_crypt { - struct bch_sb_field field; - - __le64 flags; - __le64 kdf_flags; - struct bch_encrypted_key key; -}; - -LE64_BITMASK(BCH_CRYPT_KDF_TYPE, struct bch_sb_field_crypt, flags, 0, 4); - -enum bch_kdf_types { - BCH_KDF_SCRYPT = 0, - BCH_KDF_NR = 1, -}; - -/* stored as base 2 log of scrypt params: */ -LE64_BITMASK(BCH_KDF_SCRYPT_N, struct bch_sb_field_crypt, kdf_flags, 0, 16); -LE64_BITMASK(BCH_KDF_SCRYPT_R, struct bch_sb_field_crypt, kdf_flags, 16, 32); -LE64_BITMASK(BCH_KDF_SCRYPT_P, struct bch_sb_field_crypt, kdf_flags, 32, 48); - -struct bch_sb_field_replication { - struct bch_sb_field field; -}; - -/* - * @offset - sector where this sb was written - * @version - on disk format version - * @magic - identifies as a bcache superblock (BCACHE_MAGIC) - * @seq - incremented each time superblock is written - * @uuid - used for generating various magic numbers and identifying - * member devices, never changes - * @user_uuid - user visible UUID, may be changed - * @label - filesystem label - * @seq - identifies most recent superblock, incremented each time - * superblock is written - * @features - enabled incompatible features - */ -struct bch_sb { - struct bch_csum csum; - __le64 version; - uuid_le magic; - uuid_le uuid; - uuid_le user_uuid; - __u8 label[BCH_SB_LABEL_SIZE]; - __le64 offset; - __le64 seq; - - __le16 block_size; - __u8 dev_idx; - __u8 nr_devices; - __le32 u64s; - - __le64 time_base_lo; - __le32 time_base_hi; - __le32 time_precision; - - __le64 flags[8]; - __le64 features[2]; - __le64 compat[2]; - - struct bch_sb_layout layout; - - union { - struct bch_sb_field start[0]; - __le64 _data[0]; - }; -} __attribute__((packed, aligned(8))); - -/* - * Flags: - * BCH_SB_INITALIZED - set on first mount - * BCH_SB_CLEAN - did we shut down cleanly? 
Just a hint, doesn't affect - * behaviour of mount/recovery path: - * BCH_SB_INODE_32BIT - limit inode numbers to 32 bits - * BCH_SB_128_BIT_MACS - 128 bit macs instead of 80 - * BCH_SB_ENCRYPTION_TYPE - if nonzero encryption is enabled; overrides - * DATA/META_CSUM_TYPE. Also indicates encryption - * algorithm in use, if/when we get more than one - */ - -LE64_BITMASK(BCH_SB_INITIALIZED, struct bch_sb, flags[0], 0, 1); -LE64_BITMASK(BCH_SB_CLEAN, struct bch_sb, flags[0], 1, 2); -LE64_BITMASK(BCH_SB_CSUM_TYPE, struct bch_sb, flags[0], 2, 8); -LE64_BITMASK(BCH_SB_ERROR_ACTION, struct bch_sb, flags[0], 8, 12); - -LE64_BITMASK(BCH_SB_BTREE_NODE_SIZE, struct bch_sb, flags[0], 12, 28); - -LE64_BITMASK(BCH_SB_GC_RESERVE, struct bch_sb, flags[0], 28, 33); -LE64_BITMASK(BCH_SB_ROOT_RESERVE, struct bch_sb, flags[0], 33, 40); - -LE64_BITMASK(BCH_SB_META_CSUM_TYPE, struct bch_sb, flags[0], 40, 44); -LE64_BITMASK(BCH_SB_DATA_CSUM_TYPE, struct bch_sb, flags[0], 44, 48); - -LE64_BITMASK(BCH_SB_META_REPLICAS_WANT, struct bch_sb, flags[0], 48, 52); -LE64_BITMASK(BCH_SB_DATA_REPLICAS_WANT, struct bch_sb, flags[0], 52, 56); - -LE64_BITMASK(BCH_SB_META_REPLICAS_HAVE, struct bch_sb, flags[0], 56, 60); -LE64_BITMASK(BCH_SB_DATA_REPLICAS_HAVE, struct bch_sb, flags[0], 60, 64); - -LE64_BITMASK(BCH_SB_STR_HASH_TYPE, struct bch_sb, flags[1], 0, 4); -LE64_BITMASK(BCH_SB_COMPRESSION_TYPE, struct bch_sb, flags[1], 4, 8); -LE64_BITMASK(BCH_SB_INODE_32BIT, struct bch_sb, flags[1], 8, 9); - -LE64_BITMASK(BCH_SB_128_BIT_MACS, struct bch_sb, flags[1], 9, 10); -LE64_BITMASK(BCH_SB_ENCRYPTION_TYPE, struct bch_sb, flags[1], 10, 14); -LE64_BITMASK(BCH_SB_JOURNAL_ENTRY_SIZE, struct bch_sb, flags[1], 14, 20); - -LE64_BITMASK(BCH_SB_META_REPLICAS_REQ, struct bch_sb, flags[1], 20, 24); -LE64_BITMASK(BCH_SB_DATA_REPLICAS_REQ, struct bch_sb, flags[1], 24, 28); - -/* Features: */ -enum bch_sb_features { - BCH_FEATURE_LZ4 = 0, - BCH_FEATURE_GZIP = 1, -}; - -/* options: */ - -#define BCH_REPLICAS_MAX 4U - -#if 0 -#define BCH_ERROR_ACTIONS() \ - x(BCH_ON_ERROR_CONTINUE, 0, "continue") \ - x(BCH_ON_ERROR_RO, 1, "remount-ro") \ - x(BCH_ON_ERROR_PANIC, 2, "panic") \ - x(BCH_NR_ERROR_ACTIONS, 3, NULL) - -enum bch_error_actions { -#define x(_opt, _nr, _str) _opt = _nr, - BCH_ERROR_ACTIONS() -#undef x -}; -#endif - -enum bch_error_actions { - BCH_ON_ERROR_CONTINUE = 0, - BCH_ON_ERROR_RO = 1, - BCH_ON_ERROR_PANIC = 2, - BCH_NR_ERROR_ACTIONS = 3, -}; - -enum bch_csum_opts { - BCH_CSUM_OPT_NONE = 0, - BCH_CSUM_OPT_CRC32C = 1, - BCH_CSUM_OPT_CRC64 = 2, - BCH_CSUM_OPT_NR = 3, -}; - -enum bch_str_hash_opts { - BCH_STR_HASH_CRC32C = 0, - BCH_STR_HASH_CRC64 = 1, - BCH_STR_HASH_SIPHASH = 2, - BCH_STR_HASH_NR = 3, -}; - -enum bch_compression_opts { - BCH_COMPRESSION_NONE = 0, - BCH_COMPRESSION_LZ4 = 1, - BCH_COMPRESSION_GZIP = 2, - BCH_COMPRESSION_NR = 3, -}; - -/* backing device specific stuff: */ - -struct backingdev_sb { - __le64 csum; - __le64 offset; /* sector where this sb was written */ - __le64 version; /* of on disk format */ - - uuid_le magic; /* bcache superblock UUID */ - - uuid_le disk_uuid; - - /* - * Internal cache set UUID - xored with various magic numbers and thus - * must never change: - */ - union { - uuid_le set_uuid; - __le64 set_magic; - }; - __u8 label[BCH_SB_LABEL_SIZE]; - - __le64 flags; - - /* Incremented each time superblock is written: */ - __le64 seq; - - /* - * User visible UUID for identifying the cache set the user is allowed - * to change: - * - * XXX hooked up? 
- */ - uuid_le user_uuid; - __le64 pad1[6]; - - __le64 data_offset; - __le16 block_size; /* sectors */ - __le16 pad2[3]; - - __le32 last_mount; /* time_t */ - __le16 pad3; - /* size of variable length portion - always 0 for backingdev superblock */ - __le16 u64s; - __u64 _data[0]; -}; - -LE64_BITMASK(BDEV_CACHE_MODE, struct backingdev_sb, flags, 0, 4); -#define CACHE_MODE_WRITETHROUGH 0U -#define CACHE_MODE_WRITEBACK 1U -#define CACHE_MODE_WRITEAROUND 2U -#define CACHE_MODE_NONE 3U - -LE64_BITMASK(BDEV_STATE, struct backingdev_sb, flags, 61, 63); -#define BDEV_STATE_NONE 0U -#define BDEV_STATE_CLEAN 1U -#define BDEV_STATE_DIRTY 2U -#define BDEV_STATE_STALE 3U - -#define BDEV_DATA_START_DEFAULT 16 /* sectors */ - -static inline _Bool __SB_IS_BDEV(__u64 version) -{ - return version == BCACHE_SB_VERSION_BDEV - || version == BCACHE_SB_VERSION_BDEV_WITH_OFFSET; -} - -static inline _Bool SB_IS_BDEV(const struct bch_sb *sb) -{ - return __SB_IS_BDEV(sb->version); -} - -/* - * Magic numbers - * - * The various other data structures have their own magic numbers, which are - * xored with the first part of the cache set's UUID - */ - -#define BCACHE_MAGIC \ - UUID_LE(0xf67385c6, 0x1a4e, 0xca45, \ - 0x82, 0x65, 0xf5, 0x7f, 0x48, 0xba, 0x6d, 0x81) - -#define BCACHE_STATFS_MAGIC 0xca451a4e - -#define JSET_MAGIC __cpu_to_le64(0x245235c1a3625032ULL) -#define PSET_MAGIC __cpu_to_le64(0x6750e15f87337f91ULL) -#define BSET_MAGIC __cpu_to_le64(0x90135c78b99e07f5ULL) - -static inline __le64 __bch_sb_magic(struct bch_sb *sb) -{ - __le64 ret; - memcpy(&ret, &sb->uuid, sizeof(ret)); - return ret; -} - -static inline __u64 __jset_magic(struct bch_sb *sb) -{ - return __le64_to_cpu(__bch_sb_magic(sb) ^ JSET_MAGIC); -} - -static inline __u64 __pset_magic(struct bch_sb *sb) -{ - return __le64_to_cpu(__bch_sb_magic(sb) ^ PSET_MAGIC); -} - -static inline __u64 __bset_magic(struct bch_sb *sb) -{ - return __le64_to_cpu(__bch_sb_magic(sb) ^ BSET_MAGIC); -} - -/* Journal */ - -#define BCACHE_JSET_VERSION_UUIDv1 1 -#define BCACHE_JSET_VERSION_UUID 1 /* Always latest UUID format */ -#define BCACHE_JSET_VERSION_JKEYS 2 -#define BCACHE_JSET_VERSION 2 - -struct jset_entry { - __le16 u64s; - __u8 btree_id; - __u8 level; - __le32 flags; /* designates what this jset holds */ - - union { - struct bkey_i start[0]; - __u64 _data[0]; - }; -}; - -#define JSET_KEYS_U64s (sizeof(struct jset_entry) / sizeof(__u64)) - -LE32_BITMASK(JOURNAL_ENTRY_TYPE, struct jset_entry, flags, 0, 8); -enum { - JOURNAL_ENTRY_BTREE_KEYS = 0, - JOURNAL_ENTRY_BTREE_ROOT = 1, - JOURNAL_ENTRY_PRIO_PTRS = 2, - - /* - * Journal sequence numbers can be blacklisted: bsets record the max - * sequence number of all the journal entries they contain updates for, - * so that on recovery we can ignore those bsets that contain index - * updates newer that what made it into the journal. - * - * This means that we can't reuse that journal_seq - we have to skip it, - * and then record that we skipped it so that the next time we crash and - * recover we don't think there was a missing journal entry. - */ - JOURNAL_ENTRY_JOURNAL_SEQ_BLACKLISTED = 3, -}; - -/* - * On disk format for a journal entry: - * seq is monotonically increasing; every journal entry has its own unique - * sequence number. - * - * last_seq is the oldest journal entry that still has keys the btree hasn't - * flushed to disk yet. - * - * version is for on disk format changes. 
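The per-structure magic helpers defined just above (__jset_magic() and friends) each xor the first 64 bits of the filesystem UUID with a fixed constant, so journal or btree metadata written by a different filesystem can never validate. A standalone sketch of the computation — the UUID bytes below are invented, and the __le64 handling of the real fields is elided:

```c
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define JSET_MAGIC 0x245235c1a3625032ULL

int main(void)
{
        uint8_t uuid[16] = { 0xde, 0xad, 0xbe, 0xef }; /* invented; rest zero */
        uint64_t sb_magic;

        /* __bch_sb_magic(): first 8 bytes of the superblock UUID */
        memcpy(&sb_magic, uuid, sizeof(sb_magic));

        /* __jset_magic(): per-filesystem journal magic */
        uint64_t jset_magic = sb_magic ^ JSET_MAGIC;

        printf("jset magic for this fs: %016llx\n",
               (unsigned long long) jset_magic);
        return 0;
}
```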
- */ -struct jset { - struct bch_csum csum; - - __le64 magic; - __le64 seq; - __le32 version; - __le32 flags; - - __le32 u64s; /* size of d[] in u64s */ - - __u8 encrypted_start[0]; - - __le16 read_clock; - __le16 write_clock; - - /* Sequence number of oldest dirty journal entry */ - __le64 last_seq; - - - union { - struct jset_entry start[0]; - __u64 _data[0]; - }; -} __attribute__((packed)); - -LE32_BITMASK(JSET_CSUM_TYPE, struct jset, flags, 0, 4); -LE32_BITMASK(JSET_BIG_ENDIAN, struct jset, flags, 4, 5); - -#define BCH_JOURNAL_BUCKETS_MIN 20 - -/* Bucket prios/gens */ - -struct prio_set { - struct bch_csum csum; - - __le64 magic; - __le32 nonce[3]; - __le16 version; - __le16 flags; - - __u8 encrypted_start[0]; - - __le64 next_bucket; - - struct bucket_disk { - __le16 read_prio; - __le16 write_prio; - __u8 gen; - } __attribute__((packed)) data[]; -} __attribute__((packed)); - -LE32_BITMASK(PSET_CSUM_TYPE, struct prio_set, flags, 0, 4); - -/* Btree: */ - -#define DEFINE_BCH_BTREE_IDS() \ - DEF_BTREE_ID(EXTENTS, 0, "extents") \ - DEF_BTREE_ID(INODES, 1, "inodes") \ - DEF_BTREE_ID(DIRENTS, 2, "dirents") \ - DEF_BTREE_ID(XATTRS, 3, "xattrs") - -#define DEF_BTREE_ID(kwd, val, name) BTREE_ID_##kwd = val, - -enum btree_id { - DEFINE_BCH_BTREE_IDS() - BTREE_ID_NR -}; - -#undef DEF_BTREE_ID - -#define BTREE_MAX_DEPTH 4U - -/* Btree nodes */ - -/* Version 1: Seed pointer into btree node checksum - */ -#define BCACHE_BSET_CSUM 1 -#define BCACHE_BSET_KEY_v1 2 -#define BCACHE_BSET_JOURNAL_SEQ 3 -#define BCACHE_BSET_VERSION 3 - -/* - * Btree nodes - * - * On disk a btree node is a list/log of these; within each set the keys are - * sorted - */ -struct bset { - __le64 seq; - - /* - * Highest journal entry this bset contains keys for. - * If on recovery we don't see that journal entry, this bset is ignored: - * this allows us to preserve the order of all index updates after a - * crash, since the journal records a total order of all index updates - * and anything that didn't make it to the journal doesn't get used. 
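Putting the journal-sequence comments together, recovery can decide whether a given bset is usable with one comparison plus the blacklist check. A pseudocode sketch with invented helper and variable names — the real logic lives in the btree read path, not in this header:

```c
/*
 * Skip bsets whose updates never made it into the recovered journal:
 * the journal defines the total order of index updates, so a bset
 * tagged with a newer (or blacklisted) journal_seq must be ignored.
 */
for_each_bset(b, i) {                           /* hypothetical iterator */
        u64 seq = le64_to_cpu(i->journal_seq);

        if (seq > newest_recovered_journal_seq ||  /* hypothetical */
            journal_seq_is_blacklisted(c, seq))    /* hypothetical */
                continue;

        apply_bset_keys(b, i);                  /* hypothetical */
}
```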
- */ - __le64 journal_seq; - - __le32 flags; - __le16 version; - __le16 u64s; /* count of d[] in u64s */ - - union { - struct bkey_packed start[0]; - __u64 _data[0]; - }; -} __attribute__((packed)); - -LE32_BITMASK(BSET_CSUM_TYPE, struct bset, flags, 0, 4); - -LE32_BITMASK(BSET_BIG_ENDIAN, struct bset, flags, 4, 5); -LE32_BITMASK(BSET_SEPARATE_WHITEOUTS, - struct bset, flags, 5, 6); - -struct btree_node { - struct bch_csum csum; - __le64 magic; - - /* this flags field is encrypted, unlike bset->flags: */ - __le64 flags; - - /* Closed interval: */ - struct bpos min_key; - struct bpos max_key; - struct bch_extent_ptr ptr; - struct bkey_format format; - - union { - struct bset keys; - struct { - __u8 pad[22]; - __le16 u64s; - __u64 _data[0]; - - }; - }; -} __attribute__((packed)); - -LE64_BITMASK(BTREE_NODE_ID, struct btree_node, flags, 0, 4); -LE64_BITMASK(BTREE_NODE_LEVEL, struct btree_node, flags, 4, 8); - -struct btree_node_entry { - struct bch_csum csum; - - union { - struct bset keys; - struct { - __u8 pad[22]; - __le16 u64s; - __u64 _data[0]; - - }; - }; -} __attribute__((packed)); - -/* OBSOLETE */ - -#define BITMASK(name, type, field, offset, end) \ -static const unsigned name##_OFFSET = offset; \ -static const unsigned name##_BITS = (end - offset); \ -static const __u64 name##_MAX = (1ULL << (end - offset)) - 1; \ - \ -static inline __u64 name(const type *k) \ -{ return (k->field >> offset) & ~(~0ULL << (end - offset)); } \ - \ -static inline void SET_##name(type *k, __u64 v) \ -{ \ - k->field &= ~(~(~0ULL << (end - offset)) << offset); \ - k->field |= (v & ~(~0ULL << (end - offset))) << offset; \ -} - -struct bkey_v0 { - __u64 high; - __u64 low; - __u64 ptr[]; -}; - -#define KEY0_FIELD(name, field, offset, size) \ - BITMASK(name, struct bkey_v0, field, offset, size) - -KEY0_FIELD(KEY0_PTRS, high, 60, 63) -KEY0_FIELD(KEY0_CSUM, high, 56, 58) -KEY0_FIELD(KEY0_DIRTY, high, 36, 37) - -KEY0_FIELD(KEY0_SIZE, high, 20, 36) -KEY0_FIELD(KEY0_INODE, high, 0, 20) - -static inline unsigned long bkey_v0_u64s(const struct bkey_v0 *k) -{ - return (sizeof(struct bkey_v0) / sizeof(__u64)) + KEY0_PTRS(k); -} - -static inline struct bkey_v0 *bkey_v0_next(const struct bkey_v0 *k) -{ - __u64 *d = (__u64 *) k; - - return (struct bkey_v0 *) (d + bkey_v0_u64s(k)); -} - -struct jset_v0 { - __u64 csum; - __u64 magic; - __u64 seq; - __u32 version; - __u32 keys; - - __u64 last_seq; - - __BKEY_PADDED(uuid_bucket, 4); - __BKEY_PADDED(btree_root, 4); - __u16 btree_level; - __u16 pad[3]; - - __u64 prio_bucket[64]; - - union { - struct bkey start[0]; - __u64 d[0]; - }; -}; - -/* UUIDS - per backing device/flash only volume metadata */ - -struct uuid_entry_v0 { - uuid_le uuid; - __u8 label[32]; - __u32 first_reg; - __u32 last_reg; - __u32 invalidated; - __u32 pad; -}; - -struct uuid_entry { - union { - struct { - uuid_le uuid; - __u8 label[32]; - __u32 first_reg; - __u32 last_reg; - __u32 invalidated; - - __u32 flags; - /* Size of flash only volumes */ - __u64 sectors; - }; - - __u8 pad[128]; - }; -}; - -BITMASK(UUID_FLASH_ONLY, struct uuid_entry, flags, 0, 1); - -#ifdef __cplusplus -} -#endif -#endif /* _LINUX_BCACHE_H */ - -/* vim: set foldnestmax=2: */ diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 37a04a32..1c793b51 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -85,6 +85,12 @@ struct block_device { void generic_make_request(struct bio *); int submit_bio_wait(struct bio *); + +static inline void submit_bio(struct bio *bio) +{ + generic_make_request(bio); +} + int 
blkdev_issue_discard(struct block_device *, sector_t, sector_t, gfp_t, unsigned long); diff --git a/include/linux/closure.h b/include/linux/closure.h new file mode 100644 index 00000000..33280d30 --- /dev/null +++ b/include/linux/closure.h @@ -0,0 +1,385 @@ +#ifndef _LINUX_CLOSURE_H +#define _LINUX_CLOSURE_H + +#include <linux/llist.h> +#include <linux/sched.h> +#include <linux/workqueue.h> + +/* + * Closure is perhaps the most overused and abused term in computer science, but + * since I've been unable to come up with anything better you're stuck with it + * again. + * + * What are closures? + * + * They embed a refcount. The basic idea is they count "things that are in + * progress" - in flight bios, some other thread that's doing something else - + * anything you might want to wait on. + * + * The refcount may be manipulated with closure_get() and closure_put(). + * closure_put() is where many of the interesting things happen, when it causes + * the refcount to go to 0. + * + * Closures can be used to wait on things both synchronously and asynchronously, + * and synchronous and asynchronous use can be mixed without restriction. To + * wait synchronously, use closure_sync() - you will sleep until your closure's + * refcount hits 1. + * + * To wait asynchronously, use + * continue_at(cl, next_function, workqueue); + * + * passing it, as you might expect, the function to run when nothing is pending + * and the workqueue to run that function out of. + * + * continue_at() also, critically, requires a 'return' immediately following the + * location where this macro is referenced, to return to the calling function. + * There's good reason for this. + * + * To use safely closures asynchronously, they must always have a refcount while + * they are running owned by the thread that is running them. Otherwise, suppose + * you submit some bios and wish to have a function run when they all complete: + * + * foo_endio(struct bio *bio) + * { + * closure_put(cl); + * } + * + * closure_init(cl); + * + * do_stuff(); + * closure_get(cl); + * bio1->bi_endio = foo_endio; + * bio_submit(bio1); + * + * do_more_stuff(); + * closure_get(cl); + * bio2->bi_endio = foo_endio; + * bio_submit(bio2); + * + * continue_at(cl, complete_some_read, system_wq); + * + * If closure's refcount started at 0, complete_some_read() could run before the + * second bio was submitted - which is almost always not what you want! More + * importantly, it wouldn't be possible to say whether the original thread or + * complete_some_read()'s thread owned the closure - and whatever state it was + * associated with! + * + * So, closure_init() initializes a closure's refcount to 1 - and when a + * closure_fn is run, the refcount will be reset to 1 first. + * + * Then, the rule is - if you got the refcount with closure_get(), release it + * with closure_put() (i.e, in a bio->bi_endio function). If you have a refcount + * on a closure because you called closure_init() or you were run out of a + * closure - _always_ use continue_at(). Doing so consistently will help + * eliminate an entire class of particularly pernicious races. + * + * Lastly, you might have a wait list dedicated to a specific event, and have no + * need for specifying the condition - you just want to wait until someone runs + * closure_wake_up() on the appropriate wait list. In that case, just use + * closure_wait(). 
It will return either true or false, depending on whether the + * closure was already on a wait list or not - a closure can only be on one wait + * list at a time. + * + * Parents: + * + * closure_init() takes two arguments - it takes the closure to initialize, and + * a (possibly null) parent. + * + * If parent is non null, the new closure will have a refcount for its lifetime; + * a closure is considered to be "finished" when its refcount hits 0 and the + * function to run is null. Hence + * + * continue_at(cl, NULL, NULL); + * + * returns up the (spaghetti) stack of closures, precisely like normal return + * returns up the C stack. continue_at() with non null fn is better thought of + * as doing a tail call. + * + * All this implies that a closure should typically be embedded in a particular + * struct (which its refcount will normally control the lifetime of), and that + * struct can very much be thought of as a stack frame. + */ + +struct closure; +struct closure_syncer; +typedef void (closure_fn) (struct closure *); + +struct closure_waitlist { + struct llist_head list; +}; + +enum closure_state { + /* + * CLOSURE_WAITING: Set iff the closure is on a waitlist. Must be set by + * the thread that owns the closure, and cleared by the thread that's + * waking up the closure. + * + * The rest are for debugging and don't affect behaviour: + * + * CLOSURE_RUNNING: Set when a closure is running (i.e. by + * closure_init() and when closure_put() runs then next function), and + * must be cleared before remaining hits 0. Primarily to help guard + * against incorrect usage and accidentally transferring references. + * continue_at() and closure_return() clear it for you, if you're doing + * something unusual you can use closure_set_dead() which also helps + * annotate where references are being transferred. + */ + + CLOSURE_BITS_START = (1U << 27), + CLOSURE_DESTRUCTOR = (1U << 27), + CLOSURE_WAITING = (1U << 29), + CLOSURE_RUNNING = (1U << 31), +}; + +#define CLOSURE_GUARD_MASK \ + ((CLOSURE_DESTRUCTOR|CLOSURE_WAITING|CLOSURE_RUNNING) << 1) + +#define CLOSURE_REMAINING_MASK (CLOSURE_BITS_START - 1) +#define CLOSURE_REMAINING_INITIALIZER (1|CLOSURE_RUNNING) + +struct closure { + union { + struct { + struct workqueue_struct *wq; + struct closure_syncer *s; + struct llist_node list; + closure_fn *fn; + }; + struct work_struct work; + }; + + struct closure *parent; + + atomic_t remaining; + +#ifdef CONFIG_DEBUG_CLOSURES +#define CLOSURE_MAGIC_DEAD 0xc054dead +#define CLOSURE_MAGIC_ALIVE 0xc054a11e + + unsigned magic; + struct list_head all; + unsigned long ip; + unsigned long waiting_on; +#endif +}; + +void closure_sub(struct closure *cl, int v); +void closure_put(struct closure *cl); +void __closure_wake_up(struct closure_waitlist *list); +bool closure_wait(struct closure_waitlist *list, struct closure *cl); +void __closure_sync(struct closure *cl); + +/** + * closure_sync - sleep until a closure a closure has nothing left to wait on + * + * Sleeps until the refcount hits 1 - the thread that's running the closure owns + * the last refcount. 
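The pattern the long comment above walks through, condensed into a compilable shape: synchronous use, where the stack-allocated closure's initial ref (from closure_init_stack()) is the one closure_sync() waits down to. The bio field names follow the shim headers in this tree; error handling is omitted:

```c
static void foo_endio(struct bio *bio)
{
        struct closure *cl = bio->bi_private;

        closure_put(cl);                /* drop the ref taken before submit */
}

static void read_both(struct bio *bio1, struct bio *bio2)
{
        struct closure cl;

        closure_init_stack(&cl);        /* refcount starts at 1: we own it */

        closure_get(&cl);               /* ref for bio1's completion */
        bio1->bi_private = &cl;
        bio1->bi_end_io  = foo_endio;
        submit_bio(bio1);

        closure_get(&cl);               /* ref for bio2's completion */
        bio2->bi_private = &cl;
        bio2->bi_end_io  = foo_endio;
        submit_bio(bio2);

        closure_sync(&cl);              /* sleep until only our ref remains */
}
```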
+ */ +static inline void closure_sync(struct closure *cl) +{ + if ((atomic_read(&cl->remaining) & CLOSURE_REMAINING_MASK) != 1) + __closure_sync(cl); +} + +#ifdef CONFIG_DEBUG_CLOSURES + +void closure_debug_create(struct closure *cl); +void closure_debug_destroy(struct closure *cl); + +#else + +static inline void closure_debug_create(struct closure *cl) {} +static inline void closure_debug_destroy(struct closure *cl) {} + +#endif + +static inline void closure_set_ip(struct closure *cl) +{ +#ifdef CONFIG_DEBUG_CLOSURES + cl->ip = _THIS_IP_; +#endif +} + +static inline void closure_set_ret_ip(struct closure *cl) +{ +#ifdef CONFIG_DEBUG_CLOSURES + cl->ip = _RET_IP_; +#endif +} + +static inline void closure_set_waiting(struct closure *cl, unsigned long f) +{ +#ifdef CONFIG_DEBUG_CLOSURES + cl->waiting_on = f; +#endif +} + +static inline void closure_set_stopped(struct closure *cl) +{ + atomic_sub(CLOSURE_RUNNING, &cl->remaining); +} + +static inline void set_closure_fn(struct closure *cl, closure_fn *fn, + struct workqueue_struct *wq) +{ + closure_set_ip(cl); + cl->fn = fn; + cl->wq = wq; + /* between atomic_dec() in closure_put() */ + smp_mb__before_atomic(); +} + +static inline void closure_queue(struct closure *cl) +{ + struct workqueue_struct *wq = cl->wq; + + if (wq) { + INIT_WORK(&cl->work, cl->work.func); + queue_work(wq, &cl->work); + } else + cl->fn(cl); +} + +/** + * closure_get - increment a closure's refcount + */ +static inline void closure_get(struct closure *cl) +{ +#ifdef CONFIG_DEBUG_CLOSURES + BUG_ON((atomic_inc_return(&cl->remaining) & + CLOSURE_REMAINING_MASK) <= 1); +#else + atomic_inc(&cl->remaining); +#endif +} + +/** + * closure_init - Initialize a closure, setting the refcount to 1 + * @cl: closure to initialize + * @parent: parent of the new closure. cl will take a refcount on it for its + * lifetime; may be NULL. + */ +static inline void closure_init(struct closure *cl, struct closure *parent) +{ + cl->fn = NULL; + cl->parent = parent; + if (parent) + closure_get(parent); + + atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER); + + closure_debug_create(cl); + closure_set_ip(cl); +} + +static inline void closure_init_stack(struct closure *cl) +{ + memset(cl, 0, sizeof(struct closure)); + atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER); +} + +/** + * closure_wake_up - wake up all closures on a wait list. + */ +static inline void closure_wake_up(struct closure_waitlist *list) +{ + smp_mb(); + __closure_wake_up(list); +} + +#define continue_at_noreturn(_cl, _fn, _wq) \ +do { \ + set_closure_fn(_cl, _fn, _wq); \ + closure_sub(_cl, CLOSURE_RUNNING + 1); \ +} while (0) + +/** + * continue_at - jump to another function with barrier + * + * After @cl is no longer waiting on anything (i.e. all outstanding refs have + * been dropped with closure_put()), it will resume execution at @fn running out + * of @wq (or, if @wq is NULL, @fn will be called by closure_put() directly). + * + * NOTE: This macro expands to a return in the calling function! + * + * This is because after calling continue_at() you no longer have a ref on @cl, + * and whatever @cl owns may be freed out from under you - a running closure fn + * has a ref on its own closure which continue_at() drops. 
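And the asynchronous counterpart: once every ref taken for in-flight I/O is dropped, write_done() runs out of system_wq. Note the macro's hidden return, as warned above. struct my_op and the two helpers are invented for this sketch:

```c
static void write_done(struct closure *cl)
{
        struct my_op *op = container_of(cl, struct my_op, cl);

        finish_op(op);                  /* hypothetical */
        closure_return(cl);             /* done: drops ref on parent, if any */
}

static void start_write(struct my_op *op)
{
        closure_init(&op->cl, NULL);

        submit_all_bios(op);            /* each bio holds a closure_get() ref */

        /* expands to a return: we no longer own a ref on op->cl */
        continue_at(&op->cl, write_done, system_wq);
}
```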
+ */ +#define continue_at(_cl, _fn, _wq) \ +do { \ + continue_at_noreturn(_cl, _fn, _wq); \ + return; \ +} while (0) + +/** + * closure_return - finish execution of a closure + * + * This is used to indicate that @cl is finished: when all outstanding refs on + * @cl have been dropped @cl's ref on its parent closure (as passed to + * closure_init()) will be dropped, if one was specified - thus this can be + * thought of as returning to the parent closure. + */ +#define closure_return(_cl) continue_at((_cl), NULL, NULL) + +/** + * continue_at_nobarrier - jump to another function without barrier + * + * Causes @fn to be executed out of @cl, in @wq context (or called directly if + * @wq is NULL). + * + * NOTE: like continue_at(), this macro expands to a return in the caller! + * + * The ref the caller of continue_at_nobarrier() had on @cl is now owned by @fn, + * thus it's not safe to touch anything protected by @cl after a + * continue_at_nobarrier(). + */ +#define continue_at_nobarrier(_cl, _fn, _wq) \ +do { \ + closure_set_ip(_cl); \ + if (_wq) { \ + INIT_WORK(&(_cl)->work, (void *) _fn); \ + queue_work((_wq), &(_cl)->work); \ + } else { \ + (_fn)(_cl); \ + } \ + return; \ +} while (0) + +#define closure_return_with_destructor_noreturn(_cl, _destructor) \ +do { \ + set_closure_fn(_cl, _destructor, NULL); \ + closure_sub(_cl, CLOSURE_RUNNING - CLOSURE_DESTRUCTOR + 1); \ +} while (0) + +/** + * closure_return - finish execution of a closure, with destructor + * + * Works like closure_return(), except @destructor will be called when all + * outstanding refs on @cl have been dropped; @destructor may be used to safely + * free the memory occupied by @cl, and it is called with the ref on the parent + * closure still held - so @destructor could safely return an item to a + * freelist protected by @cl's parent. + */ +#define closure_return_with_destructor(_cl, _destructor) \ +do { \ + closure_return_with_destructor_noreturn(_cl, _destructor); \ + return; \ +} while (0) + +/** + * closure_call - execute @fn out of a new, uninitialized closure + * + * Typically used when running out of one closure, and we want to run @fn + * asynchronously out of a new closure - @parent will then wait for @cl to + * finish. 
+ */ +static inline void closure_call(struct closure *cl, closure_fn fn, + struct workqueue_struct *wq, + struct closure *parent) +{ + closure_init(cl, parent); + continue_at_nobarrier(cl, fn, wq); +} + +#endif /* _LINUX_CLOSURE_H */ diff --git a/include/trace/events/bcache.h b/include/trace/events/bcachefs.h index b39fdde7..7dea9d63 100644 --- a/include/trace/events/bcache.h +++ b/include/trace/events/bcachefs.h @@ -1,52 +1,11 @@ #undef TRACE_SYSTEM -#define TRACE_SYSTEM bcache +#define TRACE_SYSTEM bcachefs #if !defined(_TRACE_BCACHE_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_BCACHE_H #include <linux/tracepoint.h> -struct bcache_device; -struct bio; -struct bkey; -struct btree; -struct bch_dev; -struct bch_fs; -struct keylist; -struct moving_queue; - -DECLARE_EVENT_CLASS(bcache_request, - TP_PROTO(struct bcache_device *d, struct bio *bio), - TP_ARGS(d, bio), - - TP_STRUCT__entry( - __field(dev_t, dev ) - __field(unsigned int, orig_major ) - __field(unsigned int, orig_minor ) - __field(sector_t, sector ) - __field(sector_t, orig_sector ) - __field(unsigned int, nr_sector ) - __array(char, rwbs, 6 ) - ), - - TP_fast_assign( - __entry->dev = bio->bi_bdev->bd_dev; - __entry->orig_major = d->disk->major; - __entry->orig_minor = d->disk->first_minor; - __entry->sector = bio->bi_iter.bi_sector; - __entry->orig_sector = bio->bi_iter.bi_sector - 16; - __entry->nr_sector = bio->bi_iter.bi_size >> 9; - blk_fill_rwbs(__entry->rwbs, bio_op(bio), bio->bi_opf, - bio->bi_iter.bi_size); - ), - - TP_printk("%d,%d %s %llu + %u (from %d,%d @ %llu)", - MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->rwbs, (unsigned long long)__entry->sector, - __entry->nr_sector, __entry->orig_major, __entry->orig_minor, - (unsigned long long)__entry->orig_sector) -); - DECLARE_EVENT_CLASS(bpos, TP_PROTO(struct bpos p), TP_ARGS(p), @@ -84,73 +43,47 @@ DECLARE_EVENT_CLASS(bkey, __entry->offset, __entry->size) ); -/* request.c */ - -DEFINE_EVENT(bcache_request, bcache_request_start, - TP_PROTO(struct bcache_device *d, struct bio *bio), - TP_ARGS(d, bio) -); - -DEFINE_EVENT(bcache_request, bcache_request_end, - TP_PROTO(struct bcache_device *d, struct bio *bio), - TP_ARGS(d, bio) -); - -DECLARE_EVENT_CLASS(bcache_bio, - TP_PROTO(struct bio *bio), - TP_ARGS(bio), +DECLARE_EVENT_CLASS(bch_dev, + TP_PROTO(struct bch_dev *ca), + TP_ARGS(ca), TP_STRUCT__entry( - __field(dev_t, dev ) - __field(sector_t, sector ) - __field(unsigned int, nr_sector ) - __array(char, rwbs, 6 ) + __array(char, uuid, 16 ) + __field(unsigned, tier ) ), TP_fast_assign( - __entry->dev = bio->bi_bdev->bd_dev; - __entry->sector = bio->bi_iter.bi_sector; - __entry->nr_sector = bio->bi_iter.bi_size >> 9; - blk_fill_rwbs(__entry->rwbs, bio_op(bio), bio->bi_opf, - bio->bi_iter.bi_size); + memcpy(__entry->uuid, ca->uuid.b, 16); + __entry->tier = ca->mi.tier; ), - TP_printk("%d,%d %s %llu + %u", - MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, - (unsigned long long)__entry->sector, __entry->nr_sector) + TP_printk("%pU tier %u", __entry->uuid, __entry->tier) ); -DEFINE_EVENT(bcache_bio, bcache_bypass_sequential, - TP_PROTO(struct bio *bio), - TP_ARGS(bio) -); +DECLARE_EVENT_CLASS(bch_fs, + TP_PROTO(struct bch_fs *c), + TP_ARGS(c), -DEFINE_EVENT(bcache_bio, bcache_bypass_congested, - TP_PROTO(struct bio *bio), - TP_ARGS(bio) -); + TP_STRUCT__entry( + __array(char, uuid, 16 ) + ), -DEFINE_EVENT(bcache_bio, bcache_promote, - TP_PROTO(struct bio *bio), - TP_ARGS(bio) -); + TP_fast_assign( + memcpy(__entry->uuid, c->sb.user_uuid.b, 16); + ), 
-DEFINE_EVENT(bkey, bcache_promote_collision, - TP_PROTO(const struct bkey *k), - TP_ARGS(k) + TP_printk("%pU", __entry->uuid) ); -TRACE_EVENT(bcache_read, - TP_PROTO(struct bio *bio, bool hit, bool bypass), - TP_ARGS(bio, hit, bypass), +DECLARE_EVENT_CLASS(bio, + TP_PROTO(struct bio *bio), + TP_ARGS(bio), TP_STRUCT__entry( __field(dev_t, dev ) __field(sector_t, sector ) __field(unsigned int, nr_sector ) __array(char, rwbs, 6 ) - __field(bool, cache_hit ) - __field(bool, bypass ) ), TP_fast_assign( @@ -159,49 +92,53 @@ TRACE_EVENT(bcache_read, __entry->nr_sector = bio->bi_iter.bi_size >> 9; blk_fill_rwbs(__entry->rwbs, bio_op(bio), bio->bi_opf, bio->bi_iter.bi_size); - __entry->cache_hit = hit; - __entry->bypass = bypass; ), - TP_printk("%d,%d %s %llu + %u hit %u bypass %u", - MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->rwbs, (unsigned long long)__entry->sector, - __entry->nr_sector, __entry->cache_hit, __entry->bypass) + TP_printk("%d,%d %s %llu + %u", + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, + (unsigned long long)__entry->sector, __entry->nr_sector) ); -TRACE_EVENT(bcache_write, - TP_PROTO(struct bch_fs *c, u64 inode, struct bio *bio, - bool writeback, bool bypass), - TP_ARGS(c, inode, bio, writeback, bypass), +DECLARE_EVENT_CLASS(page_alloc_fail, + TP_PROTO(struct bch_fs *c, u64 size), + TP_ARGS(c, size), TP_STRUCT__entry( - __array(char, uuid, 16 ) - __field(u64, inode ) - __field(sector_t, sector ) - __field(unsigned int, nr_sector ) - __array(char, rwbs, 6 ) - __field(bool, writeback ) - __field(bool, bypass ) + __array(char, uuid, 16 ) + __field(u64, size ) ), TP_fast_assign( memcpy(__entry->uuid, c->sb.user_uuid.b, 16); - __entry->inode = inode; - __entry->sector = bio->bi_iter.bi_sector; - __entry->nr_sector = bio->bi_iter.bi_size >> 9; - blk_fill_rwbs(__entry->rwbs, bio_op(bio), bio->bi_opf, - bio->bi_iter.bi_size); - __entry->writeback = writeback; - __entry->bypass = bypass; + __entry->size = size; ), - TP_printk("%pU inode %llu %s %llu + %u hit %u bypass %u", - __entry->uuid, __entry->inode, - __entry->rwbs, (unsigned long long)__entry->sector, - __entry->nr_sector, __entry->writeback, __entry->bypass) + TP_printk("%pU size %llu", __entry->uuid, __entry->size) +); + +/* io.c: */ + +DEFINE_EVENT(bio, read_split, + TP_PROTO(struct bio *bio), + TP_ARGS(bio) ); -TRACE_EVENT(bcache_write_throttle, +DEFINE_EVENT(bio, read_bounce, + TP_PROTO(struct bio *bio), + TP_ARGS(bio) +); + +DEFINE_EVENT(bio, read_retry, + TP_PROTO(struct bio *bio), + TP_ARGS(bio) +); + +DEFINE_EVENT(bio, promote, + TP_PROTO(struct bio *bio), + TP_ARGS(bio) +); + +TRACE_EVENT(write_throttle, TP_PROTO(struct bch_fs *c, u64 inode, struct bio *bio, u64 delay), TP_ARGS(c, inode, bio, delay), @@ -230,172 +167,24 @@ TRACE_EVENT(bcache_write_throttle, __entry->nr_sector, __entry->delay) ); -DEFINE_EVENT(bcache_bio, bcache_read_retry, - TP_PROTO(struct bio *bio), - TP_ARGS(bio) -); - -DECLARE_EVENT_CLASS(page_alloc_fail, - TP_PROTO(struct bch_fs *c, u64 size), - TP_ARGS(c, size), - - TP_STRUCT__entry( - __array(char, uuid, 16 ) - __field(u64, size ) - ), - - TP_fast_assign( - memcpy(__entry->uuid, c->sb.user_uuid.b, 16); - __entry->size = size; - ), - - TP_printk("%pU size %llu", __entry->uuid, __entry->size) -); - /* Journal */ -DECLARE_EVENT_CLASS(cache_set, - TP_PROTO(struct bch_fs *c), - TP_ARGS(c), - - TP_STRUCT__entry( - __array(char, uuid, 16 ) - ), - - TP_fast_assign( - memcpy(__entry->uuid, c->sb.user_uuid.b, 16); - ), - - TP_printk("%pU", __entry->uuid) -); - -DEFINE_EVENT(bkey, 
bcache_journal_replay_key, - TP_PROTO(const struct bkey *k), - TP_ARGS(k) -); - -TRACE_EVENT(bcache_journal_next_bucket, - TP_PROTO(struct bch_dev *ca, unsigned cur_idx, unsigned last_idx), - TP_ARGS(ca, cur_idx, last_idx), - - TP_STRUCT__entry( - __array(char, uuid, 16 ) - __field(unsigned, cur_idx ) - __field(unsigned, last_idx ) - ), - - TP_fast_assign( - memcpy(__entry->uuid, ca->uuid.b, 16); - __entry->cur_idx = cur_idx; - __entry->last_idx = last_idx; - ), - - TP_printk("%pU cur %u last %u", __entry->uuid, - __entry->cur_idx, __entry->last_idx) -); - -TRACE_EVENT(bcache_journal_write_oldest, - TP_PROTO(struct bch_fs *c, u64 seq), - TP_ARGS(c, seq), - - TP_STRUCT__entry( - __array(char, uuid, 16 ) - __field(u64, seq ) - ), - - TP_fast_assign( - memcpy(__entry->uuid, c->sb.user_uuid.b, 16); - __entry->seq = seq; - ), - - TP_printk("%pU seq %llu", __entry->uuid, __entry->seq) -); - -TRACE_EVENT(bcache_journal_write_oldest_done, - TP_PROTO(struct bch_fs *c, u64 seq, unsigned written), - TP_ARGS(c, seq, written), - - TP_STRUCT__entry( - __array(char, uuid, 16 ) - __field(u64, seq ) - __field(unsigned, written ) - ), - - TP_fast_assign( - memcpy(__entry->uuid, c->sb.user_uuid.b, 16); - __entry->seq = seq; - __entry->written = written; - ), - - TP_printk("%pU seq %llu written %u", __entry->uuid, __entry->seq, - __entry->written) -); - -DEFINE_EVENT(cache_set, bcache_journal_full, +DEFINE_EVENT(bch_fs, journal_full, TP_PROTO(struct bch_fs *c), TP_ARGS(c) ); -DEFINE_EVENT(cache_set, bcache_journal_entry_full, +DEFINE_EVENT(bch_fs, journal_entry_full, TP_PROTO(struct bch_fs *c), TP_ARGS(c) ); -DEFINE_EVENT(bcache_bio, bcache_journal_write, +DEFINE_EVENT(bio, journal_write, TP_PROTO(struct bio *bio), TP_ARGS(bio) ); -/* Device state changes */ - -DEFINE_EVENT(cache_set, fs_read_only, - TP_PROTO(struct bch_fs *c), - TP_ARGS(c) -); - -DEFINE_EVENT(cache_set, fs_read_only_done, - TP_PROTO(struct bch_fs *c), - TP_ARGS(c) -); - -DECLARE_EVENT_CLASS(cache, - TP_PROTO(struct bch_dev *ca), - TP_ARGS(ca), - - TP_STRUCT__entry( - __array(char, uuid, 16 ) - __field(unsigned, tier ) - ), - - TP_fast_assign( - memcpy(__entry->uuid, ca->uuid.b, 16); - __entry->tier = ca->mi.tier; - ), - - TP_printk("%pU tier %u", __entry->uuid, __entry->tier) -); - -DEFINE_EVENT(cache, bcache_cache_read_only, - TP_PROTO(struct bch_dev *ca), - TP_ARGS(ca) -); - -DEFINE_EVENT(cache, bcache_cache_read_only_done, - TP_PROTO(struct bch_dev *ca), - TP_ARGS(ca) -); - -DEFINE_EVENT(cache, bcache_cache_read_write, - TP_PROTO(struct bch_dev *ca), - TP_ARGS(ca) -); - -DEFINE_EVENT(cache, bcache_cache_read_write_done, - TP_PROTO(struct bch_dev *ca), - TP_ARGS(ca) -); - -/* Searching */ +/* bset.c: */ DEFINE_EVENT(bpos, bkey_pack_pos_fail, TP_PROTO(struct bpos p), @@ -431,12 +220,12 @@ DECLARE_EVENT_CLASS(btree_node, __entry->inode, __entry->offset) ); -DEFINE_EVENT(btree_node, bcache_btree_read, +DEFINE_EVENT(btree_node, btree_read, TP_PROTO(struct bch_fs *c, struct btree *b), TP_ARGS(c, b) ); -TRACE_EVENT(bcache_btree_write, +TRACE_EVENT(btree_write, TP_PROTO(struct btree *b, unsigned bytes, unsigned sectors), TP_ARGS(b, bytes, sectors), @@ -456,34 +245,17 @@ TRACE_EVENT(bcache_btree_write, __entry->type , __entry->bytes, __entry->sectors) ); -DEFINE_EVENT(btree_node, bcache_btree_node_alloc, +DEFINE_EVENT(btree_node, btree_node_alloc, TP_PROTO(struct bch_fs *c, struct btree *b), TP_ARGS(c, b) ); -TRACE_EVENT(bcache_btree_node_alloc_fail, - TP_PROTO(struct bch_fs *c, enum btree_id id), - TP_ARGS(c, id), - - TP_STRUCT__entry( - 
__array(char, uuid, 16 ) - __field(enum btree_id, id ) - ), - - TP_fast_assign( - memcpy(__entry->uuid, c->sb.user_uuid.b, 16); - __entry->id = id; - ), - - TP_printk("%pU id %u", __entry->uuid, __entry->id) -); - -DEFINE_EVENT(btree_node, bcache_btree_node_free, +DEFINE_EVENT(btree_node, btree_node_free, TP_PROTO(struct bch_fs *c, struct btree *b), TP_ARGS(c, b) ); -TRACE_EVENT(bcache_mca_reap, +TRACE_EVENT(btree_node_reap, TP_PROTO(struct bch_fs *c, struct btree *b, int ret), TP_ARGS(c, b, ret), @@ -500,33 +272,7 @@ TRACE_EVENT(bcache_mca_reap, TP_printk("bucket %llu ret %d", __entry->bucket, __entry->ret) ); -TRACE_EVENT(bcache_mca_scan, - TP_PROTO(struct bch_fs *c, unsigned touched, unsigned freed, - unsigned can_free, unsigned long nr), - TP_ARGS(c, touched, freed, can_free, nr), - - TP_STRUCT__entry( - __array(char, uuid, 16 ) - __field(unsigned long, touched ) - __field(unsigned long, freed ) - __field(unsigned long, can_free ) - __field(unsigned long, nr ) - ), - - TP_fast_assign( - memcpy(__entry->uuid, c->sb.user_uuid.b, 16); - __entry->touched = touched; - __entry->freed = freed; - __entry->can_free = can_free; - __entry->nr = nr; - ), - - TP_printk("%pU touched %lu freed %lu can_free %lu nr %lu", - __entry->uuid, __entry->touched, __entry->freed, - __entry->can_free, __entry->nr) -); - -DECLARE_EVENT_CLASS(mca_cannibalize_lock, +DECLARE_EVENT_CLASS(btree_node_cannibalize_lock, TP_PROTO(struct bch_fs *c), TP_ARGS(c), @@ -541,27 +287,47 @@ DECLARE_EVENT_CLASS(mca_cannibalize_lock, TP_printk("%pU", __entry->uuid) ); -DEFINE_EVENT(mca_cannibalize_lock, bcache_mca_cannibalize_lock_fail, +DEFINE_EVENT(btree_node_cannibalize_lock, btree_node_cannibalize_lock_fail, TP_PROTO(struct bch_fs *c), TP_ARGS(c) ); -DEFINE_EVENT(mca_cannibalize_lock, bcache_mca_cannibalize_lock, +DEFINE_EVENT(btree_node_cannibalize_lock, btree_node_cannibalize_lock, TP_PROTO(struct bch_fs *c), TP_ARGS(c) ); -DEFINE_EVENT(mca_cannibalize_lock, bcache_mca_cannibalize, +DEFINE_EVENT(btree_node_cannibalize_lock, btree_node_cannibalize, TP_PROTO(struct bch_fs *c), TP_ARGS(c) ); -DEFINE_EVENT(cache_set, bcache_mca_cannibalize_unlock, +DEFINE_EVENT(bch_fs, btree_node_cannibalize_unlock, TP_PROTO(struct bch_fs *c), TP_ARGS(c) ); -TRACE_EVENT(bcache_btree_insert_key, +TRACE_EVENT(btree_reserve_get_fail, + TP_PROTO(struct bch_fs *c, size_t required, struct closure *cl), + TP_ARGS(c, required, cl), + + TP_STRUCT__entry( + __array(char, uuid, 16 ) + __field(size_t, required ) + __field(struct closure *, cl ) + ), + + TP_fast_assign( + memcpy(__entry->uuid, c->sb.user_uuid.b, 16); + __entry->required = required; + __entry->cl = cl; + ), + + TP_printk("%pU required %zu by %p", __entry->uuid, + __entry->required, __entry->cl) +); + +TRACE_EVENT(btree_insert_key, TP_PROTO(struct bch_fs *c, struct btree *b, struct bkey_i *k), TP_ARGS(c, b, k), @@ -620,24 +386,24 @@ DECLARE_EVENT_CLASS(btree_split, __entry->inode, __entry->offset, __entry->keys) ); -DEFINE_EVENT(btree_split, bcache_btree_node_split, +DEFINE_EVENT(btree_split, btree_node_split, TP_PROTO(struct bch_fs *c, struct btree *b, unsigned keys), TP_ARGS(c, b, keys) ); -DEFINE_EVENT(btree_split, bcache_btree_node_compact, +DEFINE_EVENT(btree_split, btree_node_compact, TP_PROTO(struct bch_fs *c, struct btree *b, unsigned keys), TP_ARGS(c, b, keys) ); -DEFINE_EVENT(btree_node, bcache_btree_set_root, +DEFINE_EVENT(btree_node, btree_set_root, TP_PROTO(struct bch_fs *c, struct btree *b), TP_ARGS(c, b) ); /* Garbage collection */ -TRACE_EVENT(bcache_btree_gc_coalesce, 
+TRACE_EVENT(btree_gc_coalesce, TP_PROTO(struct bch_fs *c, struct btree *b, unsigned nodes), TP_ARGS(c, b, nodes), @@ -664,7 +430,7 @@ TRACE_EVENT(bcache_btree_gc_coalesce, __entry->inode, __entry->offset, __entry->nodes) ); -TRACE_EVENT(bcache_btree_gc_coalesce_fail, +TRACE_EVENT(btree_gc_coalesce_fail, TP_PROTO(struct bch_fs *c, int reason), TP_ARGS(c, reason), @@ -681,119 +447,54 @@ TRACE_EVENT(bcache_btree_gc_coalesce_fail, TP_printk("%pU: %u", __entry->uuid, __entry->reason) ); -TRACE_EVENT(bcache_btree_node_alloc_replacement, - TP_PROTO(struct bch_fs *c, struct btree *old, struct btree *b), - TP_ARGS(c, old, b), - - TP_STRUCT__entry( - __array(char, uuid, 16 ) - __field(u64, bucket ) - __field(u64, old_bucket ) - __field(u8, level ) - __field(u8, id ) - __field(u32, inode ) - __field(u64, offset ) - ), - - TP_fast_assign( - memcpy(__entry->uuid, c->sb.user_uuid.b, 16); - __entry->old_bucket = PTR_BUCKET_NR_TRACE(c, - &old->key, 0); - __entry->bucket = PTR_BUCKET_NR_TRACE(c, &b->key, 0); - __entry->level = b->level; - __entry->id = b->btree_id; - __entry->inode = b->key.k.p.inode; - __entry->offset = b->key.k.p.offset; - ), - - TP_printk("%pU for %llu bucket %llu(%u) id %u: %u:%llu", - __entry->uuid, __entry->old_bucket, __entry->bucket, - __entry->level, __entry->id, - __entry->inode, __entry->offset) -); - -DEFINE_EVENT(btree_node, bcache_btree_gc_rewrite_node, +DEFINE_EVENT(btree_node, btree_gc_rewrite_node, TP_PROTO(struct bch_fs *c, struct btree *b), TP_ARGS(c, b) ); -DEFINE_EVENT(btree_node, bcache_btree_gc_rewrite_node_fail, +DEFINE_EVENT(btree_node, btree_gc_rewrite_node_fail, TP_PROTO(struct bch_fs *c, struct btree *b), TP_ARGS(c, b) ); -DEFINE_EVENT(cache_set, bcache_gc_start, +DEFINE_EVENT(bch_fs, gc_start, TP_PROTO(struct bch_fs *c), TP_ARGS(c) ); -DEFINE_EVENT(cache_set, bcache_gc_end, +DEFINE_EVENT(bch_fs, gc_end, TP_PROTO(struct bch_fs *c), TP_ARGS(c) ); -DEFINE_EVENT(cache_set, bcache_gc_coalesce_start, +DEFINE_EVENT(bch_fs, gc_coalesce_start, TP_PROTO(struct bch_fs *c), TP_ARGS(c) ); -DEFINE_EVENT(cache_set, bcache_gc_coalesce_end, +DEFINE_EVENT(bch_fs, gc_coalesce_end, TP_PROTO(struct bch_fs *c), TP_ARGS(c) ); -DEFINE_EVENT(cache, bcache_sectors_saturated, +DEFINE_EVENT(bch_dev, sectors_saturated, TP_PROTO(struct bch_dev *ca), TP_ARGS(ca) ); -DEFINE_EVENT(cache_set, bcache_gc_sectors_saturated, - TP_PROTO(struct bch_fs *c), - TP_ARGS(c) -); - -DEFINE_EVENT(cache_set, bcache_gc_cannot_inc_gens, +DEFINE_EVENT(bch_fs, gc_sectors_saturated, TP_PROTO(struct bch_fs *c), TP_ARGS(c) ); -DEFINE_EVENT(cache_set, bcache_gc_periodic, +DEFINE_EVENT(bch_fs, gc_cannot_inc_gens, TP_PROTO(struct bch_fs *c), TP_ARGS(c) ); -TRACE_EVENT(bcache_mark_bucket, - TP_PROTO(struct bch_dev *ca, const struct bkey *k, - const struct bch_extent_ptr *ptr, - int sectors, bool dirty), - TP_ARGS(ca, k, ptr, sectors, dirty), - - TP_STRUCT__entry( - __array(char, uuid, 16 ) - __field(u32, inode ) - __field(u64, offset ) - __field(u32, sectors ) - __field(u64, bucket ) - __field(bool, dirty ) - ), - - TP_fast_assign( - memcpy(__entry->uuid, ca->uuid.b, 16); - __entry->inode = k->p.inode; - __entry->offset = k->p.offset; - __entry->sectors = sectors; - __entry->bucket = PTR_BUCKET_NR(ca, ptr); - __entry->dirty = dirty; - ), - - TP_printk("%pU %u:%llu sectors %i bucket %llu dirty %i", - __entry->uuid, __entry->inode, __entry->offset, - __entry->sectors, __entry->bucket, __entry->dirty) -); - /* Allocator */ -TRACE_EVENT(bcache_alloc_batch, +TRACE_EVENT(alloc_batch, TP_PROTO(struct bch_dev *ca, size_t free, 
size_t total), TP_ARGS(ca, free, total), @@ -813,37 +514,17 @@ TRACE_EVENT(bcache_alloc_batch, __entry->uuid, __entry->free, __entry->total) ); -TRACE_EVENT(bcache_btree_reserve_get_fail, - TP_PROTO(struct bch_fs *c, size_t required, struct closure *cl), - TP_ARGS(c, required, cl), - - TP_STRUCT__entry( - __array(char, uuid, 16 ) - __field(size_t, required ) - __field(struct closure *, cl ) - ), - - TP_fast_assign( - memcpy(__entry->uuid, c->sb.user_uuid.b, 16); - __entry->required = required; - __entry->cl = cl; - ), - - TP_printk("%pU required %zu by %p", __entry->uuid, - __entry->required, __entry->cl) -); - -DEFINE_EVENT(cache, bcache_prio_write_start, +DEFINE_EVENT(bch_dev, prio_write_start, TP_PROTO(struct bch_dev *ca), TP_ARGS(ca) ); -DEFINE_EVENT(cache, bcache_prio_write_end, +DEFINE_EVENT(bch_dev, prio_write_end, TP_PROTO(struct bch_dev *ca), TP_ARGS(ca) ); -TRACE_EVENT(bcache_invalidate, +TRACE_EVENT(invalidate, TP_PROTO(struct bch_dev *ca, size_t bucket, unsigned sectors), TP_ARGS(ca, bucket, sectors), @@ -864,12 +545,12 @@ TRACE_EVENT(bcache_invalidate, MINOR(__entry->dev), __entry->offset) ); -DEFINE_EVENT(cache_set, bcache_rescale_prios, +DEFINE_EVENT(bch_fs, rescale_prios, TP_PROTO(struct bch_fs *c), TP_ARGS(c) ); -DECLARE_EVENT_CLASS(cache_bucket_alloc, +DECLARE_EVENT_CLASS(bucket_alloc, TP_PROTO(struct bch_dev *ca, enum alloc_reserve reserve), TP_ARGS(ca, reserve), @@ -886,17 +567,17 @@ DECLARE_EVENT_CLASS(cache_bucket_alloc, TP_printk("%pU reserve %d", __entry->uuid, __entry->reserve) ); -DEFINE_EVENT(cache_bucket_alloc, bcache_bucket_alloc, +DEFINE_EVENT(bucket_alloc, bucket_alloc, TP_PROTO(struct bch_dev *ca, enum alloc_reserve reserve), TP_ARGS(ca, reserve) ); -DEFINE_EVENT(cache_bucket_alloc, bcache_bucket_alloc_fail, +DEFINE_EVENT(bucket_alloc, bucket_alloc_fail, TP_PROTO(struct bch_dev *ca, enum alloc_reserve reserve), TP_ARGS(ca, reserve) ); -TRACE_EVENT(bcache_freelist_empty_fail, +TRACE_EVENT(freelist_empty_fail, TP_PROTO(struct bch_fs *c, enum alloc_reserve reserve, struct closure *cl), TP_ARGS(c, reserve, cl), @@ -935,47 +616,16 @@ DECLARE_EVENT_CLASS(open_bucket_alloc, __entry->uuid, __entry->cl) ); -DEFINE_EVENT(open_bucket_alloc, bcache_open_bucket_alloc, +DEFINE_EVENT(open_bucket_alloc, open_bucket_alloc, TP_PROTO(struct bch_fs *c, struct closure *cl), TP_ARGS(c, cl) ); -DEFINE_EVENT(open_bucket_alloc, bcache_open_bucket_alloc_fail, +DEFINE_EVENT(open_bucket_alloc, open_bucket_alloc_fail, TP_PROTO(struct bch_fs *c, struct closure *cl), TP_ARGS(c, cl) ); -/* Keylists */ - -TRACE_EVENT(bcache_keyscan, - TP_PROTO(unsigned nr_found, - unsigned start_inode, u64 start_offset, - unsigned end_inode, u64 end_offset), - TP_ARGS(nr_found, - start_inode, start_offset, - end_inode, end_offset), - - TP_STRUCT__entry( - __field(__u32, nr_found ) - __field(__u32, start_inode ) - __field(__u64, start_offset ) - __field(__u32, end_inode ) - __field(__u64, end_offset ) - ), - - TP_fast_assign( - __entry->nr_found = nr_found; - __entry->start_inode = start_inode; - __entry->start_offset = start_offset; - __entry->end_inode = end_inode; - __entry->end_offset = end_offset; - ), - - TP_printk("found %u keys from %u:%llu to %u:%llu", __entry->nr_found, - __entry->start_inode, __entry->start_offset, - __entry->end_inode, __entry->end_offset) -); - /* Moving IO */ DECLARE_EVENT_CLASS(moving_io, @@ -998,44 +648,39 @@ DECLARE_EVENT_CLASS(moving_io, __entry->inode, __entry->offset, __entry->sectors) ); -DEFINE_EVENT(moving_io, bcache_move_read, +DEFINE_EVENT(moving_io, move_read, 
	TP_PROTO(struct bkey *k),
	TP_ARGS(k)
);

-DEFINE_EVENT(moving_io, bcache_move_read_done,
+DEFINE_EVENT(moving_io, move_read_done,
	TP_PROTO(struct bkey *k),
	TP_ARGS(k)
);

-DEFINE_EVENT(moving_io, bcache_move_write,
+DEFINE_EVENT(moving_io, move_write,
	TP_PROTO(struct bkey *k),
	TP_ARGS(k)
);

-DEFINE_EVENT(moving_io, bcache_move_write_done,
-	TP_PROTO(struct bkey *k),
-	TP_ARGS(k)
-);
-
-DEFINE_EVENT(moving_io, bcache_copy_collision,
+DEFINE_EVENT(moving_io, copy_collision,
	TP_PROTO(struct bkey *k),
	TP_ARGS(k)
);

/* Copy GC */

-DEFINE_EVENT(page_alloc_fail, bcache_moving_gc_alloc_fail,
+DEFINE_EVENT(page_alloc_fail, moving_gc_alloc_fail,
	TP_PROTO(struct bch_fs *c, u64 size),
	TP_ARGS(c, size)
);

-DEFINE_EVENT(cache, bcache_moving_gc_start,
+DEFINE_EVENT(bch_dev, moving_gc_start,
	TP_PROTO(struct bch_dev *ca),
	TP_ARGS(ca)
);

-TRACE_EVENT(bcache_moving_gc_end,
+TRACE_EVENT(moving_gc_end,
	TP_PROTO(struct bch_dev *ca, u64 sectors_moved, u64 keys_moved,
		 u64 buckets_moved),
	TP_ARGS(ca, sectors_moved, keys_moved, buckets_moved),
@@ -1059,44 +704,24 @@ TRACE_EVENT(bcache_moving_gc_end,
		  __entry->buckets_moved)
);

-DEFINE_EVENT(cache, bcache_moving_gc_reserve_empty,
-	TP_PROTO(struct bch_dev *ca),
-	TP_ARGS(ca)
-);
-
-DEFINE_EVENT(cache, bcache_moving_gc_no_work,
-	TP_PROTO(struct bch_dev *ca),
-	TP_ARGS(ca)
-);
-
-DEFINE_EVENT(bkey, bcache_gc_copy,
+DEFINE_EVENT(bkey, gc_copy,
	TP_PROTO(const struct bkey *k),
	TP_ARGS(k)
);

/* Tiering */

-DEFINE_EVENT(cache_set, bcache_tiering_refill_start,
-	TP_PROTO(struct bch_fs *c),
-	TP_ARGS(c)
-);
-
-DEFINE_EVENT(cache_set, bcache_tiering_refill_end,
-	TP_PROTO(struct bch_fs *c),
-	TP_ARGS(c)
-);
-
-DEFINE_EVENT(page_alloc_fail, bcache_tiering_alloc_fail,
+DEFINE_EVENT(page_alloc_fail, tiering_alloc_fail,
	TP_PROTO(struct bch_fs *c, u64 size),
	TP_ARGS(c, size)
);

-DEFINE_EVENT(cache_set, bcache_tiering_start,
+DEFINE_EVENT(bch_fs, tiering_start,
	TP_PROTO(struct bch_fs *c),
	TP_ARGS(c)
);

-TRACE_EVENT(bcache_tiering_end,
+TRACE_EVENT(tiering_end,
	TP_PROTO(struct bch_fs *c, u64 sectors_moved, u64 keys_moved),
	TP_ARGS(c, sectors_moved, keys_moved),
@@ -1117,54 +742,11 @@ TRACE_EVENT(bcache_tiering_end,
		  __entry->uuid, __entry->sectors_moved, __entry->keys_moved)
);

-DEFINE_EVENT(bkey, bcache_tiering_copy,
+DEFINE_EVENT(bkey, tiering_copy,
	TP_PROTO(const struct bkey *k),
	TP_ARGS(k)
);

-/* Background writeback */
-
-DEFINE_EVENT(bkey, bcache_writeback,
-	TP_PROTO(const struct bkey *k),
-	TP_ARGS(k)
-);
-
-DEFINE_EVENT(bkey, bcache_writeback_collision,
-	TP_PROTO(const struct bkey *k),
-	TP_ARGS(k)
-);
-
-TRACE_EVENT(bcache_writeback_error,
-	TP_PROTO(struct bkey *k, bool write, int error),
-	TP_ARGS(k, write, error),
-
-	TP_STRUCT__entry(
-		__field(u32, size	)
-		__field(u32, inode	)
-		__field(u64, offset	)
-		__field(bool, write	)
-		__field(int, error	)
-	),
-
-	TP_fast_assign(
-		__entry->inode = k->p.inode;
-		__entry->offset = k->p.offset;
-		__entry->size = k->size;
-		__entry->write = write;
-		__entry->error = error;
-	),
-
-	TP_printk("%u:%llu len %u %s error %d", __entry->inode,
-		  __entry->offset, __entry->size,
-		  __entry->write ? "write" : "read",
-		  __entry->error)
-);
-
-DEFINE_EVENT(page_alloc_fail, bcache_writeback_alloc_fail,
-	TP_PROTO(struct bch_fs *c, u64 size),
-	TP_ARGS(c, size)
-);
-
#endif /* _TRACE_BCACHE_H */

/* This part must be outside protection */
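
The DECLARE_EVENT_CLASS/DEFINE_EVENT pairs rewritten throughout this diff follow the standard Linux tracepoint pattern: the class declares the shared record layout (TP_STRUCT__entry), the assignment (TP_fast_assign), and the format string (TP_printk) once, and each DEFINE_EVENT stamps out a named event on that template, which call sites fire as trace_<name>(). Below is a minimal, hypothetical userspace sketch of that relationship; the stand-in macros are illustrative only, since the real ones in <linux/tracepoint.h> expand to tracepoint registration and ring-buffer writes rather than printf.

/*
 * Hypothetical userspace sketch of the DECLARE_EVENT_CLASS /
 * DEFINE_EVENT relationship. Not the kernel macros: those generate
 * tracepoint plumbing, not printf calls.
 */
#include <stdio.h>

struct bch_fs { unsigned char user_uuid[16]; };

/* The event class provides one shared body... */
#define DECLARE_EVENT_CLASS(class, proto, body) \
	static void class##_template proto body

/* ...and each event is a named wrapper, invoked as trace_<name>(). */
#define DEFINE_EVENT(class, name, proto, args) \
	static void trace_##name proto { class##_template args; }

DECLARE_EVENT_CLASS(bch_fs, (struct bch_fs *c),
{
	printf("fs %02x%02x...: event fired\n",
	       c->user_uuid[0], c->user_uuid[1]);
})

/* Two of the renamed events from this header, stamped from one class: */
DEFINE_EVENT(bch_fs, journal_full, (struct bch_fs *c), (c))
DEFINE_EVENT(bch_fs, gc_start, (struct bch_fs *c), (c))

int main(void)
{
	struct bch_fs fs = { .user_uuid = { 0xde, 0xad } };

	trace_journal_full(&fs);	/* both events share the class body */
	trace_gc_start(&fs);
	return 0;
}

Because only class and event identifiers change, the hunks above are mechanical: cache_set becomes bch_fs, cache becomes bch_dev, bcache_bio becomes bio, and the bcache_ prefix drops from event names, so a call site such as trace_bcache_gc_start(c) becomes trace_gc_start(c) while the traced payload stays the same.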