author    | Kent Overstreet <kent.overstreet@gmail.com> | 2017-03-19 15:56:34 -0800
committer | Kent Overstreet <kent.overstreet@gmail.com> | 2017-03-19 17:31:47 -0800
commit    | 5ec39af8eaba49aee7bafa44c661da39e2f40dc3 (patch)
tree      | 1fb1a981602cbf22c7d2b2dba1168c715d7cecb5 /include
parent    | bb1941de5378a7b8122d3575dcbc7d0aeb6326f0 (diff)
Rename from bcache-tools to bcachefs-tools
Diffstat (limited to 'include')

-rw-r--r-- | include/linux/bcache-ioctl.h | 104
-rw-r--r-- | include/linux/bcache.h | 1449
-rw-r--r-- | include/linux/blkdev.h | 6
-rw-r--r-- | include/linux/closure.h | 385
-rw-r--r-- | include/trace/events/bcachefs.h (renamed from include/trace/events/bcache.h) | 672

5 files changed, 518 insertions(+), 2098 deletions(-)
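Most of what follows deletes the old shared headers. The largest, include/linux/bcache.h, opens with the LE32_BITMASK()/LE64_BITMASK() macros, which stamp out an endian-safe getter/setter pair for a bit range [offset, end) within a little-endian on-disk field. A standalone sketch of the pattern these macros generate — plain uint64_t stands in for __le64, so the byte-swapping helpers are omitted, and the struct/field names are illustrative:

```c
#include <stdint.h>
#include <stdio.h>

struct bch_member { uint64_t flags; };  /* stand-in for the real struct */

#define BITMASK64(name, type, field, offset, end)                       \
static inline uint64_t name(const type *k)                              \
{                                                                       \
        return (k->field >> offset) & ~(~0ULL << (end - offset));       \
}                                                                       \
                                                                        \
static inline void SET_##name(type *k, uint64_t v)                      \
{                                                                       \
        k->field &= ~(~(~0ULL << (end - offset)) << offset);            \
        k->field |= (v & ~(~0ULL << (end - offset))) << offset;         \
}

/* Mirrors BCH_MEMBER_STATE: bits [0, 4) of the member flags word */
BITMASK64(MEMBER_STATE, struct bch_member, flags, 0, 4)

int main(void)
{
        struct bch_member m = { 0 };

        SET_MEMBER_STATE(&m, 2);        /* e.g. BCH_MEMBER_STATE_FAILED */
        printf("state = %llu\n", (unsigned long long) MEMBER_STATE(&m));
        return 0;
}
```

Packing many small on-disk fields into one word this way keeps the format dense, while the generated accessors keep call sites readable.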
diff --git a/include/linux/bcache-ioctl.h b/include/linux/bcache-ioctl.h deleted file mode 100644 index ca769369..00000000 --- a/include/linux/bcache-ioctl.h +++ /dev/null @@ -1,104 +0,0 @@ -#ifndef _LINUX_BCACHE_IOCTL_H -#define _LINUX_BCACHE_IOCTL_H - -#include <linux/bcache.h> -#include <linux/uuid.h> - -#ifdef __cplusplus -extern "C" { -#endif - -#define BCH_FORCE_IF_DATA_LOST (1 << 0) -#define BCH_FORCE_IF_METADATA_LOST (1 << 1) -#define BCH_FORCE_IF_DATA_DEGRADED (1 << 2) -#define BCH_FORCE_IF_METADATA_DEGRADED (1 << 3) - -#define BCH_FORCE_IF_DEGRADED \ - (BCH_FORCE_IF_DATA_DEGRADED| \ - BCH_FORCE_IF_METADATA_DEGRADED) - -#define BCH_BY_UUID (1 << 4) - -/* global control dev: */ - -#define BCH_IOCTL_ASSEMBLE _IOW(0xbc, 1, struct bch_ioctl_assemble) -#define BCH_IOCTL_INCREMENTAL _IOW(0xbc, 2, struct bch_ioctl_incremental) - -struct bch_ioctl_assemble { - __u32 flags; - __u32 nr_devs; - __u64 pad; - __u64 devs[]; -}; - -struct bch_ioctl_incremental { - __u32 flags; - __u64 pad; - __u64 dev; -}; - -/* filesystem ioctls: */ - -#define BCH_IOCTL_QUERY_UUID _IOR(0xbc, 1, struct bch_ioctl_query_uuid) -#define BCH_IOCTL_START _IOW(0xbc, 2, struct bch_ioctl_start) -#define BCH_IOCTL_STOP _IO(0xbc, 3) -#define BCH_IOCTL_DISK_ADD _IOW(0xbc, 4, struct bch_ioctl_disk) -#define BCH_IOCTL_DISK_REMOVE _IOW(0xbc, 5, struct bch_ioctl_disk) -#define BCH_IOCTL_DISK_ONLINE _IOW(0xbc, 6, struct bch_ioctl_disk) -#define BCH_IOCTL_DISK_OFFLINE _IOW(0xbc, 7, struct bch_ioctl_disk) -#define BCH_IOCTL_DISK_SET_STATE _IOW(0xbc, 8, struct bch_ioctl_disk_set_state) -#define BCH_IOCTL_DISK_EVACUATE _IOW(0xbc, 9, struct bch_ioctl_disk) -#define BCH_IOCTL_DATA _IOW(0xbc, 10, struct bch_ioctl_data) - -struct bch_ioctl_query_uuid { - uuid_le uuid; -}; - -struct bch_ioctl_start { - __u32 flags; - __u32 pad; -}; - -struct bch_ioctl_disk { - __u32 flags; - __u32 pad; - __u64 dev; -}; - -struct bch_ioctl_disk_set_state { - __u32 flags; - __u8 new_state; - __u8 pad[3]; - __u64 dev; -}; - -#define BCH_REWRITE_INCREASE_REPLICAS (1 << 0) -#define BCH_REWRITE_DECREASE_REPLICAS (1 << 1) - -#define BCH_REWRITE_RECOMPRESS (1 << 0) -#define BCH_REWRITE_DECREASE_REPLICAS (1 << 1) - -enum bch_data_ops { - BCH_DATA_SCRUB, -}; - -struct bch_data_op { - __u8 type; -}; - -struct bch_ioctl_data { - __u32 flags; - __u32 pad; - - __u64 start_inode; - __u64 start_offset; - - __u64 end_inode; - __u64 end_offset; -}; - -#ifdef __cplusplus -} -#endif - -#endif /* _LINUX_BCACHE_IOCTL_H */ diff --git a/include/linux/bcache.h b/include/linux/bcache.h deleted file mode 100644 index c221747b..00000000 --- a/include/linux/bcache.h +++ /dev/null @@ -1,1449 +0,0 @@ -#ifndef _LINUX_BCACHE_H -#define _LINUX_BCACHE_H - -/* - * Bcache on disk data structures - */ - -#ifdef __cplusplus -typedef bool _Bool; -extern "C" { -#endif - -#include <asm/types.h> -#include <asm/byteorder.h> -#include <linux/uuid.h> - -#define LE32_BITMASK(name, type, field, offset, end) \ -static const unsigned name##_OFFSET = offset; \ -static const unsigned name##_BITS = (end - offset); \ -static const __u64 name##_MAX = (1ULL << (end - offset)) - 1; \ - \ -static inline __u64 name(const type *k) \ -{ \ - return (__le32_to_cpu(k->field) >> offset) & \ - ~(~0ULL << (end - offset)); \ -} \ - \ -static inline void SET_##name(type *k, __u64 v) \ -{ \ - __u64 new = __le32_to_cpu(k->field); \ - \ - new &= ~(~(~0ULL << (end - offset)) << offset); \ - new |= (v & ~(~0ULL << (end - offset))) << offset; \ - k->field = __cpu_to_le32(new); \ -} - -#define LE64_BITMASK(name, type, field, 
offset, end) \ -static const unsigned name##_OFFSET = offset; \ -static const unsigned name##_BITS = (end - offset); \ -static const __u64 name##_MAX = (1ULL << (end - offset)) - 1; \ - \ -static inline __u64 name(const type *k) \ -{ \ - return (__le64_to_cpu(k->field) >> offset) & \ - ~(~0ULL << (end - offset)); \ -} \ - \ -static inline void SET_##name(type *k, __u64 v) \ -{ \ - __u64 new = __le64_to_cpu(k->field); \ - \ - new &= ~(~(~0ULL << (end - offset)) << offset); \ - new |= (v & ~(~0ULL << (end - offset))) << offset; \ - k->field = __cpu_to_le64(new); \ -} - -struct bkey_format { - __u8 key_u64s; - __u8 nr_fields; - /* One unused slot for now: */ - __u8 bits_per_field[6]; - __le64 field_offset[6]; -}; - -/* Btree keys - all units are in sectors */ - -struct bpos { - /* Word order matches machine byte order */ -#if defined(__LITTLE_ENDIAN) - __u32 snapshot; - __u64 offset; - __u64 inode; -#elif defined(__BIG_ENDIAN) - __u64 inode; - __u64 offset; /* Points to end of extent - sectors */ - __u32 snapshot; -#else -#error edit for your odd byteorder. -#endif -} __attribute__((packed, aligned(4))); - -#define KEY_INODE_MAX ((__u64)~0ULL) -#define KEY_OFFSET_MAX ((__u64)~0ULL) -#define KEY_SNAPSHOT_MAX ((__u32)~0U) - -static inline struct bpos POS(__u64 inode, __u64 offset) -{ - struct bpos ret; - - ret.inode = inode; - ret.offset = offset; - ret.snapshot = 0; - - return ret; -} - -#define POS_MIN POS(0, 0) -#define POS_MAX POS(KEY_INODE_MAX, KEY_OFFSET_MAX) - -/* Empty placeholder struct, for container_of() */ -struct bch_val { - __u64 __nothing[0]; -}; - -struct bversion { -#if defined(__LITTLE_ENDIAN) - __u64 lo; - __u32 hi; -#elif defined(__BIG_ENDIAN) - __u32 hi; - __u64 lo; -#endif -} __attribute__((packed, aligned(4))); - -struct bkey { - /* Size of combined key and value, in u64s */ - __u8 u64s; - - /* Format of key (0 for format local to btree node) */ -#if defined(__LITTLE_ENDIAN_BITFIELD) - __u8 format:7, - needs_whiteout:1; -#elif defined (__BIG_ENDIAN_BITFIELD) - __u8 needs_whiteout:1, - format:7; -#else -#error edit for your odd byteorder. -#endif - - /* Type of the value */ - __u8 type; - -#if defined(__LITTLE_ENDIAN) - __u8 pad[1]; - - struct bversion version; - __u32 size; /* extent size, in sectors */ - struct bpos p; -#elif defined(__BIG_ENDIAN) - struct bpos p; - __u32 size; /* extent size, in sectors */ - struct bversion version; - - __u8 pad[1]; -#endif -} __attribute__((packed, aligned(8))); - -struct bkey_packed { - __u64 _data[0]; - - /* Size of combined key and value, in u64s */ - __u8 u64s; - - /* Format of key (0 for format local to btree node) */ - - /* - * XXX: next incompat on disk format change, switch format and - * needs_whiteout - bkey_packed() will be cheaper if format is the high - * bits of the bitfield - */ -#if defined(__LITTLE_ENDIAN_BITFIELD) - __u8 format:7, - needs_whiteout:1; -#elif defined (__BIG_ENDIAN_BITFIELD) - __u8 needs_whiteout:1, - format:7; -#endif - - /* Type of the value */ - __u8 type; - __u8 key_start[0]; - - /* - * We copy bkeys with struct assignment in various places, and while - * that shouldn't be done with packed bkeys we can't disallow it in C, - * and it's legal to cast a bkey to a bkey_packed - so padding it out - * to the same size as struct bkey should hopefully be safest. 
- */ - __u8 pad[sizeof(struct bkey) - 3]; -} __attribute__((packed, aligned(8))); - -#define BKEY_U64s (sizeof(struct bkey) / sizeof(__u64)) -#define KEY_PACKED_BITS_START 24 - -#define KEY_SIZE_MAX ((__u32)~0U) - -#define KEY_FORMAT_LOCAL_BTREE 0 -#define KEY_FORMAT_CURRENT 1 - -enum bch_bkey_fields { - BKEY_FIELD_INODE, - BKEY_FIELD_OFFSET, - BKEY_FIELD_SNAPSHOT, - BKEY_FIELD_SIZE, - BKEY_FIELD_VERSION_HI, - BKEY_FIELD_VERSION_LO, - BKEY_NR_FIELDS, -}; - -#define bkey_format_field(name, field) \ - [BKEY_FIELD_##name] = (sizeof(((struct bkey *) NULL)->field) * 8) - -#define BKEY_FORMAT_CURRENT \ -((struct bkey_format) { \ - .key_u64s = BKEY_U64s, \ - .nr_fields = BKEY_NR_FIELDS, \ - .bits_per_field = { \ - bkey_format_field(INODE, p.inode), \ - bkey_format_field(OFFSET, p.offset), \ - bkey_format_field(SNAPSHOT, p.snapshot), \ - bkey_format_field(SIZE, size), \ - bkey_format_field(VERSION_HI, version.hi), \ - bkey_format_field(VERSION_LO, version.lo), \ - }, \ -}) - -/* bkey with inline value */ -struct bkey_i { - __u64 _data[0]; - - union { - struct { - /* Size of combined key and value, in u64s */ - __u8 u64s; - }; - struct { - struct bkey k; - struct bch_val v; - }; - }; -}; - -#ifndef __cplusplus - -#define KEY(_inode, _offset, _size) \ -((struct bkey) { \ - .u64s = BKEY_U64s, \ - .format = KEY_FORMAT_CURRENT, \ - .p = POS(_inode, _offset), \ - .size = _size, \ -}) - -#else - -static inline struct bkey KEY(__u64 inode, __u64 offset, __u64 size) -{ - struct bkey ret; - - memset(&ret, 0, sizeof(ret)); - ret.u64s = BKEY_U64s; - ret.format = KEY_FORMAT_CURRENT; - ret.p.inode = inode; - ret.p.offset = offset; - ret.size = size; - - return ret; -} - -#endif - -static inline void bkey_init(struct bkey *k) -{ - *k = KEY(0, 0, 0); -} - -#define bkey_bytes(_k) ((_k)->u64s * sizeof(__u64)) - -#define __BKEY_PADDED(key, pad) \ - struct { struct bkey_i key; __u64 key ## _pad[pad]; } - -#define BKEY_VAL_TYPE(name, nr) \ -struct bkey_i_##name { \ - union { \ - struct bkey k; \ - struct bkey_i k_i; \ - }; \ - struct bch_##name v; \ -} - -/* - * - DELETED keys are used internally to mark keys that should be ignored but - * override keys in composition order. Their version number is ignored. - * - * - DISCARDED keys indicate that the data is all 0s because it has been - * discarded. DISCARDs may have a version; if the version is nonzero the key - * will be persistent, otherwise the key will be dropped whenever the btree - * node is rewritten (like DELETED keys). - * - * - ERROR: any read of the data returns a read error, as the data was lost due - * to a failing device. Like DISCARDED keys, they can be removed (overridden) - * by new writes or cluster-wide GC. Node repair can also overwrite them with - * the same or a more recent version number, but not with an older version - * number. -*/ -#define KEY_TYPE_DELETED 0 -#define KEY_TYPE_DISCARD 1 -#define KEY_TYPE_ERROR 2 -#define KEY_TYPE_COOKIE 3 -#define KEY_TYPE_PERSISTENT_DISCARD 4 -#define KEY_TYPE_GENERIC_NR 128 - -struct bch_cookie { - struct bch_val v; - __le64 cookie; -}; -BKEY_VAL_TYPE(cookie, KEY_TYPE_COOKIE); - -/* Extents */ - -/* - * In extent bkeys, the value is a list of pointers (bch_extent_ptr), optionally - * preceded by checksum/compression information (bch_extent_crc32 or - * bch_extent_crc64). 
- * - * One major determining factor in the format of extents is how we handle and - * represent extents that have been partially overwritten and thus trimmed: - * - * If an extent is not checksummed or compressed, when the extent is trimmed we - * don't have to remember the extent we originally allocated and wrote: we can - * merely adjust ptr->offset to point to the start of the start of the data that - * is currently live. The size field in struct bkey records the current (live) - * size of the extent, and is also used to mean "size of region on disk that we - * point to" in this case. - * - * Thus an extent that is not checksummed or compressed will consist only of a - * list of bch_extent_ptrs, with none of the fields in - * bch_extent_crc32/bch_extent_crc64. - * - * When an extent is checksummed or compressed, it's not possible to read only - * the data that is currently live: we have to read the entire extent that was - * originally written, and then return only the part of the extent that is - * currently live. - * - * Thus, in addition to the current size of the extent in struct bkey, we need - * to store the size of the originally allocated space - this is the - * compressed_size and uncompressed_size fields in bch_extent_crc32/64. Also, - * when the extent is trimmed, instead of modifying the offset field of the - * pointer, we keep a second smaller offset field - "offset into the original - * extent of the currently live region". - * - * The other major determining factor is replication and data migration: - * - * Each pointer may have its own bch_extent_crc32/64. When doing a replicated - * write, we will initially write all the replicas in the same format, with the - * same checksum type and compression format - however, when copygc runs later (or - * tiering/cache promotion, anything that moves data), it is not in general - * going to rewrite all the pointers at once - one of the replicas may be in a - * bucket on one device that has very little fragmentation while another lives - * in a bucket that has become heavily fragmented, and thus is being rewritten - * sooner than the rest. - * - * Thus it will only move a subset of the pointers (or in the case of - * tiering/cache promotion perhaps add a single pointer without dropping any - * current pointers), and if the extent has been partially overwritten it must - * write only the currently live portion (or copygc would not be able to reduce - * fragmentation!) - which necessitates a different bch_extent_crc format for - * the new pointer. - * - * But in the interests of space efficiency, we don't want to store one - * bch_extent_crc for each pointer if we don't have to. - * - * Thus, a bch_extent consists of bch_extent_crc32s, bch_extent_crc64s, and - * bch_extent_ptrs appended arbitrarily one after the other. We determine the - * type of a given entry with a scheme similar to utf8 (except we're encoding a - * type, not a size), encoding the type in the position of the first set bit: - * - * bch_extent_crc32 - 0b1 - * bch_extent_ptr - 0b10 - * bch_extent_crc64 - 0b100 - * - * We do it this way because bch_extent_crc32 is _very_ constrained on bits (and - * bch_extent_crc64 is the least constrained). - * - * Then, each bch_extent_crc32/64 applies to the pointers that follow after it, - * until the next bch_extent_crc32/64. - * - * If there are no bch_extent_crcs preceding a bch_extent_ptr, then that pointer - * is neither checksummed nor compressed. 
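The tag scheme described in the comment above can be decoded with a single count-trailing-zeros. One wrinkle worth flagging: the example bit patterns in the comment (crc32 = 0b1, ptr = 0b10) do not match the ordering of enum bch_extent_entry_type defined a little further down (ptr = 0, crc32 = 1, ...); the sketch below follows the enum, under which an entry's first word carries 1 << type in its low bits. Illustrative only — the real iteration code lives in the filesystem, not in this header:

```c
#include <stdint.h>

enum bch_extent_entry_type {
        BCH_EXTENT_ENTRY_ptr    = 0,
        BCH_EXTENT_ENTRY_crc32  = 1,
        BCH_EXTENT_ENTRY_crc64  = 2,
        BCH_EXTENT_ENTRY_crc128 = 3,
};

/* Entry type is the index of the lowest set bit of the tag word. */
static inline enum bch_extent_entry_type
extent_entry_type(uint64_t tag_word)
{
        /* GCC/Clang builtin; tag_word must be nonzero */
        return (enum bch_extent_entry_type) __builtin_ctzll(tag_word);
}

/* Entry size follows from the type, letting a walker advance: */
static inline unsigned extent_entry_u64s(enum bch_extent_entry_type t)
{
        switch (t) {
        case BCH_EXTENT_ENTRY_ptr:    return 1;  /* bch_extent_ptr */
        case BCH_EXTENT_ENTRY_crc32:  return 1;  /* bch_extent_crc32 */
        case BCH_EXTENT_ENTRY_crc64:  return 2;  /* bch_extent_crc64 */
        case BCH_EXTENT_ENTRY_crc128: return 3;  /* bch_extent_crc128 */
        }
        return 0;
}
```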
- */ - -/* 128 bits, sufficient for cryptographic MACs: */ -struct bch_csum { - __le64 lo; - __le64 hi; -} __attribute__((packed, aligned(8))); - -#define BCH_CSUM_NONE 0U -#define BCH_CSUM_CRC32C 1U -#define BCH_CSUM_CRC64 2U -#define BCH_CSUM_CHACHA20_POLY1305_80 3U -#define BCH_CSUM_CHACHA20_POLY1305_128 4U -#define BCH_CSUM_NR 5U - -static inline _Bool bch_csum_type_is_encryption(unsigned type) -{ - switch (type) { - case BCH_CSUM_CHACHA20_POLY1305_80: - case BCH_CSUM_CHACHA20_POLY1305_128: - return true; - default: - return false; - } -} - -enum bch_extent_entry_type { - BCH_EXTENT_ENTRY_ptr = 0, - BCH_EXTENT_ENTRY_crc32 = 1, - BCH_EXTENT_ENTRY_crc64 = 2, - BCH_EXTENT_ENTRY_crc128 = 3, -}; - -#define BCH_EXTENT_ENTRY_MAX 4 - -/* Compressed/uncompressed size are stored biased by 1: */ -struct bch_extent_crc32 { -#if defined(__LITTLE_ENDIAN_BITFIELD) - __u32 type:2, - _compressed_size:7, - _uncompressed_size:7, - offset:7, - _unused:1, - csum_type:4, - compression_type:4; - __u32 csum; -#elif defined (__BIG_ENDIAN_BITFIELD) - __u32 csum; - __u32 compression_type:4, - csum_type:4, - _unused:1, - offset:7, - _uncompressed_size:7, - _compressed_size:7, - type:2; -#endif -} __attribute__((packed, aligned(8))); - -#define CRC32_SIZE_MAX (1U << 7) -#define CRC32_NONCE_MAX 0 - -struct bch_extent_crc64 { -#if defined(__LITTLE_ENDIAN_BITFIELD) - __u64 type:3, - _compressed_size:9, - _uncompressed_size:9, - offset:9, - nonce:10, - csum_type:4, - compression_type:4, - csum_hi:16; -#elif defined (__BIG_ENDIAN_BITFIELD) - __u64 csum_hi:16, - compression_type:4, - csum_type:4, - nonce:10, - offset:9, - _uncompressed_size:9, - _compressed_size:9, - type:3; -#endif - __u64 csum_lo; -} __attribute__((packed, aligned(8))); - -#define CRC64_SIZE_MAX (1U << 9) -#define CRC64_NONCE_MAX ((1U << 10) - 1) - -struct bch_extent_crc128 { -#if defined(__LITTLE_ENDIAN_BITFIELD) - __u64 type:4, - _compressed_size:13, - _uncompressed_size:13, - offset:13, - nonce:13, - csum_type:4, - compression_type:4; -#elif defined (__BIG_ENDIAN_BITFIELD) - __u64 compression_type:4, - csum_type:4, - nonce:14, - offset:13, - _uncompressed_size:13, - _compressed_size:13, - type:3; -#endif - struct bch_csum csum; -} __attribute__((packed, aligned(8))); - -#define CRC128_SIZE_MAX (1U << 13) -#define CRC128_NONCE_MAX ((1U << 13) - 1) - -/* - * Max size of an extent that may require bouncing to read or write - * (checksummed, compressed): 64k - */ -#define BCH_ENCODED_EXTENT_MAX 128U - -/* - * @reservation - pointer hasn't been written to, just reserved - */ -struct bch_extent_ptr { -#if defined(__LITTLE_ENDIAN_BITFIELD) - __u64 type:1, - cached:1, - erasure_coded:1, - reservation:1, - offset:44, /* 8 petabytes */ - dev:8, - gen:8; -#elif defined (__BIG_ENDIAN_BITFIELD) - __u64 gen:8, - dev:8, - offset:44, - reservation:1, - erasure_coded:1, - cached:1, - type:1; -#endif -} __attribute__((packed, aligned(8))); - -struct bch_extent_reservation { -#if defined(__LITTLE_ENDIAN_BITFIELD) - __u64 type:5, - unused:23, - replicas:4, - generation:32; -#elif defined (__BIG_ENDIAN_BITFIELD) - __u64 generation:32, - replicas:4, - unused:23, - type:5; -#endif -}; - -union bch_extent_entry { -#if defined(__LITTLE_ENDIAN) || __BITS_PER_LONG == 64 - unsigned long type; -#elif __BITS_PER_LONG == 32 - struct { - unsigned long pad; - unsigned long type; - }; -#else -#error edit for your odd byteorder. 
-#endif - struct bch_extent_crc32 crc32; - struct bch_extent_crc64 crc64; - struct bch_extent_crc128 crc128; - struct bch_extent_ptr ptr; -}; - -enum { - BCH_EXTENT = 128, - - /* - * This is kind of a hack, we're overloading the type for a boolean that - * really should be part of the value - BCH_EXTENT and BCH_EXTENT_CACHED - * have the same value type: - */ - BCH_EXTENT_CACHED = 129, - - /* - * Persistent reservation: - */ - BCH_RESERVATION = 130, -}; - -struct bch_extent { - struct bch_val v; - - union bch_extent_entry start[0]; - __u64 _data[0]; -} __attribute__((packed, aligned(8))); -BKEY_VAL_TYPE(extent, BCH_EXTENT); - -struct bch_reservation { - struct bch_val v; - - __le32 generation; - __u8 nr_replicas; - __u8 pad[3]; -} __attribute__((packed, aligned(8))); -BKEY_VAL_TYPE(reservation, BCH_RESERVATION); - -/* Maximum size (in u64s) a single pointer could be: */ -#define BKEY_EXTENT_PTR_U64s_MAX\ - ((sizeof(struct bch_extent_crc128) + \ - sizeof(struct bch_extent_ptr)) / sizeof(u64)) - -/* Maximum possible size of an entire extent value: */ -/* There's a hack in the keylist code that needs to be fixed.. */ -#define BKEY_EXTENT_VAL_U64s_MAX \ - (BKEY_EXTENT_PTR_U64s_MAX * BCH_REPLICAS_MAX) - -/* * Maximum possible size of an entire extent, key + value: */ -#define BKEY_EXTENT_U64s_MAX (BKEY_U64s + BKEY_EXTENT_VAL_U64s_MAX) - -/* Btree pointers don't carry around checksums: */ -#define BKEY_BTREE_PTR_VAL_U64s_MAX \ - ((sizeof(struct bch_extent_ptr)) / sizeof(u64) * BCH_REPLICAS_MAX) -#define BKEY_BTREE_PTR_U64s_MAX \ - (BKEY_U64s + BKEY_BTREE_PTR_VAL_U64s_MAX) - -/* Inodes */ - -#define BLOCKDEV_INODE_MAX 4096 - -#define BCACHE_ROOT_INO 4096 - -enum bch_inode_types { - BCH_INODE_FS = 128, - BCH_INODE_BLOCKDEV = 129, -}; - -struct bch_inode { - struct bch_val v; - - __le64 i_hash_seed; - __le32 i_flags; - __le16 i_mode; - __u8 fields[0]; -} __attribute__((packed)); -BKEY_VAL_TYPE(inode, BCH_INODE_FS); - -#define BCH_INODE_FIELDS() \ - BCH_INODE_FIELD(i_atime, 64) \ - BCH_INODE_FIELD(i_ctime, 64) \ - BCH_INODE_FIELD(i_mtime, 64) \ - BCH_INODE_FIELD(i_otime, 64) \ - BCH_INODE_FIELD(i_size, 64) \ - BCH_INODE_FIELD(i_sectors, 64) \ - BCH_INODE_FIELD(i_uid, 32) \ - BCH_INODE_FIELD(i_gid, 32) \ - BCH_INODE_FIELD(i_nlink, 32) \ - BCH_INODE_FIELD(i_generation, 32) \ - BCH_INODE_FIELD(i_dev, 32) - -enum { - /* - * User flags (get/settable with FS_IOC_*FLAGS, correspond to FS_*_FL - * flags) - */ - __BCH_INODE_SYNC = 0, - __BCH_INODE_IMMUTABLE = 1, - __BCH_INODE_APPEND = 2, - __BCH_INODE_NODUMP = 3, - __BCH_INODE_NOATIME = 4, - - __BCH_INODE_I_SIZE_DIRTY= 5, - __BCH_INODE_I_SECTORS_DIRTY= 6, - - /* not implemented yet: */ - __BCH_INODE_HAS_XATTRS = 7, /* has xattrs in xattr btree */ - - /* bits 20+ reserved for packed fields below: */ -}; - -#define BCH_INODE_SYNC (1 << __BCH_INODE_SYNC) -#define BCH_INODE_IMMUTABLE (1 << __BCH_INODE_IMMUTABLE) -#define BCH_INODE_APPEND (1 << __BCH_INODE_APPEND) -#define BCH_INODE_NODUMP (1 << __BCH_INODE_NODUMP) -#define BCH_INODE_NOATIME (1 << __BCH_INODE_NOATIME) -#define BCH_INODE_I_SIZE_DIRTY (1 << __BCH_INODE_I_SIZE_DIRTY) -#define BCH_INODE_I_SECTORS_DIRTY (1 << __BCH_INODE_I_SECTORS_DIRTY) -#define BCH_INODE_HAS_XATTRS (1 << __BCH_INODE_HAS_XATTRS) - -LE32_BITMASK(INODE_STR_HASH, struct bch_inode, i_flags, 20, 24); -LE32_BITMASK(INODE_NR_FIELDS, struct bch_inode, i_flags, 24, 32); - -struct bch_inode_blockdev { - struct bch_val v; - - __le64 i_size; - __le64 i_flags; - - /* Seconds: */ - __le64 i_ctime; - __le64 i_mtime; - - uuid_le i_uuid; - __u8 
i_label[32]; -} __attribute__((packed, aligned(8))); -BKEY_VAL_TYPE(inode_blockdev, BCH_INODE_BLOCKDEV); - -/* Thin provisioned volume, or cache for another block device? */ -LE64_BITMASK(CACHED_DEV, struct bch_inode_blockdev, i_flags, 0, 1) - -/* Dirents */ - -/* - * Dirents (and xattrs) have to implement string lookups; since our b-tree - * doesn't support arbitrary length strings for the key, we instead index by a - * 64 bit hash (currently truncated sha1) of the string, stored in the offset - * field of the key - using linear probing to resolve hash collisions. This also - * provides us with the readdir cookie posix requires. - * - * Linear probing requires us to use whiteouts for deletions, in the event of a - * collision: - */ - -enum { - BCH_DIRENT = 128, - BCH_DIRENT_WHITEOUT = 129, -}; - -struct bch_dirent { - struct bch_val v; - - /* Target inode number: */ - __le64 d_inum; - - /* - * Copy of mode bits 12-15 from the target inode - so userspace can get - * the filetype without having to do a stat() - */ - __u8 d_type; - - __u8 d_name[]; -} __attribute__((packed)); -BKEY_VAL_TYPE(dirent, BCH_DIRENT); - -/* Xattrs */ - -enum { - BCH_XATTR = 128, - BCH_XATTR_WHITEOUT = 129, -}; - -#define BCH_XATTR_INDEX_USER 0 -#define BCH_XATTR_INDEX_POSIX_ACL_ACCESS 1 -#define BCH_XATTR_INDEX_POSIX_ACL_DEFAULT 2 -#define BCH_XATTR_INDEX_TRUSTED 3 -#define BCH_XATTR_INDEX_SECURITY 4 - -struct bch_xattr { - struct bch_val v; - __u8 x_type; - __u8 x_name_len; - __le16 x_val_len; - __u8 x_name[]; -} __attribute__((packed)); -BKEY_VAL_TYPE(xattr, BCH_XATTR); - -/* Superblock */ - -/* Version 0: Cache device - * Version 1: Backing device - * Version 2: Seed pointer into btree node checksum - * Version 3: Cache device with new UUID format - * Version 4: Backing device with data offset - * Version 5: All the incompat changes - * Version 6: Cache device UUIDs all in superblock, another incompat bset change - * Version 7: Encryption (expanded checksum fields), other random things - */ -#define BCACHE_SB_VERSION_CDEV_V0 0 -#define BCACHE_SB_VERSION_BDEV 1 -#define BCACHE_SB_VERSION_CDEV_WITH_UUID 3 -#define BCACHE_SB_VERSION_BDEV_WITH_OFFSET 4 -#define BCACHE_SB_VERSION_CDEV_V2 5 -#define BCACHE_SB_VERSION_CDEV_V3 6 -#define BCACHE_SB_VERSION_CDEV_V4 7 -#define BCACHE_SB_VERSION_CDEV 7 -#define BCACHE_SB_MAX_VERSION 7 - -#define BCH_SB_SECTOR 8 -#define BCH_SB_LABEL_SIZE 32 -#define BCH_SB_MEMBERS_MAX 64 /* XXX kill */ - -struct bch_member { - uuid_le uuid; - __le64 nbuckets; /* device size */ - __le16 first_bucket; /* index of first bucket used */ - __le16 bucket_size; /* sectors */ - __le32 pad; - __le64 last_mount; /* time_t */ - - __le64 flags[2]; -}; - -LE64_BITMASK(BCH_MEMBER_STATE, struct bch_member, flags[0], 0, 4) -LE64_BITMASK(BCH_MEMBER_TIER, struct bch_member, flags[0], 4, 8) -LE64_BITMASK(BCH_MEMBER_HAS_METADATA, struct bch_member, flags[0], 8, 9) -LE64_BITMASK(BCH_MEMBER_HAS_DATA, struct bch_member, flags[0], 9, 10) -LE64_BITMASK(BCH_MEMBER_REPLACEMENT, struct bch_member, flags[0], 10, 14) -LE64_BITMASK(BCH_MEMBER_DISCARD, struct bch_member, flags[0], 14, 15); - -#if 0 -LE64_BITMASK(BCH_MEMBER_NR_READ_ERRORS, struct bch_member, flags[1], 0, 20); -LE64_BITMASK(BCH_MEMBER_NR_WRITE_ERRORS,struct bch_member, flags[1], 20, 40); -#endif - -enum bch_member_state { - BCH_MEMBER_STATE_RW = 0, - BCH_MEMBER_STATE_RO = 1, - BCH_MEMBER_STATE_FAILED = 2, - BCH_MEMBER_STATE_SPARE = 3, - BCH_MEMBER_STATE_NR = 4, -}; - -#define BCH_TIER_MAX 4U - -enum cache_replacement { - CACHE_REPLACEMENT_LRU = 0, - 
CACHE_REPLACEMENT_FIFO = 1, - CACHE_REPLACEMENT_RANDOM = 2, - CACHE_REPLACEMENT_NR = 3, -}; - -struct bch_sb_layout { - uuid_le magic; /* bcache superblock UUID */ - __u8 layout_type; - __u8 sb_max_size_bits; /* base 2 of 512 byte sectors */ - __u8 nr_superblocks; - __u8 pad[5]; - __u64 sb_offset[61]; -} __attribute__((packed)); - -#define BCH_SB_LAYOUT_SECTOR 7 - -struct bch_sb_field { - __u64 _data[0]; - __le32 u64s; - __le32 type; -}; - -enum bch_sb_field_type { - BCH_SB_FIELD_journal = 0, - BCH_SB_FIELD_members = 1, - BCH_SB_FIELD_crypt = 2, - BCH_SB_FIELD_NR = 3, -}; - -struct bch_sb_field_journal { - struct bch_sb_field field; - __le64 buckets[0]; -}; - -struct bch_sb_field_members { - struct bch_sb_field field; - struct bch_member members[0]; -}; - -/* Crypto: */ - -struct nonce { - __le32 d[4]; -}; - -struct bch_key { - __le64 key[4]; -}; - -#define BCH_KEY_MAGIC \ - (((u64) 'b' << 0)|((u64) 'c' << 8)| \ - ((u64) 'h' << 16)|((u64) '*' << 24)| \ - ((u64) '*' << 32)|((u64) 'k' << 40)| \ - ((u64) 'e' << 48)|((u64) 'y' << 56)) - -struct bch_encrypted_key { - __le64 magic; - struct bch_key key; -}; - -/* - * If this field is present in the superblock, it stores an encryption key which - * is used encrypt all other data/metadata. The key will normally be encrypted - * with the key userspace provides, but if encryption has been turned off we'll - * just store the master key unencrypted in the superblock so we can access the - * previously encrypted data. - */ -struct bch_sb_field_crypt { - struct bch_sb_field field; - - __le64 flags; - __le64 kdf_flags; - struct bch_encrypted_key key; -}; - -LE64_BITMASK(BCH_CRYPT_KDF_TYPE, struct bch_sb_field_crypt, flags, 0, 4); - -enum bch_kdf_types { - BCH_KDF_SCRYPT = 0, - BCH_KDF_NR = 1, -}; - -/* stored as base 2 log of scrypt params: */ -LE64_BITMASK(BCH_KDF_SCRYPT_N, struct bch_sb_field_crypt, kdf_flags, 0, 16); -LE64_BITMASK(BCH_KDF_SCRYPT_R, struct bch_sb_field_crypt, kdf_flags, 16, 32); -LE64_BITMASK(BCH_KDF_SCRYPT_P, struct bch_sb_field_crypt, kdf_flags, 32, 48); - -struct bch_sb_field_replication { - struct bch_sb_field field; -}; - -/* - * @offset - sector where this sb was written - * @version - on disk format version - * @magic - identifies as a bcache superblock (BCACHE_MAGIC) - * @seq - incremented each time superblock is written - * @uuid - used for generating various magic numbers and identifying - * member devices, never changes - * @user_uuid - user visible UUID, may be changed - * @label - filesystem label - * @seq - identifies most recent superblock, incremented each time - * superblock is written - * @features - enabled incompatible features - */ -struct bch_sb { - struct bch_csum csum; - __le64 version; - uuid_le magic; - uuid_le uuid; - uuid_le user_uuid; - __u8 label[BCH_SB_LABEL_SIZE]; - __le64 offset; - __le64 seq; - - __le16 block_size; - __u8 dev_idx; - __u8 nr_devices; - __le32 u64s; - - __le64 time_base_lo; - __le32 time_base_hi; - __le32 time_precision; - - __le64 flags[8]; - __le64 features[2]; - __le64 compat[2]; - - struct bch_sb_layout layout; - - union { - struct bch_sb_field start[0]; - __le64 _data[0]; - }; -} __attribute__((packed, aligned(8))); - -/* - * Flags: - * BCH_SB_INITALIZED - set on first mount - * BCH_SB_CLEAN - did we shut down cleanly? 
Just a hint, doesn't affect - * behaviour of mount/recovery path: - * BCH_SB_INODE_32BIT - limit inode numbers to 32 bits - * BCH_SB_128_BIT_MACS - 128 bit macs instead of 80 - * BCH_SB_ENCRYPTION_TYPE - if nonzero encryption is enabled; overrides - * DATA/META_CSUM_TYPE. Also indicates encryption - * algorithm in use, if/when we get more than one - */ - -LE64_BITMASK(BCH_SB_INITIALIZED, struct bch_sb, flags[0], 0, 1); -LE64_BITMASK(BCH_SB_CLEAN, struct bch_sb, flags[0], 1, 2); -LE64_BITMASK(BCH_SB_CSUM_TYPE, struct bch_sb, flags[0], 2, 8); -LE64_BITMASK(BCH_SB_ERROR_ACTION, struct bch_sb, flags[0], 8, 12); - -LE64_BITMASK(BCH_SB_BTREE_NODE_SIZE, struct bch_sb, flags[0], 12, 28); - -LE64_BITMASK(BCH_SB_GC_RESERVE, struct bch_sb, flags[0], 28, 33); -LE64_BITMASK(BCH_SB_ROOT_RESERVE, struct bch_sb, flags[0], 33, 40); - -LE64_BITMASK(BCH_SB_META_CSUM_TYPE, struct bch_sb, flags[0], 40, 44); -LE64_BITMASK(BCH_SB_DATA_CSUM_TYPE, struct bch_sb, flags[0], 44, 48); - -LE64_BITMASK(BCH_SB_META_REPLICAS_WANT, struct bch_sb, flags[0], 48, 52); -LE64_BITMASK(BCH_SB_DATA_REPLICAS_WANT, struct bch_sb, flags[0], 52, 56); - -LE64_BITMASK(BCH_SB_META_REPLICAS_HAVE, struct bch_sb, flags[0], 56, 60); -LE64_BITMASK(BCH_SB_DATA_REPLICAS_HAVE, struct bch_sb, flags[0], 60, 64); - -LE64_BITMASK(BCH_SB_STR_HASH_TYPE, struct bch_sb, flags[1], 0, 4); -LE64_BITMASK(BCH_SB_COMPRESSION_TYPE, struct bch_sb, flags[1], 4, 8); -LE64_BITMASK(BCH_SB_INODE_32BIT, struct bch_sb, flags[1], 8, 9); - -LE64_BITMASK(BCH_SB_128_BIT_MACS, struct bch_sb, flags[1], 9, 10); -LE64_BITMASK(BCH_SB_ENCRYPTION_TYPE, struct bch_sb, flags[1], 10, 14); -LE64_BITMASK(BCH_SB_JOURNAL_ENTRY_SIZE, struct bch_sb, flags[1], 14, 20); - -LE64_BITMASK(BCH_SB_META_REPLICAS_REQ, struct bch_sb, flags[1], 20, 24); -LE64_BITMASK(BCH_SB_DATA_REPLICAS_REQ, struct bch_sb, flags[1], 24, 28); - -/* Features: */ -enum bch_sb_features { - BCH_FEATURE_LZ4 = 0, - BCH_FEATURE_GZIP = 1, -}; - -/* options: */ - -#define BCH_REPLICAS_MAX 4U - -#if 0 -#define BCH_ERROR_ACTIONS() \ - x(BCH_ON_ERROR_CONTINUE, 0, "continue") \ - x(BCH_ON_ERROR_RO, 1, "remount-ro") \ - x(BCH_ON_ERROR_PANIC, 2, "panic") \ - x(BCH_NR_ERROR_ACTIONS, 3, NULL) - -enum bch_error_actions { -#define x(_opt, _nr, _str) _opt = _nr, - BCH_ERROR_ACTIONS() -#undef x -}; -#endif - -enum bch_error_actions { - BCH_ON_ERROR_CONTINUE = 0, - BCH_ON_ERROR_RO = 1, - BCH_ON_ERROR_PANIC = 2, - BCH_NR_ERROR_ACTIONS = 3, -}; - -enum bch_csum_opts { - BCH_CSUM_OPT_NONE = 0, - BCH_CSUM_OPT_CRC32C = 1, - BCH_CSUM_OPT_CRC64 = 2, - BCH_CSUM_OPT_NR = 3, -}; - -enum bch_str_hash_opts { - BCH_STR_HASH_CRC32C = 0, - BCH_STR_HASH_CRC64 = 1, - BCH_STR_HASH_SIPHASH = 2, - BCH_STR_HASH_NR = 3, -}; - -enum bch_compression_opts { - BCH_COMPRESSION_NONE = 0, - BCH_COMPRESSION_LZ4 = 1, - BCH_COMPRESSION_GZIP = 2, - BCH_COMPRESSION_NR = 3, -}; - -/* backing device specific stuff: */ - -struct backingdev_sb { - __le64 csum; - __le64 offset; /* sector where this sb was written */ - __le64 version; /* of on disk format */ - - uuid_le magic; /* bcache superblock UUID */ - - uuid_le disk_uuid; - - /* - * Internal cache set UUID - xored with various magic numbers and thus - * must never change: - */ - union { - uuid_le set_uuid; - __le64 set_magic; - }; - __u8 label[BCH_SB_LABEL_SIZE]; - - __le64 flags; - - /* Incremented each time superblock is written: */ - __le64 seq; - - /* - * User visible UUID for identifying the cache set the user is allowed - * to change: - * - * XXX hooked up? 
- */ - uuid_le user_uuid; - __le64 pad1[6]; - - __le64 data_offset; - __le16 block_size; /* sectors */ - __le16 pad2[3]; - - __le32 last_mount; /* time_t */ - __le16 pad3; - /* size of variable length portion - always 0 for backingdev superblock */ - __le16 u64s; - __u64 _data[0]; -}; - -LE64_BITMASK(BDEV_CACHE_MODE, struct backingdev_sb, flags, 0, 4); -#define CACHE_MODE_WRITETHROUGH 0U -#define CACHE_MODE_WRITEBACK 1U -#define CACHE_MODE_WRITEAROUND 2U -#define CACHE_MODE_NONE 3U - -LE64_BITMASK(BDEV_STATE, struct backingdev_sb, flags, 61, 63); -#define BDEV_STATE_NONE 0U -#define BDEV_STATE_CLEAN 1U -#define BDEV_STATE_DIRTY 2U -#define BDEV_STATE_STALE 3U - -#define BDEV_DATA_START_DEFAULT 16 /* sectors */ - -static inline _Bool __SB_IS_BDEV(__u64 version) -{ - return version == BCACHE_SB_VERSION_BDEV - || version == BCACHE_SB_VERSION_BDEV_WITH_OFFSET; -} - -static inline _Bool SB_IS_BDEV(const struct bch_sb *sb) -{ - return __SB_IS_BDEV(sb->version); -} - -/* - * Magic numbers - * - * The various other data structures have their own magic numbers, which are - * xored with the first part of the cache set's UUID - */ - -#define BCACHE_MAGIC \ - UUID_LE(0xf67385c6, 0x1a4e, 0xca45, \ - 0x82, 0x65, 0xf5, 0x7f, 0x48, 0xba, 0x6d, 0x81) - -#define BCACHE_STATFS_MAGIC 0xca451a4e - -#define JSET_MAGIC __cpu_to_le64(0x245235c1a3625032ULL) -#define PSET_MAGIC __cpu_to_le64(0x6750e15f87337f91ULL) -#define BSET_MAGIC __cpu_to_le64(0x90135c78b99e07f5ULL) - -static inline __le64 __bch_sb_magic(struct bch_sb *sb) -{ - __le64 ret; - memcpy(&ret, &sb->uuid, sizeof(ret)); - return ret; -} - -static inline __u64 __jset_magic(struct bch_sb *sb) -{ - return __le64_to_cpu(__bch_sb_magic(sb) ^ JSET_MAGIC); -} - -static inline __u64 __pset_magic(struct bch_sb *sb) -{ - return __le64_to_cpu(__bch_sb_magic(sb) ^ PSET_MAGIC); -} - -static inline __u64 __bset_magic(struct bch_sb *sb) -{ - return __le64_to_cpu(__bch_sb_magic(sb) ^ BSET_MAGIC); -} - -/* Journal */ - -#define BCACHE_JSET_VERSION_UUIDv1 1 -#define BCACHE_JSET_VERSION_UUID 1 /* Always latest UUID format */ -#define BCACHE_JSET_VERSION_JKEYS 2 -#define BCACHE_JSET_VERSION 2 - -struct jset_entry { - __le16 u64s; - __u8 btree_id; - __u8 level; - __le32 flags; /* designates what this jset holds */ - - union { - struct bkey_i start[0]; - __u64 _data[0]; - }; -}; - -#define JSET_KEYS_U64s (sizeof(struct jset_entry) / sizeof(__u64)) - -LE32_BITMASK(JOURNAL_ENTRY_TYPE, struct jset_entry, flags, 0, 8); -enum { - JOURNAL_ENTRY_BTREE_KEYS = 0, - JOURNAL_ENTRY_BTREE_ROOT = 1, - JOURNAL_ENTRY_PRIO_PTRS = 2, - - /* - * Journal sequence numbers can be blacklisted: bsets record the max - * sequence number of all the journal entries they contain updates for, - * so that on recovery we can ignore those bsets that contain index - * updates newer that what made it into the journal. - * - * This means that we can't reuse that journal_seq - we have to skip it, - * and then record that we skipped it so that the next time we crash and - * recover we don't think there was a missing journal entry. - */ - JOURNAL_ENTRY_JOURNAL_SEQ_BLACKLISTED = 3, -}; - -/* - * On disk format for a journal entry: - * seq is monotonically increasing; every journal entry has its own unique - * sequence number. - * - * last_seq is the oldest journal entry that still has keys the btree hasn't - * flushed to disk yet. - * - * version is for on disk format changes. 
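The per-structure magic helpers defined just above (__jset_magic() and friends) each xor the first 64 bits of the filesystem UUID with a fixed constant, so journal or btree metadata written by a different filesystem can never validate. A standalone sketch of the computation — the UUID bytes below are invented, and the __le64 handling of the real fields is elided:

```c
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define JSET_MAGIC 0x245235c1a3625032ULL

int main(void)
{
        uint8_t uuid[16] = { 0xde, 0xad, 0xbe, 0xef }; /* invented; rest zero */
        uint64_t sb_magic;

        /* __bch_sb_magic(): first 8 bytes of the superblock UUID */
        memcpy(&sb_magic, uuid, sizeof(sb_magic));

        /* __jset_magic(): per-filesystem journal magic */
        uint64_t jset_magic = sb_magic ^ JSET_MAGIC;

        printf("jset magic for this fs: %016llx\n",
               (unsigned long long) jset_magic);
        return 0;
}
```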
- */ -struct jset { - struct bch_csum csum; - - __le64 magic; - __le64 seq; - __le32 version; - __le32 flags; - - __le32 u64s; /* size of d[] in u64s */ - - __u8 encrypted_start[0]; - - __le16 read_clock; - __le16 write_clock; - - /* Sequence number of oldest dirty journal entry */ - __le64 last_seq; - - - union { - struct jset_entry start[0]; - __u64 _data[0]; - }; -} __attribute__((packed)); - -LE32_BITMASK(JSET_CSUM_TYPE, struct jset, flags, 0, 4); -LE32_BITMASK(JSET_BIG_ENDIAN, struct jset, flags, 4, 5); - -#define BCH_JOURNAL_BUCKETS_MIN 20 - -/* Bucket prios/gens */ - -struct prio_set { - struct bch_csum csum; - - __le64 magic; - __le32 nonce[3]; - __le16 version; - __le16 flags; - - __u8 encrypted_start[0]; - - __le64 next_bucket; - - struct bucket_disk { - __le16 read_prio; - __le16 write_prio; - __u8 gen; - } __attribute__((packed)) data[]; -} __attribute__((packed)); - -LE32_BITMASK(PSET_CSUM_TYPE, struct prio_set, flags, 0, 4); - -/* Btree: */ - -#define DEFINE_BCH_BTREE_IDS() \ - DEF_BTREE_ID(EXTENTS, 0, "extents") \ - DEF_BTREE_ID(INODES, 1, "inodes") \ - DEF_BTREE_ID(DIRENTS, 2, "dirents") \ - DEF_BTREE_ID(XATTRS, 3, "xattrs") - -#define DEF_BTREE_ID(kwd, val, name) BTREE_ID_##kwd = val, - -enum btree_id { - DEFINE_BCH_BTREE_IDS() - BTREE_ID_NR -}; - -#undef DEF_BTREE_ID - -#define BTREE_MAX_DEPTH 4U - -/* Btree nodes */ - -/* Version 1: Seed pointer into btree node checksum - */ -#define BCACHE_BSET_CSUM 1 -#define BCACHE_BSET_KEY_v1 2 -#define BCACHE_BSET_JOURNAL_SEQ 3 -#define BCACHE_BSET_VERSION 3 - -/* - * Btree nodes - * - * On disk a btree node is a list/log of these; within each set the keys are - * sorted - */ -struct bset { - __le64 seq; - - /* - * Highest journal entry this bset contains keys for. - * If on recovery we don't see that journal entry, this bset is ignored: - * this allows us to preserve the order of all index updates after a - * crash, since the journal records a total order of all index updates - * and anything that didn't make it to the journal doesn't get used. 
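Putting the journal-sequence comments together, recovery can decide whether a given bset is usable with one comparison plus the blacklist check. A pseudocode sketch with invented helper and variable names — the real logic lives in the btree read path, not in this header:

```c
/*
 * Skip bsets whose updates never made it into the recovered journal:
 * the journal defines the total order of index updates, so a bset
 * tagged with a newer (or blacklisted) journal_seq must be ignored.
 */
for_each_bset(b, i) {                           /* hypothetical iterator */
        u64 seq = le64_to_cpu(i->journal_seq);

        if (seq > newest_recovered_journal_seq ||  /* hypothetical */
            journal_seq_is_blacklisted(c, seq))    /* hypothetical */
                continue;

        apply_bset_keys(b, i);                  /* hypothetical */
}
```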
- */ - __le64 journal_seq; - - __le32 flags; - __le16 version; - __le16 u64s; /* count of d[] in u64s */ - - union { - struct bkey_packed start[0]; - __u64 _data[0]; - }; -} __attribute__((packed)); - -LE32_BITMASK(BSET_CSUM_TYPE, struct bset, flags, 0, 4); - -LE32_BITMASK(BSET_BIG_ENDIAN, struct bset, flags, 4, 5); -LE32_BITMASK(BSET_SEPARATE_WHITEOUTS, - struct bset, flags, 5, 6); - -struct btree_node { - struct bch_csum csum; - __le64 magic; - - /* this flags field is encrypted, unlike bset->flags: */ - __le64 flags; - - /* Closed interval: */ - struct bpos min_key; - struct bpos max_key; - struct bch_extent_ptr ptr; - struct bkey_format format; - - union { - struct bset keys; - struct { - __u8 pad[22]; - __le16 u64s; - __u64 _data[0]; - - }; - }; -} __attribute__((packed)); - -LE64_BITMASK(BTREE_NODE_ID, struct btree_node, flags, 0, 4); -LE64_BITMASK(BTREE_NODE_LEVEL, struct btree_node, flags, 4, 8); - -struct btree_node_entry { - struct bch_csum csum; - - union { - struct bset keys; - struct { - __u8 pad[22]; - __le16 u64s; - __u64 _data[0]; - - }; - }; -} __attribute__((packed)); - -/* OBSOLETE */ - -#define BITMASK(name, type, field, offset, end) \ -static const unsigned name##_OFFSET = offset; \ -static const unsigned name##_BITS = (end - offset); \ -static const __u64 name##_MAX = (1ULL << (end - offset)) - 1; \ - \ -static inline __u64 name(const type *k) \ -{ return (k->field >> offset) & ~(~0ULL << (end - offset)); } \ - \ -static inline void SET_##name(type *k, __u64 v) \ -{ \ - k->field &= ~(~(~0ULL << (end - offset)) << offset); \ - k->field |= (v & ~(~0ULL << (end - offset))) << offset; \ -} - -struct bkey_v0 { - __u64 high; - __u64 low; - __u64 ptr[]; -}; - -#define KEY0_FIELD(name, field, offset, size) \ - BITMASK(name, struct bkey_v0, field, offset, size) - -KEY0_FIELD(KEY0_PTRS, high, 60, 63) -KEY0_FIELD(KEY0_CSUM, high, 56, 58) -KEY0_FIELD(KEY0_DIRTY, high, 36, 37) - -KEY0_FIELD(KEY0_SIZE, high, 20, 36) -KEY0_FIELD(KEY0_INODE, high, 0, 20) - -static inline unsigned long bkey_v0_u64s(const struct bkey_v0 *k) -{ - return (sizeof(struct bkey_v0) / sizeof(__u64)) + KEY0_PTRS(k); -} - -static inline struct bkey_v0 *bkey_v0_next(const struct bkey_v0 *k) -{ - __u64 *d = (__u64 *) k; - - return (struct bkey_v0 *) (d + bkey_v0_u64s(k)); -} - -struct jset_v0 { - __u64 csum; - __u64 magic; - __u64 seq; - __u32 version; - __u32 keys; - - __u64 last_seq; - - __BKEY_PADDED(uuid_bucket, 4); - __BKEY_PADDED(btree_root, 4); - __u16 btree_level; - __u16 pad[3]; - - __u64 prio_bucket[64]; - - union { - struct bkey start[0]; - __u64 d[0]; - }; -}; - -/* UUIDS - per backing device/flash only volume metadata */ - -struct uuid_entry_v0 { - uuid_le uuid; - __u8 label[32]; - __u32 first_reg; - __u32 last_reg; - __u32 invalidated; - __u32 pad; -}; - -struct uuid_entry { - union { - struct { - uuid_le uuid; - __u8 label[32]; - __u32 first_reg; - __u32 last_reg; - __u32 invalidated; - - __u32 flags; - /* Size of flash only volumes */ - __u64 sectors; - }; - - __u8 pad[128]; - }; -}; - -BITMASK(UUID_FLASH_ONLY, struct uuid_entry, flags, 0, 1); - -#ifdef __cplusplus -} -#endif -#endif /* _LINUX_BCACHE_H */ - -/* vim: set foldnestmax=2: */ diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 37a04a32..1c793b51 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -85,6 +85,12 @@ struct block_device { void generic_make_request(struct bio *); int submit_bio_wait(struct bio *); + +static inline void submit_bio(struct bio *bio) +{ + generic_make_request(bio); +} + int 
blkdev_issue_discard(struct block_device *, sector_t, sector_t, gfp_t, unsigned long); diff --git a/include/linux/closure.h b/include/linux/closure.h new file mode 100644 index 00000000..33280d30 --- /dev/null +++ b/include/linux/closure.h @@ -0,0 +1,385 @@ +#ifndef _LINUX_CLOSURE_H +#define _LINUX_CLOSURE_H + +#include <linux/llist.h> +#include <linux/sched.h> +#include <linux/workqueue.h> + +/* + * Closure is perhaps the most overused and abused term in computer science, but + * since I've been unable to come up with anything better you're stuck with it + * again. + * + * What are closures? + * + * They embed a refcount. The basic idea is they count "things that are in + * progress" - in flight bios, some other thread that's doing something else - + * anything you might want to wait on. + * + * The refcount may be manipulated with closure_get() and closure_put(). + * closure_put() is where many of the interesting things happen, when it causes + * the refcount to go to 0. + * + * Closures can be used to wait on things both synchronously and asynchronously, + * and synchronous and asynchronous use can be mixed without restriction. To + * wait synchronously, use closure_sync() - you will sleep until your closure's + * refcount hits 1. + * + * To wait asynchronously, use + * continue_at(cl, next_function, workqueue); + * + * passing it, as you might expect, the function to run when nothing is pending + * and the workqueue to run that function out of. + * + * continue_at() also, critically, requires a 'return' immediately following the + * location where this macro is referenced, to return to the calling function. + * There's good reason for this. + * + * To use safely closures asynchronously, they must always have a refcount while + * they are running owned by the thread that is running them. Otherwise, suppose + * you submit some bios and wish to have a function run when they all complete: + * + * foo_endio(struct bio *bio) + * { + * closure_put(cl); + * } + * + * closure_init(cl); + * + * do_stuff(); + * closure_get(cl); + * bio1->bi_endio = foo_endio; + * bio_submit(bio1); + * + * do_more_stuff(); + * closure_get(cl); + * bio2->bi_endio = foo_endio; + * bio_submit(bio2); + * + * continue_at(cl, complete_some_read, system_wq); + * + * If closure's refcount started at 0, complete_some_read() could run before the + * second bio was submitted - which is almost always not what you want! More + * importantly, it wouldn't be possible to say whether the original thread or + * complete_some_read()'s thread owned the closure - and whatever state it was + * associated with! + * + * So, closure_init() initializes a closure's refcount to 1 - and when a + * closure_fn is run, the refcount will be reset to 1 first. + * + * Then, the rule is - if you got the refcount with closure_get(), release it + * with closure_put() (i.e, in a bio->bi_endio function). If you have a refcount + * on a closure because you called closure_init() or you were run out of a + * closure - _always_ use continue_at(). Doing so consistently will help + * eliminate an entire class of particularly pernicious races. + * + * Lastly, you might have a wait list dedicated to a specific event, and have no + * need for specifying the condition - you just want to wait until someone runs + * closure_wake_up() on the appropriate wait list. In that case, just use + * closure_wait(). 
It will return either true or false, depending on whether the + * closure was already on a wait list or not - a closure can only be on one wait + * list at a time. + * + * Parents: + * + * closure_init() takes two arguments - it takes the closure to initialize, and + * a (possibly null) parent. + * + * If parent is non null, the new closure will have a refcount for its lifetime; + * a closure is considered to be "finished" when its refcount hits 0 and the + * function to run is null. Hence + * + * continue_at(cl, NULL, NULL); + * + * returns up the (spaghetti) stack of closures, precisely like normal return + * returns up the C stack. continue_at() with non null fn is better thought of + * as doing a tail call. + * + * All this implies that a closure should typically be embedded in a particular + * struct (which its refcount will normally control the lifetime of), and that + * struct can very much be thought of as a stack frame. + */ + +struct closure; +struct closure_syncer; +typedef void (closure_fn) (struct closure *); + +struct closure_waitlist { + struct llist_head list; +}; + +enum closure_state { + /* + * CLOSURE_WAITING: Set iff the closure is on a waitlist. Must be set by + * the thread that owns the closure, and cleared by the thread that's + * waking up the closure. + * + * The rest are for debugging and don't affect behaviour: + * + * CLOSURE_RUNNING: Set when a closure is running (i.e. by + * closure_init() and when closure_put() runs then next function), and + * must be cleared before remaining hits 0. Primarily to help guard + * against incorrect usage and accidentally transferring references. + * continue_at() and closure_return() clear it for you, if you're doing + * something unusual you can use closure_set_dead() which also helps + * annotate where references are being transferred. + */ + + CLOSURE_BITS_START = (1U << 27), + CLOSURE_DESTRUCTOR = (1U << 27), + CLOSURE_WAITING = (1U << 29), + CLOSURE_RUNNING = (1U << 31), +}; + +#define CLOSURE_GUARD_MASK \ + ((CLOSURE_DESTRUCTOR|CLOSURE_WAITING|CLOSURE_RUNNING) << 1) + +#define CLOSURE_REMAINING_MASK (CLOSURE_BITS_START - 1) +#define CLOSURE_REMAINING_INITIALIZER (1|CLOSURE_RUNNING) + +struct closure { + union { + struct { + struct workqueue_struct *wq; + struct closure_syncer *s; + struct llist_node list; + closure_fn *fn; + }; + struct work_struct work; + }; + + struct closure *parent; + + atomic_t remaining; + +#ifdef CONFIG_DEBUG_CLOSURES +#define CLOSURE_MAGIC_DEAD 0xc054dead +#define CLOSURE_MAGIC_ALIVE 0xc054a11e + + unsigned magic; + struct list_head all; + unsigned long ip; + unsigned long waiting_on; +#endif +}; + +void closure_sub(struct closure *cl, int v); +void closure_put(struct closure *cl); +void __closure_wake_up(struct closure_waitlist *list); +bool closure_wait(struct closure_waitlist *list, struct closure *cl); +void __closure_sync(struct closure *cl); + +/** + * closure_sync - sleep until a closure a closure has nothing left to wait on + * + * Sleeps until the refcount hits 1 - the thread that's running the closure owns + * the last refcount. 
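The pattern the long comment above walks through, condensed into a compilable shape: synchronous use, where the stack-allocated closure's initial ref (from closure_init_stack()) is the one closure_sync() waits down to. The bio field names follow the shim headers in this tree; error handling is omitted:

```c
static void foo_endio(struct bio *bio)
{
        struct closure *cl = bio->bi_private;

        closure_put(cl);                /* drop the ref taken before submit */
}

static void read_both(struct bio *bio1, struct bio *bio2)
{
        struct closure cl;

        closure_init_stack(&cl);        /* refcount starts at 1: we own it */

        closure_get(&cl);               /* ref for bio1's completion */
        bio1->bi_private = &cl;
        bio1->bi_end_io  = foo_endio;
        submit_bio(bio1);

        closure_get(&cl);               /* ref for bio2's completion */
        bio2->bi_private = &cl;
        bio2->bi_end_io  = foo_endio;
        submit_bio(bio2);

        closure_sync(&cl);              /* sleep until only our ref remains */
}
```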
+ */ +static inline void closure_sync(struct closure *cl) +{ + if ((atomic_read(&cl->remaining) & CLOSURE_REMAINING_MASK) != 1) + __closure_sync(cl); +} + +#ifdef CONFIG_DEBUG_CLOSURES + +void closure_debug_create(struct closure *cl); +void closure_debug_destroy(struct closure *cl); + +#else + +static inline void closure_debug_create(struct closure *cl) {} +static inline void closure_debug_destroy(struct closure *cl) {} + +#endif + +static inline void closure_set_ip(struct closure *cl) +{ +#ifdef CONFIG_DEBUG_CLOSURES + cl->ip = _THIS_IP_; +#endif +} + +static inline void closure_set_ret_ip(struct closure *cl) +{ +#ifdef CONFIG_DEBUG_CLOSURES + cl->ip = _RET_IP_; +#endif +} + +static inline void closure_set_waiting(struct closure *cl, unsigned long f) +{ +#ifdef CONFIG_DEBUG_CLOSURES + cl->waiting_on = f; +#endif +} + +static inline void closure_set_stopped(struct closure *cl) +{ + atomic_sub(CLOSURE_RUNNING, &cl->remaining); +} + +static inline void set_closure_fn(struct closure *cl, closure_fn *fn, + struct workqueue_struct *wq) +{ + closure_set_ip(cl); + cl->fn = fn; + cl->wq = wq; + /* between atomic_dec() in closure_put() */ + smp_mb__before_atomic(); +} + +static inline void closure_queue(struct closure *cl) +{ + struct workqueue_struct *wq = cl->wq; + + if (wq) { + INIT_WORK(&cl->work, cl->work.func); + queue_work(wq, &cl->work); + } else + cl->fn(cl); +} + +/** + * closure_get - increment a closure's refcount + */ +static inline void closure_get(struct closure *cl) +{ +#ifdef CONFIG_DEBUG_CLOSURES + BUG_ON((atomic_inc_return(&cl->remaining) & + CLOSURE_REMAINING_MASK) <= 1); +#else + atomic_inc(&cl->remaining); +#endif +} + +/** + * closure_init - Initialize a closure, setting the refcount to 1 + * @cl: closure to initialize + * @parent: parent of the new closure. cl will take a refcount on it for its + * lifetime; may be NULL. + */ +static inline void closure_init(struct closure *cl, struct closure *parent) +{ + cl->fn = NULL; + cl->parent = parent; + if (parent) + closure_get(parent); + + atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER); + + closure_debug_create(cl); + closure_set_ip(cl); +} + +static inline void closure_init_stack(struct closure *cl) +{ + memset(cl, 0, sizeof(struct closure)); + atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER); +} + +/** + * closure_wake_up - wake up all closures on a wait list. + */ +static inline void closure_wake_up(struct closure_waitlist *list) +{ + smp_mb(); + __closure_wake_up(list); +} + +#define continue_at_noreturn(_cl, _fn, _wq) \ +do { \ + set_closure_fn(_cl, _fn, _wq); \ + closure_sub(_cl, CLOSURE_RUNNING + 1); \ +} while (0) + +/** + * continue_at - jump to another function with barrier + * + * After @cl is no longer waiting on anything (i.e. all outstanding refs have + * been dropped with closure_put()), it will resume execution at @fn running out + * of @wq (or, if @wq is NULL, @fn will be called by closure_put() directly). + * + * NOTE: This macro expands to a return in the calling function! + * + * This is because after calling continue_at() you no longer have a ref on @cl, + * and whatever @cl owns may be freed out from under you - a running closure fn + * has a ref on its own closure which continue_at() drops. 
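And the asynchronous counterpart: once every ref taken for in-flight I/O is dropped, write_done() runs out of system_wq. Note the macro's hidden return, as warned above. struct my_op and the two helpers are invented for this sketch:

```c
static void write_done(struct closure *cl)
{
        struct my_op *op = container_of(cl, struct my_op, cl);

        finish_op(op);                  /* hypothetical */
        closure_return(cl);             /* done: drops ref on parent, if any */
}

static void start_write(struct my_op *op)
{
        closure_init(&op->cl, NULL);

        submit_all_bios(op);            /* each bio holds a closure_get() ref */

        /* expands to a return: we no longer own a ref on op->cl */
        continue_at(&op->cl, write_done, system_wq);
}
```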
+ */ +#define continue_at(_cl, _fn, _wq) \ +do { \ + continue_at_noreturn(_cl, _fn, _wq); \ + return; \ +} while (0) + +/** + * closure_return - finish execution of a closure + * + * This is used to indicate that @cl is finished: when all outstanding refs on + * @cl have been dropped @cl's ref on its parent closure (as passed to + * closure_init()) will be dropped, if one was specified - thus this can be + * thought of as returning to the parent closure. + */ +#define closure_return(_cl) continue_at((_cl), NULL, NULL) + +/** + * continue_at_nobarrier - jump to another function without barrier + * + * Causes @fn to be executed out of @cl, in @wq context (or called directly if + * @wq is NULL). + * + * NOTE: like continue_at(), this macro expands to a return in the caller! + * + * The ref the caller of continue_at_nobarrier() had on @cl is now owned by @fn, + * thus it's not safe to touch anything protected by @cl after a + * continue_at_nobarrier(). + */ +#define continue_at_nobarrier(_cl, _fn, _wq) \ +do { \ + closure_set_ip(_cl); \ + if (_wq) { \ + INIT_WORK(&(_cl)->work, (void *) _fn); \ + queue_work((_wq), &(_cl)->work); \ + } else { \ + (_fn)(_cl); \ + } \ + return; \ +} while (0) + +#define closure_return_with_destructor_noreturn(_cl, _destructor) \ +do { \ + set_closure_fn(_cl, _destructor, NULL); \ + closure_sub(_cl, CLOSURE_RUNNING - CLOSURE_DESTRUCTOR + 1); \ +} while (0) + +/** + * closure_return - finish execution of a closure, with destructor + * + * Works like closure_return(), except @destructor will be called when all + * outstanding refs on @cl have been dropped; @destructor may be used to safely + * free the memory occupied by @cl, and it is called with the ref on the parent + * closure still held - so @destructor could safely return an item to a + * freelist protected by @cl's parent. + */ +#define closure_return_with_destructor(_cl, _destructor) \ +do { \ + closure_return_with_destructor_noreturn(_cl, _destructor); \ + return; \ +} while (0) + +/** + * closure_call - execute @fn out of a new, uninitialized closure + * + * Typically used when running out of one closure, and we want to run @fn + * asynchronously out of a new closure - @parent will then wait for @cl to + * finish. 
+ */ +static inline void closure_call(struct closure *cl, closure_fn fn, + struct workqueue_struct *wq, + struct closure *parent) +{ + closure_init(cl, parent); + continue_at_nobarrier(cl, fn, wq); +} + +#endif /* _LINUX_CLOSURE_H */ diff --git a/include/trace/events/bcache.h b/include/trace/events/bcachefs.h index b39fdde7..7dea9d63 100644 --- a/include/trace/events/bcache.h +++ b/include/trace/events/bcachefs.h @@ -1,52 +1,11 @@ #undef TRACE_SYSTEM -#define TRACE_SYSTEM bcache +#define TRACE_SYSTEM bcachefs #if !defined(_TRACE_BCACHE_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_BCACHE_H #include <linux/tracepoint.h> -struct bcache_device; -struct bio; -struct bkey; -struct btree; -struct bch_dev; -struct bch_fs; -struct keylist; -struct moving_queue; - -DECLARE_EVENT_CLASS(bcache_request, - TP_PROTO(struct bcache_device *d, struct bio *bio), - TP_ARGS(d, bio), - - TP_STRUCT__entry( - __field(dev_t, dev ) - __field(unsigned int, orig_major ) - __field(unsigned int, orig_minor ) - __field(sector_t, sector ) - __field(sector_t, orig_sector ) - __field(unsigned int, nr_sector ) - __array(char, rwbs, 6 ) - ), - - TP_fast_assign( - __entry->dev = bio->bi_bdev->bd_dev; - __entry->orig_major = d->disk->major; - __entry->orig_minor = d->disk->first_minor; - __entry->sector = bio->bi_iter.bi_sector; - __entry->orig_sector = bio->bi_iter.bi_sector - 16; - __entry->nr_sector = bio->bi_iter.bi_size >> 9; - blk_fill_rwbs(__entry->rwbs, bio_op(bio), bio->bi_opf, - bio->bi_iter.bi_size); - ), - - TP_printk("%d,%d %s %llu + %u (from %d,%d @ %llu)", - MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->rwbs, (unsigned long long)__entry->sector, - __entry->nr_sector, __entry->orig_major, __entry->orig_minor, - (unsigned long long)__entry->orig_sector) -); - DECLARE_EVENT_CLASS(bpos, TP_PROTO(struct bpos p), TP_ARGS(p), @@ -84,73 +43,47 @@ DECLARE_EVENT_CLASS(bkey, __entry->offset, __entry->size) ); -/* request.c */ - -DEFINE_EVENT(bcache_request, bcache_request_start, - TP_PROTO(struct bcache_device *d, struct bio *bio), - TP_ARGS(d, bio) -); - -DEFINE_EVENT(bcache_request, bcache_request_end, - TP_PROTO(struct bcache_device *d, struct bio *bio), - TP_ARGS(d, bio) -); - -DECLARE_EVENT_CLASS(bcache_bio, - TP_PROTO(struct bio *bio), - TP_ARGS(bio), +DECLARE_EVENT_CLASS(bch_dev, + TP_PROTO(struct bch_dev *ca), + TP_ARGS(ca), TP_STRUCT__entry( - __field(dev_t, dev ) - __field(sector_t, sector ) - __field(unsigned int, nr_sector ) - __array(char, rwbs, 6 ) + __array(char, uuid, 16 ) + __field(unsigned, tier ) ), TP_fast_assign( - __entry->dev = bio->bi_bdev->bd_dev; - __entry->sector = bio->bi_iter.bi_sector; - __entry->nr_sector = bio->bi_iter.bi_size >> 9; - blk_fill_rwbs(__entry->rwbs, bio_op(bio), bio->bi_opf, - bio->bi_iter.bi_size); + memcpy(__entry->uuid, ca->uuid.b, 16); + __entry->tier = ca->mi.tier; ), - TP_printk("%d,%d %s %llu + %u", - MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, - (unsigned long long)__entry->sector, __entry->nr_sector) + TP_printk("%pU tier %u", __entry->uuid, __entry->tier) ); -DEFINE_EVENT(bcache_bio, bcache_bypass_sequential, - TP_PROTO(struct bio *bio), - TP_ARGS(bio) -); +DECLARE_EVENT_CLASS(bch_fs, + TP_PROTO(struct bch_fs *c), + TP_ARGS(c), -DEFINE_EVENT(bcache_bio, bcache_bypass_congested, - TP_PROTO(struct bio *bio), - TP_ARGS(bio) -); + TP_STRUCT__entry( + __array(char, uuid, 16 ) + ), -DEFINE_EVENT(bcache_bio, bcache_promote, - TP_PROTO(struct bio *bio), - TP_ARGS(bio) -); + TP_fast_assign( + memcpy(__entry->uuid, c->sb.user_uuid.b, 16); + ), 
-DEFINE_EVENT(bkey, bcache_promote_collision, - TP_PROTO(const struct bkey *k), - TP_ARGS(k) + TP_printk("%pU", __entry->uuid) ); -TRACE_EVENT(bcache_read, - TP_PROTO(struct bio *bio, bool hit, bool bypass), - TP_ARGS(bio, hit, bypass), +DECLARE_EVENT_CLASS(bio, + TP_PROTO(struct bio *bio), + TP_ARGS(bio), TP_STRUCT__entry( __field(dev_t, dev ) __field(sector_t, sector ) __field(unsigned int, nr_sector ) __array(char, rwbs, 6 ) - __field(bool, cache_hit ) - __field(bool, bypass ) ), TP_fast_assign( @@ -159,49 +92,53 @@ TRACE_EVENT(bcache_read, __entry->nr_sector = bio->bi_iter.bi_size >> 9; blk_fill_rwbs(__entry->rwbs, bio_op(bio), bio->bi_opf, bio->bi_iter.bi_size); - __entry->cache_hit = hit; - __entry->bypass = bypass; ), - TP_printk("%d,%d %s %llu + %u hit %u bypass %u", - MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->rwbs, (unsigned long long)__entry->sector, - __entry->nr_sector, __entry->cache_hit, __entry->bypass) + TP_printk("%d,%d %s %llu + %u", + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, + (unsigned long long)__entry->sector, __entry->nr_sector) ); -TRACE_EVENT(bcache_write, - TP_PROTO(struct bch_fs *c, u64 inode, struct bio *bio, - bool writeback, bool bypass), - TP_ARGS(c, inode, bio, writeback, bypass), +DECLARE_EVENT_CLASS(page_alloc_fail, + TP_PROTO(struct bch_fs *c, u64 size), + TP_ARGS(c, size), TP_STRUCT__entry( - __array(char, uuid, 16 ) - __field(u64, inode ) - __field(sector_t, sector ) - __field(unsigned int, nr_sector ) - __array(char, rwbs, 6 ) - __field(bool, writeback ) - __field(bool, bypass ) + __array(char, uuid, 16 ) + __field(u64, size ) ), TP_fast_assign( memcpy(__entry->uuid, c->sb.user_uuid.b, 16); - __entry->inode = inode; - __entry->sector = bio->bi_iter.bi_sector; - __entry->nr_sector = bio->bi_iter.bi_size >> 9; - blk_fill_rwbs(__entry->rwbs, bio_op(bio), bio->bi_opf, - bio->bi_iter.bi_size); - __entry->writeback = writeback; - __entry->bypass = bypass; + __entry->size = size; ), - TP_printk("%pU inode %llu %s %llu + %u hit %u bypass %u", - __entry->uuid, __entry->inode, - __entry->rwbs, (unsigned long long)__entry->sector, - __entry->nr_sector, __entry->writeback, __entry->bypass) + TP_printk("%pU size %llu", __entry->uuid, __entry->size) +); + +/* io.c: */ + +DEFINE_EVENT(bio, read_split, + TP_PROTO(struct bio *bio), + TP_ARGS(bio) ); -TRACE_EVENT(bcache_write_throttle, +DEFINE_EVENT(bio, read_bounce, + TP_PROTO(struct bio *bio), + TP_ARGS(bio) +); + +DEFINE_EVENT(bio, read_retry, + TP_PROTO(struct bio *bio), + TP_ARGS(bio) +); + +DEFINE_EVENT(bio, promote, + TP_PROTO(struct bio *bio), + TP_ARGS(bio) +); + +TRACE_EVENT(write_throttle, TP_PROTO(struct bch_fs *c, u64 inode, struct bio *bio, u64 delay), TP_ARGS(c, inode, bio, delay), @@ -230,172 +167,24 @@ TRACE_EVENT(bcache_write_throttle, __entry->nr_sector, __entry->delay) ); -DEFINE_EVENT(bcache_bio, bcache_read_retry, - TP_PROTO(struct bio *bio), - TP_ARGS(bio) -); - -DECLARE_EVENT_CLASS(page_alloc_fail, - TP_PROTO(struct bch_fs *c, u64 size), - TP_ARGS(c, size), - - TP_STRUCT__entry( - __array(char, uuid, 16 ) - __field(u64, size ) - ), - - TP_fast_assign( - memcpy(__entry->uuid, c->sb.user_uuid.b, 16); - __entry->size = size; - ), - - TP_printk("%pU size %llu", __entry->uuid, __entry->size) -); - /* Journal */ -DECLARE_EVENT_CLASS(cache_set, - TP_PROTO(struct bch_fs *c), - TP_ARGS(c), - - TP_STRUCT__entry( - __array(char, uuid, 16 ) - ), - - TP_fast_assign( - memcpy(__entry->uuid, c->sb.user_uuid.b, 16); - ), - - TP_printk("%pU", __entry->uuid) -); - -DEFINE_EVENT(bkey, 
bcache_journal_replay_key, - TP_PROTO(const struct bkey *k), - TP_ARGS(k) -); - -TRACE_EVENT(bcache_journal_next_bucket, - TP_PROTO(struct bch_dev *ca, unsigned cur_idx, unsigned last_idx), - TP_ARGS(ca, cur_idx, last_idx), - - TP_STRUCT__entry( - __array(char, uuid, 16 ) - __field(unsigned, cur_idx ) - __field(unsigned, last_idx ) - ), - - TP_fast_assign( - memcpy(__entry->uuid, ca->uuid.b, 16); - __entry->cur_idx = cur_idx; - __entry->last_idx = last_idx; - ), - - TP_printk("%pU cur %u last %u", __entry->uuid, - __entry->cur_idx, __entry->last_idx) -); - -TRACE_EVENT(bcache_journal_write_oldest, - TP_PROTO(struct bch_fs *c, u64 seq), - TP_ARGS(c, seq), - - TP_STRUCT__entry( - __array(char, uuid, 16 ) - __field(u64, seq ) - ), - - TP_fast_assign( - memcpy(__entry->uuid, c->sb.user_uuid.b, 16); - __entry->seq = seq; - ), - - TP_printk("%pU seq %llu", __entry->uuid, __entry->seq) -); - -TRACE_EVENT(bcache_journal_write_oldest_done, - TP_PROTO(struct bch_fs *c, u64 seq, unsigned written), - TP_ARGS(c, seq, written), - - TP_STRUCT__entry( - __array(char, uuid, 16 ) - __field(u64, seq ) - __field(unsigned, written ) - ), - - TP_fast_assign( - memcpy(__entry->uuid, c->sb.user_uuid.b, 16); - __entry->seq = seq; - __entry->written = written; - ), - - TP_printk("%pU seq %llu written %u", __entry->uuid, __entry->seq, - __entry->written) -); - -DEFINE_EVENT(cache_set, bcache_journal_full, +DEFINE_EVENT(bch_fs, journal_full, TP_PROTO(struct bch_fs *c), TP_ARGS(c) ); -DEFINE_EVENT(cache_set, bcache_journal_entry_full, +DEFINE_EVENT(bch_fs, journal_entry_full, TP_PROTO(struct bch_fs *c), TP_ARGS(c) ); -DEFINE_EVENT(bcache_bio, bcache_journal_write, +DEFINE_EVENT(bio, journal_write, TP_PROTO(struct bio *bio), TP_ARGS(bio) ); -/* Device state changes */ - -DEFINE_EVENT(cache_set, fs_read_only, - TP_PROTO(struct bch_fs *c), - TP_ARGS(c) -); - -DEFINE_EVENT(cache_set, fs_read_only_done, - TP_PROTO(struct bch_fs *c), - TP_ARGS(c) -); - -DECLARE_EVENT_CLASS(cache, - TP_PROTO(struct bch_dev *ca), - TP_ARGS(ca), - - TP_STRUCT__entry( - __array(char, uuid, 16 ) - __field(unsigned, tier ) - ), - - TP_fast_assign( - memcpy(__entry->uuid, ca->uuid.b, 16); - __entry->tier = ca->mi.tier; - ), - - TP_printk("%pU tier %u", __entry->uuid, __entry->tier) -); - -DEFINE_EVENT(cache, bcache_cache_read_only, - TP_PROTO(struct bch_dev *ca), - TP_ARGS(ca) -); - -DEFINE_EVENT(cache, bcache_cache_read_only_done, - TP_PROTO(struct bch_dev *ca), - TP_ARGS(ca) -); - -DEFINE_EVENT(cache, bcache_cache_read_write, - TP_PROTO(struct bch_dev *ca), - TP_ARGS(ca) -); - -DEFINE_EVENT(cache, bcache_cache_read_write_done, - TP_PROTO(struct bch_dev *ca), - TP_ARGS(ca) -); - -/* Searching */ +/* bset.c: */ DEFINE_EVENT(bpos, bkey_pack_pos_fail, TP_PROTO(struct bpos p), @@ -431,12 +220,12 @@ DECLARE_EVENT_CLASS(btree_node, __entry->inode, __entry->offset) ); -DEFINE_EVENT(btree_node, bcache_btree_read, +DEFINE_EVENT(btree_node, btree_read, TP_PROTO(struct bch_fs *c, struct btree *b), TP_ARGS(c, b) ); -TRACE_EVENT(bcache_btree_write, +TRACE_EVENT(btree_write, TP_PROTO(struct btree *b, unsigned bytes, unsigned sectors), TP_ARGS(b, bytes, sectors), @@ -456,34 +245,17 @@ TRACE_EVENT(bcache_btree_write, __entry->type , __entry->bytes, __entry->sectors) ); -DEFINE_EVENT(btree_node, bcache_btree_node_alloc, +DEFINE_EVENT(btree_node, btree_node_alloc, TP_PROTO(struct bch_fs *c, struct btree *b), TP_ARGS(c, b) ); -TRACE_EVENT(bcache_btree_node_alloc_fail, - TP_PROTO(struct bch_fs *c, enum btree_id id), - TP_ARGS(c, id), - - TP_STRUCT__entry( - 
__array(char, uuid, 16 ) - __field(enum btree_id, id ) - ), - - TP_fast_assign( - memcpy(__entry->uuid, c->sb.user_uuid.b, 16); - __entry->id = id; - ), - - TP_printk("%pU id %u", __entry->uuid, __entry->id) -); - -DEFINE_EVENT(btree_node, bcache_btree_node_free, +DEFINE_EVENT(btree_node, btree_node_free, TP_PROTO(struct bch_fs *c, struct btree *b), TP_ARGS(c, b) ); -TRACE_EVENT(bcache_mca_reap, +TRACE_EVENT(btree_node_reap, TP_PROTO(struct bch_fs *c, struct btree *b, int ret), TP_ARGS(c, b, ret), @@ -500,33 +272,7 @@ TRACE_EVENT(bcache_mca_reap, TP_printk("bucket %llu ret %d", __entry->bucket, __entry->ret) ); -TRACE_EVENT(bcache_mca_scan, - TP_PROTO(struct bch_fs *c, unsigned touched, unsigned freed, - unsigned can_free, unsigned long nr), - TP_ARGS(c, touched, freed, can_free, nr), - - TP_STRUCT__entry( - __array(char, uuid, 16 ) - __field(unsigned long, touched ) - __field(unsigned long, freed ) - __field(unsigned long, can_free ) - __field(unsigned long, nr ) - ), - - TP_fast_assign( - memcpy(__entry->uuid, c->sb.user_uuid.b, 16); - __entry->touched = touched; - __entry->freed = freed; - __entry->can_free = can_free; - __entry->nr = nr; - ), - - TP_printk("%pU touched %lu freed %lu can_free %lu nr %lu", - __entry->uuid, __entry->touched, __entry->freed, - __entry->can_free, __entry->nr) -); - -DECLARE_EVENT_CLASS(mca_cannibalize_lock, +DECLARE_EVENT_CLASS(btree_node_cannibalize_lock, TP_PROTO(struct bch_fs *c), TP_ARGS(c), @@ -541,27 +287,47 @@ DECLARE_EVENT_CLASS(mca_cannibalize_lock, TP_printk("%pU", __entry->uuid) ); -DEFINE_EVENT(mca_cannibalize_lock, bcache_mca_cannibalize_lock_fail, +DEFINE_EVENT(btree_node_cannibalize_lock, btree_node_cannibalize_lock_fail, TP_PROTO(struct bch_fs *c), TP_ARGS(c) ); -DEFINE_EVENT(mca_cannibalize_lock, bcache_mca_cannibalize_lock, +DEFINE_EVENT(btree_node_cannibalize_lock, btree_node_cannibalize_lock, TP_PROTO(struct bch_fs *c), TP_ARGS(c) ); -DEFINE_EVENT(mca_cannibalize_lock, bcache_mca_cannibalize, +DEFINE_EVENT(btree_node_cannibalize_lock, btree_node_cannibalize, TP_PROTO(struct bch_fs *c), TP_ARGS(c) ); -DEFINE_EVENT(cache_set, bcache_mca_cannibalize_unlock, +DEFINE_EVENT(bch_fs, btree_node_cannibalize_unlock, TP_PROTO(struct bch_fs *c), TP_ARGS(c) ); -TRACE_EVENT(bcache_btree_insert_key, +TRACE_EVENT(btree_reserve_get_fail, + TP_PROTO(struct bch_fs *c, size_t required, struct closure *cl), + TP_ARGS(c, required, cl), + + TP_STRUCT__entry( + __array(char, uuid, 16 ) + __field(size_t, required ) + __field(struct closure *, cl ) + ), + + TP_fast_assign( + memcpy(__entry->uuid, c->sb.user_uuid.b, 16); + __entry->required = required; + __entry->cl = cl; + ), + + TP_printk("%pU required %zu by %p", __entry->uuid, + __entry->required, __entry->cl) +); + +TRACE_EVENT(btree_insert_key, TP_PROTO(struct bch_fs *c, struct btree *b, struct bkey_i *k), TP_ARGS(c, b, k), @@ -620,24 +386,24 @@ DECLARE_EVENT_CLASS(btree_split, __entry->inode, __entry->offset, __entry->keys) ); -DEFINE_EVENT(btree_split, bcache_btree_node_split, +DEFINE_EVENT(btree_split, btree_node_split, TP_PROTO(struct bch_fs *c, struct btree *b, unsigned keys), TP_ARGS(c, b, keys) ); -DEFINE_EVENT(btree_split, bcache_btree_node_compact, +DEFINE_EVENT(btree_split, btree_node_compact, TP_PROTO(struct bch_fs *c, struct btree *b, unsigned keys), TP_ARGS(c, b, keys) ); -DEFINE_EVENT(btree_node, bcache_btree_set_root, +DEFINE_EVENT(btree_node, btree_set_root, TP_PROTO(struct bch_fs *c, struct btree *b), TP_ARGS(c, b) ); /* Garbage collection */ -TRACE_EVENT(bcache_btree_gc_coalesce, 
+TRACE_EVENT(btree_gc_coalesce, TP_PROTO(struct bch_fs *c, struct btree *b, unsigned nodes), TP_ARGS(c, b, nodes), @@ -664,7 +430,7 @@ TRACE_EVENT(bcache_btree_gc_coalesce, __entry->inode, __entry->offset, __entry->nodes) ); -TRACE_EVENT(bcache_btree_gc_coalesce_fail, +TRACE_EVENT(btree_gc_coalesce_fail, TP_PROTO(struct bch_fs *c, int reason), TP_ARGS(c, reason), @@ -681,119 +447,54 @@ TRACE_EVENT(bcache_btree_gc_coalesce_fail, TP_printk("%pU: %u", __entry->uuid, __entry->reason) ); -TRACE_EVENT(bcache_btree_node_alloc_replacement, - TP_PROTO(struct bch_fs *c, struct btree *old, struct btree *b), - TP_ARGS(c, old, b), - - TP_STRUCT__entry( - __array(char, uuid, 16 ) - __field(u64, bucket ) - __field(u64, old_bucket ) - __field(u8, level ) - __field(u8, id ) - __field(u32, inode ) - __field(u64, offset ) - ), - - TP_fast_assign( - memcpy(__entry->uuid, c->sb.user_uuid.b, 16); - __entry->old_bucket = PTR_BUCKET_NR_TRACE(c, - &old->key, 0); - __entry->bucket = PTR_BUCKET_NR_TRACE(c, &b->key, 0); - __entry->level = b->level; - __entry->id = b->btree_id; - __entry->inode = b->key.k.p.inode; - __entry->offset = b->key.k.p.offset; - ), - - TP_printk("%pU for %llu bucket %llu(%u) id %u: %u:%llu", - __entry->uuid, __entry->old_bucket, __entry->bucket, - __entry->level, __entry->id, - __entry->inode, __entry->offset) -); - -DEFINE_EVENT(btree_node, bcache_btree_gc_rewrite_node, +DEFINE_EVENT(btree_node, btree_gc_rewrite_node, TP_PROTO(struct bch_fs *c, struct btree *b), TP_ARGS(c, b) ); -DEFINE_EVENT(btree_node, bcache_btree_gc_rewrite_node_fail, +DEFINE_EVENT(btree_node, btree_gc_rewrite_node_fail, TP_PROTO(struct bch_fs *c, struct btree *b), TP_ARGS(c, b) ); -DEFINE_EVENT(cache_set, bcache_gc_start, +DEFINE_EVENT(bch_fs, gc_start, TP_PROTO(struct bch_fs *c), TP_ARGS(c) ); -DEFINE_EVENT(cache_set, bcache_gc_end, +DEFINE_EVENT(bch_fs, gc_end, TP_PROTO(struct bch_fs *c), TP_ARGS(c) ); -DEFINE_EVENT(cache_set, bcache_gc_coalesce_start, +DEFINE_EVENT(bch_fs, gc_coalesce_start, TP_PROTO(struct bch_fs *c), TP_ARGS(c) ); -DEFINE_EVENT(cache_set, bcache_gc_coalesce_end, +DEFINE_EVENT(bch_fs, gc_coalesce_end, TP_PROTO(struct bch_fs *c), TP_ARGS(c) ); -DEFINE_EVENT(cache, bcache_sectors_saturated, +DEFINE_EVENT(bch_dev, sectors_saturated, TP_PROTO(struct bch_dev *ca), TP_ARGS(ca) ); -DEFINE_EVENT(cache_set, bcache_gc_sectors_saturated, - TP_PROTO(struct bch_fs *c), - TP_ARGS(c) -); - -DEFINE_EVENT(cache_set, bcache_gc_cannot_inc_gens, +DEFINE_EVENT(bch_fs, gc_sectors_saturated, TP_PROTO(struct bch_fs *c), TP_ARGS(c) ); -DEFINE_EVENT(cache_set, bcache_gc_periodic, +DEFINE_EVENT(bch_fs, gc_cannot_inc_gens, TP_PROTO(struct bch_fs *c), TP_ARGS(c) ); -TRACE_EVENT(bcache_mark_bucket, - TP_PROTO(struct bch_dev *ca, const struct bkey *k, - const struct bch_extent_ptr *ptr, - int sectors, bool dirty), - TP_ARGS(ca, k, ptr, sectors, dirty), - - TP_STRUCT__entry( - __array(char, uuid, 16 ) - __field(u32, inode ) - __field(u64, offset ) - __field(u32, sectors ) - __field(u64, bucket ) - __field(bool, dirty ) - ), - - TP_fast_assign( - memcpy(__entry->uuid, ca->uuid.b, 16); - __entry->inode = k->p.inode; - __entry->offset = k->p.offset; - __entry->sectors = sectors; - __entry->bucket = PTR_BUCKET_NR(ca, ptr); - __entry->dirty = dirty; - ), - - TP_printk("%pU %u:%llu sectors %i bucket %llu dirty %i", - __entry->uuid, __entry->inode, __entry->offset, - __entry->sectors, __entry->bucket, __entry->dirty) -); - /* Allocator */ -TRACE_EVENT(bcache_alloc_batch, +TRACE_EVENT(alloc_batch, TP_PROTO(struct bch_dev *ca, size_t free, 
size_t total), TP_ARGS(ca, free, total), @@ -813,37 +514,17 @@ TRACE_EVENT(bcache_alloc_batch, __entry->uuid, __entry->free, __entry->total) ); -TRACE_EVENT(bcache_btree_reserve_get_fail, - TP_PROTO(struct bch_fs *c, size_t required, struct closure *cl), - TP_ARGS(c, required, cl), - - TP_STRUCT__entry( - __array(char, uuid, 16 ) - __field(size_t, required ) - __field(struct closure *, cl ) - ), - - TP_fast_assign( - memcpy(__entry->uuid, c->sb.user_uuid.b, 16); - __entry->required = required; - __entry->cl = cl; - ), - - TP_printk("%pU required %zu by %p", __entry->uuid, - __entry->required, __entry->cl) -); - -DEFINE_EVENT(cache, bcache_prio_write_start, +DEFINE_EVENT(bch_dev, prio_write_start, TP_PROTO(struct bch_dev *ca), TP_ARGS(ca) ); -DEFINE_EVENT(cache, bcache_prio_write_end, +DEFINE_EVENT(bch_dev, prio_write_end, TP_PROTO(struct bch_dev *ca), TP_ARGS(ca) ); -TRACE_EVENT(bcache_invalidate, +TRACE_EVENT(invalidate, TP_PROTO(struct bch_dev *ca, size_t bucket, unsigned sectors), TP_ARGS(ca, bucket, sectors), @@ -864,12 +545,12 @@ TRACE_EVENT(bcache_invalidate, MINOR(__entry->dev), __entry->offset) ); -DEFINE_EVENT(cache_set, bcache_rescale_prios, +DEFINE_EVENT(bch_fs, rescale_prios, TP_PROTO(struct bch_fs *c), TP_ARGS(c) ); -DECLARE_EVENT_CLASS(cache_bucket_alloc, +DECLARE_EVENT_CLASS(bucket_alloc, TP_PROTO(struct bch_dev *ca, enum alloc_reserve reserve), TP_ARGS(ca, reserve), @@ -886,17 +567,17 @@ DECLARE_EVENT_CLASS(cache_bucket_alloc, TP_printk("%pU reserve %d", __entry->uuid, __entry->reserve) ); -DEFINE_EVENT(cache_bucket_alloc, bcache_bucket_alloc, +DEFINE_EVENT(bucket_alloc, bucket_alloc, TP_PROTO(struct bch_dev *ca, enum alloc_reserve reserve), TP_ARGS(ca, reserve) ); -DEFINE_EVENT(cache_bucket_alloc, bcache_bucket_alloc_fail, +DEFINE_EVENT(bucket_alloc, bucket_alloc_fail, TP_PROTO(struct bch_dev *ca, enum alloc_reserve reserve), TP_ARGS(ca, reserve) ); -TRACE_EVENT(bcache_freelist_empty_fail, +TRACE_EVENT(freelist_empty_fail, TP_PROTO(struct bch_fs *c, enum alloc_reserve reserve, struct closure *cl), TP_ARGS(c, reserve, cl), @@ -935,47 +616,16 @@ DECLARE_EVENT_CLASS(open_bucket_alloc, __entry->uuid, __entry->cl) ); -DEFINE_EVENT(open_bucket_alloc, bcache_open_bucket_alloc, +DEFINE_EVENT(open_bucket_alloc, open_bucket_alloc, TP_PROTO(struct bch_fs *c, struct closure *cl), TP_ARGS(c, cl) ); -DEFINE_EVENT(open_bucket_alloc, bcache_open_bucket_alloc_fail, +DEFINE_EVENT(open_bucket_alloc, open_bucket_alloc_fail, TP_PROTO(struct bch_fs *c, struct closure *cl), TP_ARGS(c, cl) ); -/* Keylists */ - -TRACE_EVENT(bcache_keyscan, - TP_PROTO(unsigned nr_found, - unsigned start_inode, u64 start_offset, - unsigned end_inode, u64 end_offset), - TP_ARGS(nr_found, - start_inode, start_offset, - end_inode, end_offset), - - TP_STRUCT__entry( - __field(__u32, nr_found ) - __field(__u32, start_inode ) - __field(__u64, start_offset ) - __field(__u32, end_inode ) - __field(__u64, end_offset ) - ), - - TP_fast_assign( - __entry->nr_found = nr_found; - __entry->start_inode = start_inode; - __entry->start_offset = start_offset; - __entry->end_inode = end_inode; - __entry->end_offset = end_offset; - ), - - TP_printk("found %u keys from %u:%llu to %u:%llu", __entry->nr_found, - __entry->start_inode, __entry->start_offset, - __entry->end_inode, __entry->end_offset) -); - /* Moving IO */ DECLARE_EVENT_CLASS(moving_io, @@ -998,44 +648,39 @@ DECLARE_EVENT_CLASS(moving_io, __entry->inode, __entry->offset, __entry->sectors) ); -DEFINE_EVENT(moving_io, bcache_move_read, +DEFINE_EVENT(moving_io, move_read, 
	TP_PROTO(struct bkey *k),
	TP_ARGS(k)
);

-DEFINE_EVENT(moving_io, bcache_move_read_done,
+DEFINE_EVENT(moving_io, move_read_done,
	TP_PROTO(struct bkey *k),
	TP_ARGS(k)
);

-DEFINE_EVENT(moving_io, bcache_move_write,
+DEFINE_EVENT(moving_io, move_write,
	TP_PROTO(struct bkey *k),
	TP_ARGS(k)
);

-DEFINE_EVENT(moving_io, bcache_move_write_done,
-	TP_PROTO(struct bkey *k),
-	TP_ARGS(k)
-);
-
-DEFINE_EVENT(moving_io, bcache_copy_collision,
+DEFINE_EVENT(moving_io, copy_collision,
	TP_PROTO(struct bkey *k),
	TP_ARGS(k)
);

/* Copy GC */

-DEFINE_EVENT(page_alloc_fail, bcache_moving_gc_alloc_fail,
+DEFINE_EVENT(page_alloc_fail, moving_gc_alloc_fail,
	TP_PROTO(struct bch_fs *c, u64 size),
	TP_ARGS(c, size)
);

-DEFINE_EVENT(cache, bcache_moving_gc_start,
+DEFINE_EVENT(bch_dev, moving_gc_start,
	TP_PROTO(struct bch_dev *ca),
	TP_ARGS(ca)
);

-TRACE_EVENT(bcache_moving_gc_end,
+TRACE_EVENT(moving_gc_end,
	TP_PROTO(struct bch_dev *ca, u64 sectors_moved, u64 keys_moved,
		 u64 buckets_moved),
	TP_ARGS(ca, sectors_moved, keys_moved, buckets_moved),
@@ -1059,44 +704,24 @@ TRACE_EVENT(bcache_moving_gc_end,
		  __entry->buckets_moved)
);

-DEFINE_EVENT(cache, bcache_moving_gc_reserve_empty,
-	TP_PROTO(struct bch_dev *ca),
-	TP_ARGS(ca)
-);
-
-DEFINE_EVENT(cache, bcache_moving_gc_no_work,
-	TP_PROTO(struct bch_dev *ca),
-	TP_ARGS(ca)
-);
-
-DEFINE_EVENT(bkey, bcache_gc_copy,
+DEFINE_EVENT(bkey, gc_copy,
	TP_PROTO(const struct bkey *k),
	TP_ARGS(k)
);

/* Tiering */

-DEFINE_EVENT(cache_set, bcache_tiering_refill_start,
-	TP_PROTO(struct bch_fs *c),
-	TP_ARGS(c)
-);
-
-DEFINE_EVENT(cache_set, bcache_tiering_refill_end,
-	TP_PROTO(struct bch_fs *c),
-	TP_ARGS(c)
-);
-
-DEFINE_EVENT(page_alloc_fail, bcache_tiering_alloc_fail,
+DEFINE_EVENT(page_alloc_fail, tiering_alloc_fail,
	TP_PROTO(struct bch_fs *c, u64 size),
	TP_ARGS(c, size)
);

-DEFINE_EVENT(cache_set, bcache_tiering_start,
+DEFINE_EVENT(bch_fs, tiering_start,
	TP_PROTO(struct bch_fs *c),
	TP_ARGS(c)
);

-TRACE_EVENT(bcache_tiering_end,
+TRACE_EVENT(tiering_end,
	TP_PROTO(struct bch_fs *c, u64 sectors_moved, u64 keys_moved),
	TP_ARGS(c, sectors_moved, keys_moved),
@@ -1117,54 +742,11 @@ TRACE_EVENT(bcache_tiering_end,
		  __entry->uuid, __entry->sectors_moved, __entry->keys_moved)
);

-DEFINE_EVENT(bkey, bcache_tiering_copy,
+DEFINE_EVENT(bkey, tiering_copy,
	TP_PROTO(const struct bkey *k),
	TP_ARGS(k)
);

-/* Background writeback */
-
-DEFINE_EVENT(bkey, bcache_writeback,
-	TP_PROTO(const struct bkey *k),
-	TP_ARGS(k)
-);
-
-DEFINE_EVENT(bkey, bcache_writeback_collision,
-	TP_PROTO(const struct bkey *k),
-	TP_ARGS(k)
-);
-
-TRACE_EVENT(bcache_writeback_error,
-	TP_PROTO(struct bkey *k, bool write, int error),
-	TP_ARGS(k, write, error),
-
-	TP_STRUCT__entry(
-		__field(u32, size	)
-		__field(u32, inode	)
-		__field(u64, offset	)
-		__field(bool, write	)
-		__field(int, error	)
-	),
-
-	TP_fast_assign(
-		__entry->inode = k->p.inode;
-		__entry->offset = k->p.offset;
-		__entry->size = k->size;
-		__entry->write = write;
-		__entry->error = error;
-	),
-
-	TP_printk("%u:%llu len %u %s error %d", __entry->inode,
-		  __entry->offset, __entry->size,
-		  __entry->write ? "write" : "read",
-		  __entry->error)
-);
-
-DEFINE_EVENT(page_alloc_fail, bcache_writeback_alloc_fail,
-	TP_PROTO(struct bch_fs *c, u64 size),
-	TP_ARGS(c, size)
-);
-
#endif /* _TRACE_BCACHE_H */

/* This part must be outside protection */
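
The DECLARE_EVENT_CLASS/DEFINE_EVENT pairs rewritten throughout this diff follow the standard Linux tracepoint pattern: the class declares the shared record layout (TP_STRUCT__entry), the assignment (TP_fast_assign), and the format string (TP_printk) once, and each DEFINE_EVENT stamps out a named event on that template, which call sites fire as trace_<name>(). Below is a minimal, hypothetical userspace sketch of that relationship; the stand-in macros are illustrative only, since the real ones in <linux/tracepoint.h> expand to tracepoint registration and ring-buffer writes rather than printf.

/*
 * Hypothetical userspace sketch of the DECLARE_EVENT_CLASS /
 * DEFINE_EVENT relationship. Not the kernel macros: those generate
 * tracepoint plumbing, not printf calls.
 */
#include <stdio.h>

struct bch_fs { unsigned char user_uuid[16]; };

/* The event class provides one shared body... */
#define DECLARE_EVENT_CLASS(class, proto, body) \
	static void class##_template proto body

/* ...and each event is a named wrapper, invoked as trace_<name>(). */
#define DEFINE_EVENT(class, name, proto, args) \
	static void trace_##name proto { class##_template args; }

DECLARE_EVENT_CLASS(bch_fs, (struct bch_fs *c),
{
	printf("fs %02x%02x...: event fired\n",
	       c->user_uuid[0], c->user_uuid[1]);
})

/* Two of the renamed events from this header, stamped from one class: */
DEFINE_EVENT(bch_fs, journal_full, (struct bch_fs *c), (c))
DEFINE_EVENT(bch_fs, gc_start, (struct bch_fs *c), (c))

int main(void)
{
	struct bch_fs fs = { .user_uuid = { 0xde, 0xad } };

	trace_journal_full(&fs);	/* both events share the class body */
	trace_gc_start(&fs);
	return 0;
}

Because only class and event identifiers change, the hunks above are mechanical: cache_set becomes bch_fs, cache becomes bch_dev, bcache_bio becomes bio, and the bcache_ prefix drops from event names, so a call site such as trace_bcache_gc_start(c) becomes trace_gc_start(c) while the traced payload stays the same.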