author    Kent Overstreet <kent.overstreet@gmail.com>    2018-05-17 01:38:57 -0400
committer Kent Overstreet <kent.overstreet@gmail.com>    2018-05-17 02:36:19 -0400
commit    ff86d4722124c300c40b85b6eb8ef2d410ab303c (patch)
tree      05e54b0bf6397ecbb5e7717a7925ac6ed2645a68
parent    800408be11898f6d53ceecfd894cce8860fda26a (diff)
Update bcachefs sources to 0906b1fb49 bcachefs: fixes for 32 bit/big endian machines
-rw-r--r--  .bcachefs_revision | 2
-rw-r--r--  bcachefs.c | 4
-rw-r--r--  cmd_assemble.c | 6
-rw-r--r--  cmd_debug.c | 1
-rw-r--r--  cmd_run.c | 2
-rw-r--r--  cmds.h | 2
-rw-r--r--  include/linux/timer.h | 24
-rw-r--r--  include/linux/workqueue.h | 6
-rw-r--r--  libbcachefs/alloc.c | 22
-rw-r--r--  libbcachefs/alloc.h | 3
-rw-r--r--  libbcachefs/bcachefs.h | 58
-rw-r--r--  libbcachefs/bcachefs_format.h | 133
-rw-r--r--  libbcachefs/bcachefs_ioctl.h | 152
-rw-r--r--  libbcachefs/bkey.h | 24
-rw-r--r--  libbcachefs/bset.c | 15
-rw-r--r--  libbcachefs/bset.h | 14
-rw-r--r--  libbcachefs/btree_cache.c | 3
-rw-r--r--  libbcachefs/btree_gc.c | 26
-rw-r--r--  libbcachefs/btree_io.c | 12
-rw-r--r--  libbcachefs/btree_io.h | 2
-rw-r--r--  libbcachefs/btree_iter.c | 38
-rw-r--r--  libbcachefs/btree_iter.h | 66
-rw-r--r--  libbcachefs/btree_locking.h | 45
-rw-r--r--  libbcachefs/btree_types.h | 73
-rw-r--r--  libbcachefs/btree_update_interior.c | 18
-rw-r--r--  libbcachefs/btree_update_interior.h | 71
-rw-r--r--  libbcachefs/btree_update_leaf.c | 2
-rw-r--r--  libbcachefs/buckets.c | 13
-rw-r--r--  libbcachefs/buckets.h | 10
-rw-r--r--  libbcachefs/buckets_types.h | 2
-rw-r--r--  libbcachefs/chardev.c | 27
-rw-r--r--  libbcachefs/checksum.c | 2
-rw-r--r--  libbcachefs/clock.c | 47
-rw-r--r--  libbcachefs/clock.h | 3
-rw-r--r--  libbcachefs/compress.c | 25
-rw-r--r--  libbcachefs/disk_groups.c | 37
-rw-r--r--  libbcachefs/disk_groups.h | 28
-rw-r--r--  libbcachefs/extents.c | 16
-rw-r--r--  libbcachefs/extents.h | 30
-rw-r--r--  libbcachefs/fs-io.c | 13
-rw-r--r--  libbcachefs/fsck.c | 9
-rw-r--r--  libbcachefs/io.c | 21
-rw-r--r--  libbcachefs/journal.h | 2
-rw-r--r--  libbcachefs/journal_io.c | 47
-rw-r--r--  libbcachefs/journal_seq_blacklist.c | 2
-rw-r--r--  libbcachefs/keylist.h | 3
-rw-r--r--  libbcachefs/move.c | 18
-rw-r--r--  libbcachefs/move.h | 11
-rw-r--r--  libbcachefs/move_types.h | 14
-rw-r--r--  libbcachefs/movinggc.c | 6
-rw-r--r--  libbcachefs/rebalance.c | 341
-rw-r--r--  libbcachefs/rebalance.h (renamed from libbcachefs/tier.h) | 12
-rw-r--r--  libbcachefs/rebalance_types.h | 26
-rw-r--r--  libbcachefs/six.c | 11
-rw-r--r--  libbcachefs/super-io.c | 2
-rw-r--r--  libbcachefs/super-io.h | 8
-rw-r--r--  libbcachefs/super.c | 30
-rw-r--r--  libbcachefs/sysfs.c | 37
-rw-r--r--  libbcachefs/tier.c | 259
-rw-r--r--  libbcachefs/util.c | 31
-rw-r--r--  libbcachefs/util.h | 14
-rw-r--r--  libbcachefs/xattr.c | 2
-rw-r--r--  linux/sched.c | 21
-rw-r--r--  linux/timer.c | 2
-rw-r--r--  linux/workqueue.c | 8
65 files changed, 1230 insertions, 784 deletions
diff --git a/.bcachefs_revision b/.bcachefs_revision
index 37d51b2f..e267faa6 100644
--- a/.bcachefs_revision
+++ b/.bcachefs_revision
@@ -1 +1 @@
-ed4aea2ad4fa1b3891684cbd071d1a1ae9094342
+0906b1fb492e8e84f563b192fd8f458af1c1d420
diff --git a/bcachefs.c b/bcachefs.c
index 53806f39..1c56ead7 100644
--- a/bcachefs.c
+++ b/bcachefs.c
@@ -36,10 +36,12 @@ static void usage(void)
" fsck Check an existing filesystem for errors\n"
"\n"
"Startup/shutdown, assembly of multi device filesystems:\n"
+#if 0
" assemble Assemble an existing multi device filesystem\n"
" incremental Incrementally assemble an existing multi device filesystem\n"
" run Start a partially assembled filesystem\n"
" stop Stop a running filesystem\n"
+#endif
"\n"
"Commands for managing a running filesystem:\n"
" fs usage Show disk usage\n"
@@ -150,6 +152,7 @@ int main(int argc, char *argv[])
if (!strcmp(cmd, "fsck"))
return cmd_fsck(argc, argv);
+#if 0
if (!strcmp(cmd, "assemble"))
return cmd_assemble(argc, argv);
if (!strcmp(cmd, "incremental"))
@@ -158,6 +161,7 @@ int main(int argc, char *argv[])
return cmd_run(argc, argv);
if (!strcmp(cmd, "stop"))
return cmd_stop(argc, argv);
+#endif
if (!strcmp(cmd, "fs"))
return fs_cmds(argc, argv);
diff --git a/cmd_assemble.c b/cmd_assemble.c
index 57b28026..a997e1e1 100644
--- a/cmd_assemble.c
+++ b/cmd_assemble.c
@@ -11,6 +11,7 @@
#include "cmds.h"
#include "libbcachefs.h"
+#if 0
int cmd_assemble(int argc, char *argv[])
{
unsigned nr_devs = argc - 1;
@@ -26,7 +27,7 @@ int cmd_assemble(int argc, char *argv[])
unsigned i;
for (i = 0; i < nr_devs; i++)
- assemble->devs[i] = (__u64) argv[i + 1];
+ assemble->devs[i] = (unsigned long) argv[i + 1];
xioctl(bcachectl_open(), BCH_IOCTL_ASSEMBLE, assemble);
return 0;
@@ -38,9 +39,10 @@ int cmd_incremental(int argc, char *argv[])
die("Please supply exactly one device");
struct bch_ioctl_incremental incremental = {
- .dev = (__u64) argv[1],
+ .dev = (unsigned long) argv[1],
};
xioctl(bcachectl_open(), BCH_IOCTL_INCREMENTAL, &incremental);
return 0;
}
+#endif
diff --git a/cmd_debug.c b/cmd_debug.c
index 6c2b3184..11d73b35 100644
--- a/cmd_debug.c
+++ b/cmd_debug.c
@@ -10,6 +10,7 @@
#include "libbcachefs/bcachefs.h"
#include "libbcachefs/alloc.h"
+#include "libbcachefs/bset.h"
#include "libbcachefs/btree_cache.h"
#include "libbcachefs/btree_iter.h"
#include "libbcachefs/buckets.h"
diff --git a/cmd_run.c b/cmd_run.c
index 673d519a..1bf84e5c 100644
--- a/cmd_run.c
+++ b/cmd_run.c
@@ -15,6 +15,7 @@
#include "cmds.h"
#include "libbcachefs.h"
+#if 0
int cmd_run(int argc, char *argv[])
{
return 0;
@@ -29,3 +30,4 @@ int cmd_stop(int argc, char *argv[])
xioctl(fs.ioctl_fd, BCH_IOCTL_STOP);
return 0;
}
+#endif
diff --git a/cmds.h b/cmds.h
index 6d21db6f..258a823d 100644
--- a/cmds.h
+++ b/cmds.h
@@ -12,10 +12,12 @@
int cmd_format(int argc, char *argv[]);
int cmd_show_super(int argc, char *argv[]);
+#if 0
int cmd_assemble(int argc, char *argv[]);
int cmd_incremental(int argc, char *argv[]);
int cmd_run(int argc, char *argv[]);
int cmd_stop(int argc, char *argv[]);
+#endif
int cmd_fs_usage(int argc, char *argv[]);
diff --git a/include/linux/timer.h b/include/linux/timer.h
index 363f26a4..9667acf9 100644
--- a/include/linux/timer.h
+++ b/include/linux/timer.h
@@ -6,27 +6,22 @@
struct timer_list {
unsigned long expires;
- void (*function)(unsigned long);
- unsigned long data;
+ void (*function)(struct timer_list *timer);
bool pending;
};
-static inline void init_timer(struct timer_list *timer)
+static inline void timer_setup(struct timer_list *timer,
+ void (*func)(struct timer_list *),
+ unsigned int flags)
{
memset(timer, 0, sizeof(*timer));
+ timer->function = func;
}
-#define __init_timer(_timer, _flags) init_timer(_timer)
+#define timer_setup_on_stack(timer, callback, flags) \
+ timer_setup(timer, callback, flags)
-#define __setup_timer(_timer, _fn, _data, _flags) \
- do { \
- __init_timer((_timer), (_flags)); \
- (_timer)->function = (_fn); \
- (_timer)->data = (_data); \
- } while (0)
-
-#define setup_timer(timer, fn, data) \
- __setup_timer((timer), (fn), (data), 0)
+#define destroy_timer_on_stack(timer) do {} while (0)
static inline int timer_pending(const struct timer_list *timer)
{
@@ -36,8 +31,9 @@ static inline int timer_pending(const struct timer_list *timer)
int del_timer(struct timer_list * timer);
int del_timer_sync(struct timer_list *timer);
+#define del_singleshot_timer_sync(timer) del_timer_sync(timer)
+
int mod_timer(struct timer_list *timer, unsigned long expires);
-//extern int mod_timer_pending(struct timer_list *timer, unsigned long expires);
static inline void add_timer(struct timer_list *timer)
{
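
The shim above tracks the kernel's timer API rework: callbacks now receive the timer itself instead of an unsigned long data argument, and recover their enclosing object with container_of(). A minimal sketch of a migrated user, with hypothetical names my_dev and my_dev_timeout:

struct my_dev {
        struct timer_list       timer;
        int                     state;
};

static void my_dev_timeout(struct timer_list *t)
{
        /* what used to arrive via ->data is recovered with container_of: */
        struct my_dev *d = container_of(t, struct my_dev, timer);

        d->state = 1;
}

static void my_dev_init(struct my_dev *d)
{
        timer_setup(&d->timer, my_dev_timeout, 0);
        mod_timer(&d->timer, jiffies + HZ);
}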
diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index 213562f2..1406c958 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -8,7 +8,7 @@ struct task_struct;
struct workqueue_struct;
struct work_struct;
typedef void (*work_func_t)(struct work_struct *work);
-void delayed_work_timer_fn(unsigned long __data);
+void delayed_work_timer_fn(struct timer_list *);
#define work_data_bits(work) ((unsigned long *)(&(work)->data))
@@ -44,9 +44,7 @@ struct delayed_work {
#define INIT_DELAYED_WORK(_work, _func) \
do { \
INIT_WORK(&(_work)->work, (_func)); \
- __setup_timer(&(_work)->timer, delayed_work_timer_fn, \
- (unsigned long)(_work), \
- TIMER_IRQSAFE); \
+ timer_setup(&(_work)->timer, delayed_work_timer_fn, 0); \
} while (0)
static inline struct delayed_work *to_delayed_work(struct work_struct *work)
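
With the ->data field gone, delayed_work_timer_fn() locates its work item the same way; a sketch of what the matching definition in linux/workqueue.c might look like (the actual queueing call in the shim may differ):

void delayed_work_timer_fn(struct timer_list *timer)
{
        struct delayed_work *dwork =
                container_of(timer, struct delayed_work, timer);

        /* hand the embedded work item back to the workqueue */
        schedule_work(&dwork->work);
}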
diff --git a/libbcachefs/alloc.c b/libbcachefs/alloc.c
index 256adb51..44f9479e 100644
--- a/libbcachefs/alloc.c
+++ b/libbcachefs/alloc.c
@@ -1393,12 +1393,10 @@ static void writepoint_drop_ptrs(struct bch_fs *c,
{
int i;
- for (i = wp->first_ptr - 1; i >= 0; --i) {
- struct bch_dev *ca = bch_dev_bkey_exists(c, wp->ptrs[i]->ptr.dev);
-
- if (dev_in_target(ca, target) == in_target)
+ for (i = wp->first_ptr - 1; i >= 0; --i)
+ if (bch2_dev_in_target(c, wp->ptrs[i]->ptr.dev,
+ target) == in_target)
writepoint_drop_ptr(c, wp, i);
- }
}
static void verify_not_stale(struct bch_fs *c, const struct write_point *wp)
@@ -1555,7 +1553,7 @@ struct write_point *bch2_alloc_sectors_start(struct bch_fs *c,
/* does writepoint have ptrs we don't want to use? */
if (target)
writepoint_for_each_ptr(wp, ob, i)
- if (!dev_idx_in_target(c, ob->ptr.dev, target)) {
+ if (!bch2_dev_in_target(c, ob->ptr.dev, target)) {
swap(wp->ptrs[i], wp->ptrs[wp->first_ptr]);
wp->first_ptr++;
}
@@ -1590,7 +1588,8 @@ alloc_done:
* one in the target we want:
*/
if (cache_idx >= 0) {
- if (!dev_in_target(ca, target)) {
+ if (!bch2_dev_in_target(c, wp->ptrs[i]->ptr.dev,
+ target)) {
writepoint_drop_ptr(c, wp, i);
} else {
writepoint_drop_ptr(c, wp, cache_idx);
@@ -1621,7 +1620,7 @@ alloc_done:
if (ca->mi.durability &&
ca->mi.durability <= nr_ptrs_effective - nr_replicas &&
- !dev_idx_in_target(c, ob->ptr.dev, target)) {
+ !bch2_dev_in_target(c, ob->ptr.dev, target)) {
swap(wp->ptrs[i], wp->ptrs[wp->first_ptr]);
wp->first_ptr++;
nr_ptrs_effective -= ca->mi.durability;
@@ -1890,8 +1889,9 @@ void bch2_dev_allocator_add(struct bch_fs *c, struct bch_dev *ca)
/* stop allocator thread: */
void bch2_dev_allocator_stop(struct bch_dev *ca)
{
- struct task_struct *p = ca->alloc_thread;
+ struct task_struct *p;
+ p = rcu_dereference_protected(ca->alloc_thread, 1);
ca->alloc_thread = NULL;
/*
@@ -1926,7 +1926,7 @@ int bch2_dev_allocator_start(struct bch_dev *ca)
return PTR_ERR(p);
get_task_struct(p);
- ca->alloc_thread = p;
+ rcu_assign_pointer(ca->alloc_thread, p);
wake_up_process(p);
return 0;
}
@@ -2099,7 +2099,7 @@ again:
if (btree_node_dirty(b) && (!b->written || b->level)) {
if (btree_node_may_write(b)) {
rcu_read_unlock();
- six_lock_read(&b->lock);
+ btree_node_lock_type(c, b, SIX_LOCK_read);
bch2_btree_node_write(c, b, SIX_LOCK_read);
six_unlock_read(&b->lock);
goto again;
diff --git a/libbcachefs/alloc.h b/libbcachefs/alloc.h
index 372cc047..00d01f46 100644
--- a/libbcachefs/alloc.h
+++ b/libbcachefs/alloc.h
@@ -103,7 +103,8 @@ static inline void bch2_wake_allocator(struct bch_dev *ca)
struct task_struct *p;
rcu_read_lock();
- if ((p = READ_ONCE(ca->alloc_thread)))
+ p = rcu_dereference(ca->alloc_thread);
+ if (p)
wake_up_process(p);
rcu_read_unlock();
}
diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h
index 206c30f4..879bde20 100644
--- a/libbcachefs/bcachefs.h
+++ b/libbcachefs/bcachefs.h
@@ -197,7 +197,6 @@
#include <linux/zstd.h>
#include "bcachefs_format.h"
-#include "bset.h"
#include "fifo.h"
#include "opts.h"
#include "util.h"
@@ -271,26 +270,38 @@ do { \
#define BCH_DEBUG_PARAMS() BCH_DEBUG_PARAMS_ALWAYS()
#endif
-#define BCH_TIME_STATS() \
- BCH_TIME_STAT(btree_node_mem_alloc) \
- BCH_TIME_STAT(btree_gc) \
- BCH_TIME_STAT(btree_split) \
- BCH_TIME_STAT(btree_sort) \
- BCH_TIME_STAT(btree_read) \
- BCH_TIME_STAT(data_write) \
- BCH_TIME_STAT(data_read) \
- BCH_TIME_STAT(data_promote) \
- BCH_TIME_STAT(journal_write) \
- BCH_TIME_STAT(journal_delay) \
- BCH_TIME_STAT(journal_blocked) \
- BCH_TIME_STAT(journal_flush_seq)
+#define BCH_TIME_STATS() \
+ x(btree_node_mem_alloc) \
+ x(btree_gc) \
+ x(btree_split) \
+ x(btree_sort) \
+ x(btree_read) \
+ x(btree_lock_contended_read) \
+ x(btree_lock_contended_intent) \
+ x(btree_lock_contended_write) \
+ x(data_write) \
+ x(data_read) \
+ x(data_promote) \
+ x(journal_write) \
+ x(journal_delay) \
+ x(journal_blocked) \
+ x(journal_flush_seq)
+
+enum bch_time_stats {
+#define x(name) BCH_TIME_##name,
+ BCH_TIME_STATS()
+#undef x
+ BCH_TIME_STAT_NR
+};
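
Turning BCH_TIME_STATS() into an x-macro means the same list can stamp out more than just the enum; for example, a matching name table for sysfs (bch2_time_stat_names is a hypothetical identifier, not part of this patch):

static const char * const bch2_time_stat_names[] = {
#define x(name) #name,
        BCH_TIME_STATS()
#undef x
        NULL
};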
#include "alloc_types.h"
+#include "btree_types.h"
#include "buckets_types.h"
#include "clock_types.h"
#include "journal_types.h"
#include "keylist_types.h"
#include "quota_types.h"
+#include "rebalance_types.h"
#include "super_types.h"
/*
@@ -372,7 +383,7 @@ struct bch_dev {
struct bch_dev_usage usage_cached;
/* Allocator: */
- struct task_struct *alloc_thread;
+ struct task_struct __rcu *alloc_thread;
/*
* free: Buckets that are ready to be used
@@ -447,7 +458,6 @@ enum {
/* shutdown: */
BCH_FS_EMERGENCY_RO,
BCH_FS_WRITE_DISABLE_COMPLETE,
- BCH_FS_GC_STOPPING,
/* errors: */
BCH_FS_ERROR,
@@ -570,12 +580,6 @@ struct bch_fs {
struct delayed_work pd_controllers_update;
unsigned pd_controllers_update_seconds;
- /* REBALANCE */
- struct task_struct *rebalance_thread;
- struct bch_pd_controller rebalance_pd;
-
- atomic64_t rebalance_work_unknown_dev;
-
struct bch_devs_mask rw_devs[BCH_DATA_NR];
u64 capacity; /* sectors */
@@ -664,6 +668,9 @@ struct bch_fs {
atomic64_t key_version;
+ /* REBALANCE */
+ struct bch_fs_rebalance rebalance;
+
/* VFS IO PATH - fs-io.c */
struct bio_set writepage_bioset;
struct bio_set dio_write_bioset;
@@ -714,18 +721,13 @@ struct bch_fs {
unsigned btree_gc_periodic:1;
unsigned copy_gc_enabled:1;
- unsigned rebalance_enabled:1;
- unsigned rebalance_percent;
bool promote_whole_extents;
#define BCH_DEBUG_PARAM(name, description) bool name;
BCH_DEBUG_PARAMS_ALL()
#undef BCH_DEBUG_PARAM
-#define BCH_TIME_STAT(name) \
- struct time_stats name##_time;
- BCH_TIME_STATS()
-#undef BCH_TIME_STAT
+ struct time_stats times[BCH_TIME_STAT_NR];
};
static inline void bch2_set_ra_pages(struct bch_fs *c, unsigned ra_pages)
diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h
index 48d14a30..ab8b9446 100644
--- a/libbcachefs/bcachefs_format.h
+++ b/libbcachefs/bcachefs_format.h
@@ -3,6 +3,72 @@
/*
* bcachefs on disk data structures
+ *
+ * OVERVIEW:
+ *
+ * There are three main types of on disk data structures in bcachefs (this is
+ * reduced from 5 in bcache)
+ *
+ * - superblock
+ * - journal
+ * - btree
+ *
+ * The btree is the primary structure; most metadata exists as keys in the
+ * various btrees. There are only a small number of btrees, they're not
+ * sharded - we have one btree for extents, another for inodes, et cetera.
+ *
+ * SUPERBLOCK:
+ *
+ * The superblock contains the location of the journal, the list of devices in
+ * the filesystem, and in general any metadata we need in order to decide
+ * whether we can start a filesystem or prior to reading the journal/btree
+ * roots.
+ *
+ * The superblock is extensible, and most of the contents of the superblock are
+ * in variable length, type tagged fields; see struct bch_sb_field.
+ *
+ * Backup superblocks do not reside in a fixed location; also, superblocks do
+ * not have a fixed size. To locate backup superblocks we have struct
+ * bch_sb_layout; we store a copy of this inside every superblock, and also
+ * before the first superblock.
+ *
+ * JOURNAL:
+ *
+ * The journal primarily records btree updates in the order they occurred;
+ * journal replay consists of just iterating over all the keys in the open
+ * journal entries and re-inserting them into the btrees.
+ *
+ * The journal also contains entry types for the btree roots, and blacklisted
+ * journal sequence numbers (see journal_seq_blacklist.c).
+ *
+ * BTREE:
+ *
+ * bcachefs btrees are copy on write b+ trees, where nodes are big (typically
+ * 128k-256k) and log structured. We use struct btree_node for writing the first
+ * entry in a given node (offset 0), and struct btree_node_entry for all
+ * subsequent writes.
+ *
+ * After the header, btree node entries contain a list of keys in sorted order.
+ * Values are stored inline with the keys; since values are variable length (and
+ * keys effectively are variable length too, due to packing) we can't do random
+ * access without building up additional in memory tables in the btree node read
+ * path.
+ *
+ * BTREE KEYS (struct bkey):
+ *
+ * The various btrees share a common format for the key - so as to avoid
+ * switching in fastpath lookup/comparison code - but define their own
+ * structures for the key values.
+ *
+ * The size of a key/value pair is stored as a u8 in units of u64s, so the max
+ * size is just under 2k. The common part also contains a type tag for the
+ * value, and a format field indicating whether the key is packed or not (and
+ * also meant to allow adding new key fields in the future, if desired).
+ *
+ * bkeys, when stored within a btree node, may also be packed. In that case, the
+ * bkey_format in that node is used to unpack it. Packed bkeys mean that we can
+ * be generous with field sizes in the common part of the key format (64 bit
+ * inode number, 64 bit offset, 96 bit version field, etc.) for negligible cost.
*/
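
Since sizes are stored as a u8 counting u64s, byte sizes fall out of the header directly; a sketch (bkey_val_bytes_sketch is hypothetical, BKEY_U64s being the size of the common key header in u64s):

static inline unsigned bkey_val_bytes_sketch(const struct bkey *k)
{
        /* k->u64s covers header + value; U8_MAX * 8 is just under 2k */
        return (k->u64s - BKEY_U64s) * sizeof(__u64);
}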
#include <asm/types.h>
@@ -44,12 +110,19 @@ struct bkey_format {
/* Btree keys - all units are in sectors */
struct bpos {
- /* Word order matches machine byte order */
-#if defined(__LITTLE_ENDIAN)
+ /*
+ * Word order matches machine byte order - btree code treats a bpos as a
+ * single large integer, for search/comparison purposes
+ *
+ * Note that wherever a bpos is embedded in another on disk data
+ * structure, it has to be byte swabbed when reading in metadata that
+ * wasn't written in native endian order:
+ */
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
__u32 snapshot;
__u64 offset;
__u64 inode;
-#elif defined(__BIG_ENDIAN)
+#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
__u64 inode;
__u64 offset; /* Points to end of extent - sectors */
__u32 snapshot;
@@ -83,10 +156,10 @@ struct bch_val {
};
struct bversion {
-#if defined(__LITTLE_ENDIAN)
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
__u64 lo;
__u32 hi;
-#elif defined(__BIG_ENDIAN)
+#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
__u32 hi;
__u64 lo;
#endif
@@ -110,13 +183,13 @@ struct bkey {
/* Type of the value */
__u8 type;
-#if defined(__LITTLE_ENDIAN)
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
__u8 pad[1];
struct bversion version;
__u32 size; /* extent size, in sectors */
struct bpos p;
-#elif defined(__BIG_ENDIAN)
+#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
struct bpos p;
__u32 size; /* extent size, in sectors */
struct bversion version;
@@ -275,10 +348,10 @@ BKEY_VAL_TYPE(cookie, KEY_TYPE_COOKIE);
*
* If an extent is not checksummed or compressed, when the extent is trimmed we
* don't have to remember the extent we originally allocated and wrote: we can
- * merely adjust ptr->offset to point to the start of the start of the data that
- * is currently live. The size field in struct bkey records the current (live)
- * size of the extent, and is also used to mean "size of region on disk that we
- * point to" in this case.
+ * merely adjust ptr->offset to point to the start of the data that is currently
+ * live. The size field in struct bkey records the current (live) size of the
+ * extent, and is also used to mean "size of region on disk that we point to" in
+ * this case.
*
* Thus an extent that is not checksummed or compressed will consist only of a
* list of bch_extent_ptrs, with none of the fields in
@@ -446,11 +519,11 @@ struct bch_extent_crc128 {
#elif defined (__BIG_ENDIAN_BITFIELD)
__u64 compression_type:4,
csum_type:4,
- nonce:14,
+ nonce:13,
offset:13,
_uncompressed_size:13,
_compressed_size:13,
- type:3;
+ type:4;
#endif
struct bch_csum csum;
} __attribute__((packed, aligned(8)));
@@ -496,7 +569,7 @@ struct bch_extent_reservation {
};
union bch_extent_entry {
-#if defined(__LITTLE_ENDIAN) || __BITS_PER_LONG == 64
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ || __BITS_PER_LONG == 64
unsigned long type;
#elif __BITS_PER_LONG == 32
struct {
@@ -551,10 +624,11 @@ BKEY_VAL_TYPE(reservation, BCH_RESERVATION);
sizeof(struct bch_extent_ptr)) / sizeof(u64))
/* Maximum possible size of an entire extent value: */
-/* There's a hack in the keylist code that needs to be fixed.. */
#define BKEY_EXTENT_VAL_U64s_MAX \
(BKEY_EXTENT_PTR_U64s_MAX * (BCH_REPLICAS_MAX + 1))
+#define BKEY_PADDED(key) __BKEY_PADDED(key, BKEY_EXTENT_VAL_U64s_MAX)
+
/* Maximum possible size of an entire extent, key + value: */
#define BKEY_EXTENT_U64s_MAX (BKEY_U64s + BKEY_EXTENT_VAL_U64s_MAX)
@@ -1378,33 +1452,4 @@ struct btree_node_entry {
};
} __attribute__((packed, aligned(8)));
-/* Obsolete: */
-
-struct prio_set {
- struct bch_csum csum;
-
- __le64 magic;
- __le32 nonce[3];
- __le16 version;
- __le16 flags;
-
- __u8 encrypted_start[0];
-
- __le64 next_bucket;
-
- struct bucket_disk {
- __le16 prio[2];
- __u8 gen;
- } __attribute__((packed)) data[];
-} __attribute__((packed, aligned(8)));
-
-LE32_BITMASK(PSET_CSUM_TYPE, struct prio_set, flags, 0, 4);
-
-#define PSET_MAGIC __cpu_to_le64(0x6750e15f87337f91ULL)
-
-static inline __u64 __pset_magic(struct bch_sb *sb)
-{
- return __le64_to_cpu(__bch2_sb_magic(sb) ^ PSET_MAGIC);
-}
-
#endif /* _BCACHEFS_FORMAT_H */
diff --git a/libbcachefs/bcachefs_ioctl.h b/libbcachefs/bcachefs_ioctl.h
index 6578847b..73e5d887 100644
--- a/libbcachefs/bcachefs_ioctl.h
+++ b/libbcachefs/bcachefs_ioctl.h
@@ -5,6 +5,9 @@
#include <asm/ioctl.h>
#include "bcachefs_format.h"
+/*
+ * Flags common to multiple ioctls:
+ */
#define BCH_FORCE_IF_DATA_LOST (1 << 0)
#define BCH_FORCE_IF_METADATA_LOST (1 << 1)
#define BCH_FORCE_IF_DATA_DEGRADED (1 << 2)
@@ -14,12 +17,23 @@
(BCH_FORCE_IF_DATA_DEGRADED| \
BCH_FORCE_IF_METADATA_DEGRADED)
+/*
+ * If cleared, ioctls that refer to a device pass it as a pointer to a pathname
+ * (e.g. /dev/sda1); if set, the dev field is the device's index within the
+ * filesystem:
+ */
#define BCH_BY_INDEX (1 << 4)
+/*
+ * For BCH_IOCTL_READ_SUPER: get superblock of a specific device, not filesystem
+ * wide superblock:
+ */
#define BCH_READ_DEV (1 << 5)
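
From userspace, the two addressing modes look like this (a sketch; the path and index are arbitrary):

struct bch_ioctl_disk by_path = {
        .dev    = (unsigned long) "/dev/sda1",  /* BCH_BY_INDEX clear */
};

struct bch_ioctl_disk by_index = {
        .flags  = BCH_BY_INDEX,
        .dev    = 0,                            /* index within the filesystem */
};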
/* global control dev: */
+/* These are currently broken, and probably unnecessary: */
+#if 0
#define BCH_IOCTL_ASSEMBLE _IOW(0xbc, 1, struct bch_ioctl_assemble)
#define BCH_IOCTL_INCREMENTAL _IOW(0xbc, 2, struct bch_ioctl_incremental)
@@ -35,12 +49,18 @@ struct bch_ioctl_incremental {
__u64 pad;
__u64 dev;
};
+#endif
/* filesystem ioctls: */
#define BCH_IOCTL_QUERY_UUID _IOR(0xbc, 1, struct bch_ioctl_query_uuid)
+
+/* These only make sense when we also have incremental assembly */
+#if 0
#define BCH_IOCTL_START _IOW(0xbc, 2, struct bch_ioctl_start)
#define BCH_IOCTL_STOP _IO(0xbc, 3)
+#endif
+
#define BCH_IOCTL_DISK_ADD _IOW(0xbc, 4, struct bch_ioctl_disk)
#define BCH_IOCTL_DISK_REMOVE _IOW(0xbc, 5, struct bch_ioctl_disk)
#define BCH_IOCTL_DISK_ONLINE _IOW(0xbc, 6, struct bch_ioctl_disk)
@@ -52,14 +72,70 @@ struct bch_ioctl_incremental {
#define BCH_IOCTL_DISK_GET_IDX _IOW(0xbc, 13, struct bch_ioctl_disk_get_idx)
#define BCH_IOCTL_DISK_RESIZE _IOW(0xbc, 14, struct bch_ioctl_disk_resize)
+/*
+ * BCH_IOCTL_QUERY_UUID: get filesystem UUID
+ *
+ * Returns user visible UUID, not internal UUID (which may not ever be changed);
+ * the filesystem's sysfs directory may be found under /sys/fs/bcachefs with
+ * this UUID.
+ */
struct bch_ioctl_query_uuid {
uuid_le uuid;
};
+#if 0
struct bch_ioctl_start {
__u32 flags;
__u32 pad;
};
+#endif
+
+/*
+ * BCH_IOCTL_DISK_ADD: add a new device to an existing filesystem
+ *
+ * The specified device must not be open or in use. On success, the new device
+ * will be an online member of the filesystem just like any other member.
+ *
+ * The device must first be prepared by userspace by formatting with a bcachefs
+ * superblock, which is only used for passing in superblock options/parameters
+ * for that device (in struct bch_member). The new device's superblock should
+ * not claim to be a member of any existing filesystem - UUIDs on it will be
+ * ignored.
+ */
+
+/*
+ * BCH_IOCTL_DISK_REMOVE: permanently remove a member device from a filesystem
+ *
+ * Any data present on @dev will be permanently deleted, and @dev will be
+ * removed from its slot in the filesystem's list of member devices. The device
+ * may be either offline or online.
+ *
+ * Will fail if removing @dev would leave us with insufficient read/write
+ * devices or degraded/unavailable data, unless the appropriate
+ * BCH_FORCE_IF_* flags are set.
+ */
+
+/*
+ * BCH_IOCTL_DISK_ONLINE: given a disk that is already a member of a filesystem
+ * but is not open (e.g. because we started in degraded mode), bring it online
+ *
+ * All existing data on @dev will be available once the device is online,
+ * exactly as if @dev was present when the filesystem was first mounted.
+ */
+
+/*
+ * BCH_IOCTL_DISK_OFFLINE: offline a disk, causing the kernel to close that
+ * block device, without removing it from the filesystem (so it can be brought
+ * back online later)
+ *
+ * Data present on @dev will be unavailable while @dev is offline (unless
+ * replicated), but will still be intact and untouched if @dev is brought back
+ * online
+ *
+ * Will fail (similarly to BCH_IOCTL_DISK_SET_STATE) if offlining @dev would
+ * leave us with insufficient read/write devices or degraded/unavailable data,
+ * unless the appropriate BCH_FORCE_IF_* flags are set.
+ */
struct bch_ioctl_disk {
__u32 flags;
@@ -67,6 +143,16 @@ struct bch_ioctl_disk {
__u64 dev;
};
+/*
+ * BCH_IOCTL_DISK_SET_STATE: modify state of a member device of a filesystem
+ *
+ * @new_state - one of the bch_member_state states (rw, ro, failed,
+ * spare)
+ *
+ * Will refuse to change member state if we would then have insufficient devices
+ * to write to, or if it would result in degraded data (when @new_state is
+ * failed or spare) unless the appropriate BCH_FORCE_IF_* flags are set.
+ */
struct bch_ioctl_disk_set_state {
__u32 flags;
__u8 new_state;
@@ -81,6 +167,15 @@ enum bch_data_ops {
BCH_DATA_OP_NR = 3,
};
+/*
+ * BCH_IOCTL_DATA: operations that walk and manipulate filesystem data (e.g.
+ * scrub, rereplicate, migrate).
+ *
+ * This ioctl kicks off a job in the background, and returns a file descriptor.
+ * Reading from the file descriptor returns a struct bch_ioctl_data_event,
+ * indicating current progress, and closing the file descriptor will stop the
+ * job. The file descriptor is O_CLOEXEC.
+ */
struct bch_ioctl_data {
__u32 op;
__u32 flags;
@@ -93,9 +188,18 @@ struct bch_ioctl_data {
__u32 dev;
__u32 pad;
} migrate;
+ struct {
+ __u64 pad[8];
+ };
};
} __attribute__((packed, aligned(8)));
+enum bch_data_event {
+ BCH_DATA_EVENT_PROGRESS = 0,
+ /* XXX: add an event for reporting errors */
+ BCH_DATA_EVENT_NR = 1,
+};
+
struct bch_ioctl_data_progress {
__u8 data_type;
__u8 btree_id;
@@ -106,6 +210,15 @@ struct bch_ioctl_data_progress {
__u64 sectors_total;
} __attribute__((packed, aligned(8)));
+struct bch_ioctl_data_event {
+ __u8 type;
+ __u8 pad[7];
+ union {
+ struct bch_ioctl_data_progress p;
+ __u64 pad2[15];
+ };
+} __attribute__((packed, aligned(8)));
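
A sketch of the consumer side of this interface, assuming an open filesystem fd and BCH_DATA_OP_REREPLICATE as the op (error handling elided):

#include <stdio.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include "libbcachefs/bcachefs_ioctl.h"

static void watch_data_job(int fs_fd)
{
        struct bch_ioctl_data cmd = { .op = BCH_DATA_OP_REREPLICATE };
        struct bch_ioctl_data_event e;
        int job = ioctl(fs_fd, BCH_IOCTL_DATA, &cmd); /* O_CLOEXEC fd */

        while (read(job, &e, sizeof(e)) == sizeof(e) &&
               e.type == BCH_DATA_EVENT_PROGRESS)
                printf("%llu/%llu sectors\n",
                       (unsigned long long) e.p.sectors_done,
                       (unsigned long long) e.p.sectors_total);

        close(job); /* closing the fd stops the job */
}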
+
struct bch_ioctl_dev_usage {
__u8 state;
__u8 alive;
@@ -127,6 +240,19 @@ struct bch_ioctl_fs_usage {
__u64 sectors[BCH_DATA_NR][BCH_REPLICAS_MAX];
};
+/*
+ * BCH_IOCTL_USAGE: query filesystem disk space usage
+ *
+ * Returns disk space usage broken out by data type, number of replicas, and
+ * by component device
+ *
+ * @nr_devices - number of devices userspace allocated space for in @devs
+ *
+ * On success, @fs and @devs will be filled out appropriately and devs[i].alive
+ * will indicate if a device was present in that slot
+ *
+ * Returns -ERANGE if @nr_devices was too small
+ */
struct bch_ioctl_usage {
__u16 nr_devices;
__u16 pad[3];
@@ -135,6 +261,20 @@ struct bch_ioctl_usage {
struct bch_ioctl_dev_usage devs[0];
};
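
The -ERANGE contract implies a grow-and-retry loop in userspace; a sketch:

#include <errno.h>
#include <stdlib.h>
#include <sys/ioctl.h>

static struct bch_ioctl_usage *fs_usage(int fs_fd)
{
        unsigned nr = 4;

        while (1) {
                struct bch_ioctl_usage *u =
                        calloc(1, sizeof(*u) + nr * sizeof(u->devs[0]));

                u->nr_devices = nr;
                if (!ioctl(fs_fd, BCH_IOCTL_USAGE, u))
                        return u;       /* devs[i].alive marks present slots */
                free(u);
                if (errno != ERANGE)
                        return NULL;
                nr *= 2;                /* buffer too small - grow and retry */
        }
}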
+/*
+ * BCH_IOCTL_READ_SUPER: read filesystem superblock
+ *
+ * Equivalent to reading the superblock directly from the block device, except
+ * avoids racing with the kernel writing the superblock or having to figure out
+ * which block device to read
+ *
+ * @sb - buffer to read into
+ * @size - size of userspace allocated buffer
+ * @dev - device to read superblock for, if BCH_READ_DEV flag is
+ * specified
+ *
+ * Returns -ERANGE if buffer provided is too small
+ */
struct bch_ioctl_read_super {
__u32 flags;
__u32 pad;
@@ -143,10 +283,22 @@ struct bch_ioctl_read_super {
__u64 sb;
};
+/*
+ * BCH_IOCTL_DISK_GET_IDX: give a path to a block device, query filesystem to
+ * determine if disk is a (online) member - if so, returns device's index
+ *
+ * Returns -ENOENT if not found
+ */
struct bch_ioctl_disk_get_idx {
__u64 dev;
};
+/*
+ * BCH_IOCTL_DISK_RESIZE: resize filesystem on a device
+ *
+ * @dev - member to resize
+ * @nbuckets - new number of buckets
+ */
struct bch_ioctl_disk_resize {
__u32 flags;
__u32 pad;
diff --git a/libbcachefs/bkey.h b/libbcachefs/bkey.h
index f665e2e1..2d6c8a23 100644
--- a/libbcachefs/bkey.h
+++ b/libbcachefs/bkey.h
@@ -13,8 +13,6 @@
void bch2_to_binary(char *, const u64 *, unsigned);
-#define BKEY_PADDED(key) __BKEY_PADDED(key, BKEY_EXTENT_VAL_U64s_MAX)
-
/* bkey with split value, const */
struct bkey_s_c {
const struct bkey *k;
@@ -590,25 +588,31 @@ BKEY_VAL_ACCESSORS(quota, BCH_QUOTA);
/* byte order helpers */
-#if !defined(__LITTLE_ENDIAN) && !defined(__BIG_ENDIAN)
-#error edit for your odd byteorder.
-#endif
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
-#ifdef __LITTLE_ENDIAN
+static inline unsigned high_word_offset(const struct bkey_format *f)
+{
+ return f->key_u64s - 1;
+}
#define high_bit_offset 0
-#define __high_word(u64s, k) ((k)->_data + (u64s) - 1)
#define nth_word(p, n) ((p) - (n))
-#else
+#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+
+static inline unsigned high_word_offset(const struct bkey_format *f)
+{
+ return 0;
+}
#define high_bit_offset KEY_PACKED_BITS_START
-#define __high_word(u64s, k) ((k)->_data)
#define nth_word(p, n) ((p) + (n))
+#else
+#error edit for your odd byteorder.
#endif
-#define high_word(format, k) __high_word((format)->key_u64s, k)
+#define high_word(f, k) ((k)->_data + high_word_offset(f))
#define next_word(p) nth_word(p, 1)
#define prev_word(p) nth_word(p, -1)
diff --git a/libbcachefs/bset.c b/libbcachefs/bset.c
index 92046ae4..9a274774 100644
--- a/libbcachefs/bset.c
+++ b/libbcachefs/bset.c
@@ -6,6 +6,7 @@
*/
#include "bcachefs.h"
+#include "btree_cache.h"
#include "bset.h"
#include "eytzinger.h"
#include "util.h"
@@ -438,6 +439,10 @@ void bch2_btree_keys_free(struct btree *b)
b->aux_data = NULL;
}
+#ifndef PAGE_KERNEL_EXEC
+# define PAGE_KERNEL_EXEC PAGE_KERNEL
+#endif
+
int bch2_btree_keys_alloc(struct btree *b, unsigned page_order, gfp_t gfp)
{
b->page_order = page_order;
@@ -672,7 +677,7 @@ static inline unsigned bkey_mantissa(const struct bkey_packed *k,
* (and then the bits we want are at the high end, so we shift them
* back down):
*/
-#ifdef __LITTLE_ENDIAN
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
v >>= f->exponent & 7;
#else
v >>= 64 - (f->exponent & 7) - (idx < BFLOAT_32BIT_NR ? 32 : 16);
@@ -761,7 +766,7 @@ static void make_bfloat(struct btree *b, struct bset_tree *t,
* Then we calculate the actual shift value, from the start of the key
* (k->_data), to get the key bits starting at exponent:
*/
-#ifdef __LITTLE_ENDIAN
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
shift = (int) (b->format.key_u64s * 64 - b->nr_key_bits) + exponent;
EBUG_ON(shift + bits > b->format.key_u64s * 64);
@@ -964,10 +969,14 @@ void bch2_bset_init_first(struct btree *b, struct bset *i)
set_btree_bset(b, t, i);
}
-void bch2_bset_init_next(struct btree *b, struct bset *i)
+void bch2_bset_init_next(struct bch_fs *c, struct btree *b,
+ struct btree_node_entry *bne)
{
+ struct bset *i = &bne->keys;
struct bset_tree *t;
+ BUG_ON(bset_byte_offset(b, bne) >= btree_bytes(c));
+ BUG_ON((void *) bne < (void *) btree_bkey_last(b, bset_tree_last(b)));
BUG_ON(b->nsets >= MAX_BSETS);
memset(i, 0, sizeof(*i));
diff --git a/libbcachefs/bset.h b/libbcachefs/bset.h
index cc4ea5d8..153e2b3f 100644
--- a/libbcachefs/bset.h
+++ b/libbcachefs/bset.h
@@ -157,9 +157,6 @@ static inline bool btree_keys_expensive_checks(const struct btree *b)
#endif
}
-struct btree_node_iter;
-struct btree_node_iter_set;
-
enum bset_aux_tree_type {
BSET_NO_AUX_TREE,
BSET_RO_AUX_TREE,
@@ -342,7 +339,8 @@ int bch2_btree_keys_alloc(struct btree *, unsigned, gfp_t);
void bch2_btree_keys_init(struct btree *, bool *);
void bch2_bset_init_first(struct btree *, struct bset *);
-void bch2_bset_init_next(struct btree *, struct bset *);
+void bch2_bset_init_next(struct bch_fs *, struct btree *,
+ struct btree_node_entry *);
void bch2_bset_build_aux_tree(struct btree *, struct bset_tree *, bool);
void bch2_bset_fix_invalidated_key(struct btree *, struct bset_tree *,
struct bkey_packed *);
@@ -420,14 +418,6 @@ static inline enum bch_extent_overlap bch2_extent_overlap(const struct bkey *k,
/* Btree key iteration */
-struct btree_node_iter {
- u8 is_extents;
-
- struct btree_node_iter_set {
- u16 k, end;
- } data[MAX_BSETS];
-};
-
static inline void __bch2_btree_node_iter_init(struct btree_node_iter *iter,
bool is_extents)
{
diff --git a/libbcachefs/btree_cache.c b/libbcachefs/btree_cache.c
index 469f8565..c950f256 100644
--- a/libbcachefs/btree_cache.c
+++ b/libbcachefs/btree_cache.c
@@ -554,7 +554,8 @@ out:
b->uncompacted_whiteout_u64s = 0;
bch2_btree_keys_init(b, &c->expensive_debug_checks);
- bch2_time_stats_update(&c->btree_node_mem_alloc_time, start_time);
+ bch2_time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc],
+ start_time);
return b;
err:
diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c
index ad51f29c..cd5ebfbe 100644
--- a/libbcachefs/btree_gc.c
+++ b/libbcachefs/btree_gc.c
@@ -27,6 +27,7 @@
#include <linux/kthread.h>
#include <linux/preempt.h>
#include <linux/rcupdate.h>
+#include <linux/sched/task.h>
#include <trace/events/bcachefs.h>
struct range_checks {
@@ -264,10 +265,11 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id)
gc_pos_set(c, gc_pos_btree_node(b));
- if (max_stale > 32)
+ if (max_stale > 64)
bch2_btree_node_rewrite(c, &iter,
b->data->keys.seq,
BTREE_INSERT_USE_RESERVE|
+ BTREE_INSERT_NOWAIT|
BTREE_INSERT_GC_LOCK_HELD);
else if (!btree_gc_rewrite_disabled(c) &&
(btree_gc_always_rewrite(c) || max_stale > 16))
@@ -557,7 +559,7 @@ void bch2_gc(struct bch_fs *c)
out:
up_write(&c->gc_lock);
trace_gc_end(c);
- bch2_time_stats_update(&c->btree_gc_time, start_time);
+ bch2_time_stats_update(&c->times[BCH_TIME_btree_gc], start_time);
/*
* Wake up allocator in case it was waiting for buckets
@@ -813,6 +815,7 @@ static int bch2_coalesce_btree(struct bch_fs *c, enum btree_id btree_id)
{
struct btree_iter iter;
struct btree *b;
+ bool kthread = (current->flags & PF_KTHREAD) != 0;
unsigned i;
/* Sliding window of adjacent btree nodes */
@@ -859,7 +862,7 @@ static int bch2_coalesce_btree(struct bch_fs *c, enum btree_id btree_id)
lock_seq[0] = merge[0]->lock.state.seq;
- if (test_bit(BCH_FS_GC_STOPPING, &c->flags)) {
+ if (kthread && kthread_should_stop()) {
bch2_btree_iter_unlock(&iter);
return -ESHUTDOWN;
}
@@ -958,13 +961,15 @@ static int bch2_gc_thread(void *arg)
void bch2_gc_thread_stop(struct bch_fs *c)
{
- set_bit(BCH_FS_GC_STOPPING, &c->flags);
-
- if (c->gc_thread)
- kthread_stop(c->gc_thread);
+ struct task_struct *p;
+ p = c->gc_thread;
c->gc_thread = NULL;
- clear_bit(BCH_FS_GC_STOPPING, &c->flags);
+
+ if (p) {
+ kthread_stop(p);
+ put_task_struct(p);
+ }
}
int bch2_gc_thread_start(struct bch_fs *c)
@@ -973,12 +978,13 @@ int bch2_gc_thread_start(struct bch_fs *c)
BUG_ON(c->gc_thread);
- p = kthread_create(bch2_gc_thread, c, "bcache_gc");
+ p = kthread_create(bch2_gc_thread, c, "bch_gc");
if (IS_ERR(p))
return PTR_ERR(p);
+ get_task_struct(p);
c->gc_thread = p;
- wake_up_process(c->gc_thread);
+ wake_up_process(p);
return 0;
}
diff --git a/libbcachefs/btree_io.c b/libbcachefs/btree_io.c
index 1aa94229..74ffad4c 100644
--- a/libbcachefs/btree_io.c
+++ b/libbcachefs/btree_io.c
@@ -627,7 +627,8 @@ static void btree_node_sort(struct bch_fs *c, struct btree *b,
BUG_ON(vstruct_end(&out->keys) > (void *) out + (PAGE_SIZE << order));
if (sorting_entire_node)
- bch2_time_stats_update(&c->btree_sort_time, start_time);
+ bch2_time_stats_update(&c->times[BCH_TIME_btree_sort],
+ start_time);
/* Make sure we preserve bset journal_seq: */
for (t = b->set + start_idx; t < b->set + end_idx; t++)
@@ -801,7 +802,7 @@ void bch2_btree_sort_into(struct bch_fs *c,
&dst->format,
true);
- bch2_time_stats_update(&c->btree_sort_time, start_time);
+ bch2_time_stats_update(&c->times[BCH_TIME_btree_sort], start_time);
set_btree_bset_end(dst, dst->set);
@@ -877,7 +878,7 @@ void bch2_btree_init_next(struct bch_fs *c, struct btree *b,
bne = want_new_bset(c, b);
if (bne)
- bch2_bset_init_next(b, &bne->keys);
+ bch2_bset_init_next(c, b, bne);
bch2_btree_build_aux_trees(b);
@@ -1382,7 +1383,7 @@ start:
}
}
- bch2_time_stats_update(&c->btree_read_time, rb->start_time);
+ bch2_time_stats_update(&c->times[BCH_TIME_btree_read], rb->start_time);
bio_put(&rb->bio);
clear_btree_node_read_in_flight(b);
wake_up_bit(&b->flags, BTREE_NODE_read_in_flight);
@@ -1742,6 +1743,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
BUG_ON((b->will_make_reachable != 0) != !b->written);
BUG_ON(b->written >= c->opts.btree_node_size);
+ BUG_ON(b->written & (c->opts.block_size - 1));
BUG_ON(bset_written(b, btree_bset_last(b)));
BUG_ON(le64_to_cpu(b->data->magic) != bset_magic(c));
BUG_ON(memcmp(&b->data->format, &b->format, sizeof(b->format)));
@@ -1972,7 +1974,7 @@ bool bch2_btree_post_write_cleanup(struct bch_fs *c, struct btree *b)
bne = want_new_bset(c, b);
if (bne)
- bch2_bset_init_next(b, &bne->keys);
+ bch2_bset_init_next(c, b, bne);
bch2_btree_build_aux_trees(b);
diff --git a/libbcachefs/btree_io.h b/libbcachefs/btree_io.h
index 947685f9..fa154642 100644
--- a/libbcachefs/btree_io.h
+++ b/libbcachefs/btree_io.h
@@ -133,7 +133,7 @@ do { \
\
six_unlock_read(&(_b)->lock); \
btree_node_wait_on_io(_b); \
- six_lock_read(&(_b)->lock); \
+ btree_node_lock_type(c, b, SIX_LOCK_read); \
} \
} while (0)
diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c
index 69cad3bb..70c3132e 100644
--- a/libbcachefs/btree_iter.c
+++ b/libbcachefs/btree_iter.c
@@ -42,37 +42,28 @@ void bch2_btree_node_unlock_write(struct btree *b, struct btree_iter *iter)
six_unlock_write(&b->lock);
}
-void bch2_btree_node_lock_write(struct btree *b, struct btree_iter *iter)
+void __bch2_btree_node_lock_write(struct btree *b, struct btree_iter *iter)
{
+ struct bch_fs *c = iter->c;
struct btree_iter *linked;
unsigned readers = 0;
- EBUG_ON(iter->l[b->level].b != b);
- EBUG_ON(iter->lock_seq[b->level] != b->lock.state.seq);
-
- if (six_trylock_write(&b->lock))
- return;
-
for_each_linked_btree_iter(iter, linked)
if (linked->l[b->level].b == b &&
btree_node_read_locked(linked, b->level))
readers++;
- if (likely(!readers)) {
- six_lock_write(&b->lock);
- } else {
- /*
- * Must drop our read locks before calling six_lock_write() -
- * six_unlock() won't do wakeups until the reader count
- * goes to 0, and it's safe because we have the node intent
- * locked:
- */
- atomic64_sub(__SIX_VAL(read_lock, readers),
- &b->lock.state.counter);
- six_lock_write(&b->lock);
- atomic64_add(__SIX_VAL(read_lock, readers),
- &b->lock.state.counter);
- }
+ /*
+ * Must drop our read locks before calling six_lock_write() -
+ * six_unlock() won't do wakeups until the reader count
+ * goes to 0, and it's safe because we have the node intent
+ * locked:
+ */
+ atomic64_sub(__SIX_VAL(read_lock, readers),
+ &b->lock.state.counter);
+ btree_node_lock_type(c, b, SIX_LOCK_write);
+ atomic64_add(__SIX_VAL(read_lock, readers),
+ &b->lock.state.counter);
}
bool __bch2_btree_node_relock(struct btree_iter *iter, unsigned level)
@@ -135,6 +126,7 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos,
struct btree_iter *iter,
enum six_lock_type type)
{
+ struct bch_fs *c = iter->c;
struct btree_iter *linked;
/* Can't have children locked before ancestors: */
@@ -206,7 +198,7 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos,
}
}
- six_lock_type(&b->lock, type);
+ __btree_node_lock_type(c, b, type);
return true;
}
diff --git a/libbcachefs/btree_iter.h b/libbcachefs/btree_iter.h
index 95191ba2..0097a2a2 100644
--- a/libbcachefs/btree_iter.h
+++ b/libbcachefs/btree_iter.h
@@ -4,72 +4,6 @@
#include <linux/dynamic_fault.h>
#include "btree_types.h"
-#include "bset.h"
-
-#define BTREE_ITER_SLOTS (1 << 0)
-#define BTREE_ITER_INTENT (1 << 1)
-#define BTREE_ITER_PREFETCH (1 << 2)
-/*
- * Used in bch2_btree_iter_traverse(), to indicate whether we're searching for
- * @pos or the first key strictly greater than @pos
- */
-#define BTREE_ITER_IS_EXTENTS (1 << 3)
-/*
- * indicates we need to call bch2_btree_iter_traverse() to revalidate iterator:
- */
-#define BTREE_ITER_AT_END_OF_LEAF (1 << 4)
-#define BTREE_ITER_ERROR (1 << 5)
-
-enum btree_iter_uptodate {
- BTREE_ITER_UPTODATE = 0,
- BTREE_ITER_NEED_PEEK = 1,
- BTREE_ITER_NEED_RELOCK = 2,
- BTREE_ITER_NEED_TRAVERSE = 3,
- BTREE_ITER_END = 4,
-};
-
-/*
- * @pos - iterator's current position
- * @level - current btree depth
- * @locks_want - btree level below which we start taking intent locks
- * @nodes_locked - bitmask indicating which nodes in @nodes are locked
- * @nodes_intent_locked - bitmask indicating which locks are intent locks
- */
-struct btree_iter {
- struct bch_fs *c;
- struct bpos pos;
-
- u8 flags;
- unsigned uptodate:4;
- enum btree_id btree_id:4;
- unsigned level:4,
- locks_want:4,
- nodes_locked:4,
- nodes_intent_locked:4;
-
- struct btree_iter_level {
- struct btree *b;
- struct btree_node_iter iter;
- } l[BTREE_MAX_DEPTH];
-
- u32 lock_seq[BTREE_MAX_DEPTH];
-
- /*
- * Current unpacked key - so that bch2_btree_iter_next()/
- * bch2_btree_iter_next_slot() can correctly advance pos.
- */
- struct bkey k;
-
- /*
- * Circular linked list of linked iterators: linked iterators share
- * locks (e.g. two linked iterators may have the same node intent
- * locked, or read and write locked, at the same time), and insertions
- * through one iterator won't invalidate the other linked iterators.
- */
-
- /* Must come last: */
- struct btree_iter *next;
-};
static inline void btree_iter_set_dirty(struct btree_iter *iter,
enum btree_iter_uptodate u)
diff --git a/libbcachefs/btree_locking.h b/libbcachefs/btree_locking.h
index 0581f44a..f48084bc 100644
--- a/libbcachefs/btree_locking.h
+++ b/libbcachefs/btree_locking.h
@@ -98,6 +98,39 @@ static inline void btree_node_unlock(struct btree_iter *iter, unsigned level)
mark_btree_node_unlocked(iter, level);
}
+static inline enum bch_time_stats lock_to_time_stat(enum six_lock_type type)
+{
+ switch (type) {
+ case SIX_LOCK_read:
+ return BCH_TIME_btree_lock_contended_read;
+ case SIX_LOCK_intent:
+ return BCH_TIME_btree_lock_contended_intent;
+ case SIX_LOCK_write:
+ return BCH_TIME_btree_lock_contended_write;
+ default:
+ BUG();
+ }
+}
+
+/*
+ * wrapper around six locks that just traces lock contended time
+ */
+static inline void __btree_node_lock_type(struct bch_fs *c, struct btree *b,
+ enum six_lock_type type)
+{
+ u64 start_time = local_clock();
+
+ six_lock_type(&b->lock, type);
+ bch2_time_stats_update(&c->times[lock_to_time_stat(type)], start_time);
+}
+
+static inline void btree_node_lock_type(struct bch_fs *c, struct btree *b,
+ enum six_lock_type type)
+{
+ if (!six_trylock_type(&b->lock, type))
+ __btree_node_lock_type(c, b, type);
+}
+
bool __bch2_btree_node_lock(struct btree *, struct bpos, unsigned,
struct btree_iter *, enum six_lock_type);
@@ -125,7 +158,17 @@ static inline bool bch2_btree_node_relock(struct btree_iter *iter,
bool bch2_btree_iter_relock(struct btree_iter *);
void bch2_btree_node_unlock_write(struct btree *, struct btree_iter *);
-void bch2_btree_node_lock_write(struct btree *, struct btree_iter *);
+
+void __bch2_btree_node_lock_write(struct btree *, struct btree_iter *);
+
+static inline void bch2_btree_node_lock_write(struct btree *b, struct btree_iter *iter)
+{
+ EBUG_ON(iter->l[b->level].b != b);
+ EBUG_ON(iter->lock_seq[b->level] != b->lock.state.seq);
+
+ if (!six_trylock_write(&b->lock))
+ __bch2_btree_node_lock_write(b, iter);
+}
#endif /* _BCACHEFS_BTREE_LOCKING_H */
diff --git a/libbcachefs/btree_types.h b/libbcachefs/btree_types.h
index 8854305d..f62c96d9 100644
--- a/libbcachefs/btree_types.h
+++ b/libbcachefs/btree_types.h
@@ -176,6 +176,79 @@ struct btree_cache {
struct closure_waitlist alloc_wait;
};
+struct btree_node_iter {
+ u8 is_extents;
+
+ struct btree_node_iter_set {
+ u16 k, end;
+ } data[MAX_BSETS];
+};
+
+#define BTREE_ITER_SLOTS (1 << 0)
+#define BTREE_ITER_INTENT (1 << 1)
+#define BTREE_ITER_PREFETCH (1 << 2)
+/*
+ * Used in bch2_btree_iter_traverse(), to indicate whether we're searching for
+ * @pos or the first key strictly greater than @pos
+ */
+#define BTREE_ITER_IS_EXTENTS (1 << 3)
+/*
+ * indicates we need to call bch2_btree_iter_traverse() to revalidate iterator:
+ */
+#define BTREE_ITER_AT_END_OF_LEAF (1 << 4)
+#define BTREE_ITER_ERROR (1 << 5)
+
+enum btree_iter_uptodate {
+ BTREE_ITER_UPTODATE = 0,
+ BTREE_ITER_NEED_PEEK = 1,
+ BTREE_ITER_NEED_RELOCK = 2,
+ BTREE_ITER_NEED_TRAVERSE = 3,
+ BTREE_ITER_END = 4,
+};
+
+/*
+ * @pos - iterator's current position
+ * @level - current btree depth
+ * @locks_want - btree level below which we start taking intent locks
+ * @nodes_locked - bitmask indicating which nodes in @nodes are locked
+ * @nodes_intent_locked - bitmask indicating which locks are intent locks
+ */
+struct btree_iter {
+ struct bch_fs *c;
+ struct bpos pos;
+
+ u8 flags;
+ unsigned uptodate:4;
+ enum btree_id btree_id:4;
+ unsigned level:4,
+ locks_want:4,
+ nodes_locked:4,
+ nodes_intent_locked:4;
+
+ struct btree_iter_level {
+ struct btree *b;
+ struct btree_node_iter iter;
+ } l[BTREE_MAX_DEPTH];
+
+ u32 lock_seq[BTREE_MAX_DEPTH];
+
+ /*
+ * Current unpacked key - so that bch2_btree_iter_next()/
+ * bch2_btree_iter_next_slot() can correctly advance pos.
+ */
+ struct bkey k;
+
+ /*
+ * Circular linked list of linked iterators: linked iterators share
+ * locks (e.g. two linked iterators may have the same node intent
+ * locked, or read and write locked, at the same time), and insertions
+ * through one iterator won't invalidate the other linked iterators.
+ */
+
+ /* Must come last: */
+ struct btree_iter *next;
+};
+
#define BTREE_FLAG(flag) \
static inline bool btree_node_ ## flag(struct btree *b) \
{ return test_bit(BTREE_NODE_ ## flag, &b->flags); } \
diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c
index adba3092..c3ecc1e9 100644
--- a/libbcachefs/btree_update_interior.c
+++ b/libbcachefs/btree_update_interior.c
@@ -237,7 +237,7 @@ static void __btree_node_free(struct bch_fs *c, struct btree *b,
clear_btree_node_noevict(b);
- six_lock_write(&b->lock);
+ btree_node_lock_type(c, b, SIX_LOCK_write);
bch2_btree_node_hash_remove(&c->btree_cache, b);
@@ -622,7 +622,7 @@ static void btree_update_nodes_reachable(struct closure *cl)
* b->will_make_reachable prevented it from being written, so
* write it now if it needs to be written:
*/
- six_lock_read(&b->lock);
+ btree_node_lock_type(c, b, SIX_LOCK_read);
bch2_btree_node_write_cond(c, b, btree_node_need_write(b));
six_unlock_read(&b->lock);
mutex_lock(&c->btree_interior_update_lock);
@@ -647,8 +647,10 @@ static void btree_update_wait_on_journal(struct closure *cl)
ret = bch2_journal_open_seq_async(&c->journal, as->journal_seq, cl);
if (ret < 0)
goto err;
- if (!ret)
+ if (!ret) {
continue_at(cl, btree_update_wait_on_journal, system_wq);
+ return;
+ }
bch2_journal_flush_seq_async(&c->journal, as->journal_seq, cl);
err:
@@ -679,7 +681,7 @@ retry:
if (!six_trylock_read(&b->lock)) {
mutex_unlock(&c->btree_interior_update_lock);
- six_lock_read(&b->lock);
+ btree_node_lock_type(c, b, SIX_LOCK_read);
six_unlock_read(&b->lock);
goto retry;
}
@@ -720,7 +722,7 @@ retry:
if (!six_trylock_read(&b->lock)) {
mutex_unlock(&c->btree_interior_update_lock);
- six_lock_read(&b->lock);
+ btree_node_lock_type(c, b, SIX_LOCK_read);
six_unlock_read(&b->lock);
goto retry;
}
@@ -1456,7 +1458,7 @@ static void btree_split(struct btree_update *as, struct btree *b,
bch2_btree_iter_node_replace(iter, n2);
bch2_btree_iter_node_replace(iter, n1);
- bch2_time_stats_update(&c->btree_split_time, start_time);
+ bch2_time_stats_update(&c->times[BCH_TIME_btree_split], start_time);
}
static void
@@ -1795,8 +1797,8 @@ static int __btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter,
bch2_btree_node_write(c, n, SIX_LOCK_intent);
if (parent) {
- bch2_btree_insert_node(as, parent, iter,
- &keylist_single(&n->key));
+ bch2_keylist_add(&as->parent_keys, &n->key);
+ bch2_btree_insert_node(as, parent, iter, &as->parent_keys);
} else {
bch2_btree_set_root(as, n, iter);
}
diff --git a/libbcachefs/btree_update_interior.h b/libbcachefs/btree_update_interior.h
index 3e66d69e..25bfc7ab 100644
--- a/libbcachefs/btree_update_interior.h
+++ b/libbcachefs/btree_update_interior.h
@@ -226,11 +226,30 @@ static inline bool bset_unwritten(struct btree *b, struct bset *i)
return (void *) i > write_block(b);
}
-static inline unsigned bset_end_sector(struct bch_fs *c, struct btree *b,
- struct bset *i)
+static inline ssize_t __bch_btree_u64s_remaining(struct bch_fs *c,
+ struct btree *b,
+ void *end)
{
- return round_up(bset_byte_offset(b, vstruct_end(i)),
- block_bytes(c)) >> 9;
+ ssize_t used = bset_byte_offset(b, end) / sizeof(u64) +
+ b->whiteout_u64s +
+ b->uncompacted_whiteout_u64s;
+ ssize_t total = c->opts.btree_node_size << 6;
+
+ return total - used;
+}
+
+static inline size_t bch_btree_keys_u64s_remaining(struct bch_fs *c,
+ struct btree *b)
+{
+ ssize_t remaining = __bch_btree_u64s_remaining(c, b,
+ btree_bkey_last(b, bset_tree_last(b)));
+
+ BUG_ON(remaining < 0);
+
+ if (bset_written(b, btree_bset_last(b)))
+ return 0;
+
+ return remaining;
}
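
btree_node_size is in 512-byte sectors, so the << 6 above converts sectors to u64s (512 / 8 == 64); restated as a compile-time check (C11 syntax):

_Static_assert(512 / sizeof(u64) == 64, "one sector holds 64 u64s");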
static inline unsigned btree_write_set_buffer(struct btree *b)
@@ -246,20 +265,19 @@ static inline struct btree_node_entry *want_new_bset(struct bch_fs *c,
struct btree *b)
{
struct bset *i = btree_bset_last(b);
- unsigned offset = max_t(unsigned, b->written << 9,
- bset_byte_offset(b, vstruct_end(i)));
- ssize_t remaining_space = (ssize_t) btree_bytes(c) - (ssize_t)
- (offset + sizeof(struct btree_node_entry) +
- b->whiteout_u64s * sizeof(u64) +
- b->uncompacted_whiteout_u64s * sizeof(u64));
-
- EBUG_ON(offset > btree_bytes(c));
-
- if ((unlikely(bset_written(b, i)) &&
- remaining_space > block_bytes(c)) ||
- (unlikely(vstruct_bytes(i) > btree_write_set_buffer(b)) &&
- remaining_space > btree_write_set_buffer(b)))
- return (void *) b->data + offset;
+ struct btree_node_entry *bne = max(write_block(b),
+ (void *) btree_bkey_last(b, bset_tree_last(b)));
+ ssize_t remaining_space =
+ __bch_btree_u64s_remaining(c, b, &bne->keys.start[0]);
+
+ if (unlikely(bset_written(b, i))) {
+ if (remaining_space > (ssize_t) (block_bytes(c) >> 3))
+ return bne;
+ } else {
+ if (unlikely(vstruct_bytes(i) > btree_write_set_buffer(b)) &&
+ remaining_space > (ssize_t) (btree_write_set_buffer(b) >> 3))
+ return bne;
+ }
return NULL;
}
@@ -285,23 +303,6 @@ static inline void reserve_whiteout(struct btree *b, struct bset_tree *t,
}
}
-static inline size_t bch_btree_keys_u64s_remaining(struct bch_fs *c,
- struct btree *b)
-{
- struct bset *i = btree_bset_last(b);
- unsigned used = bset_byte_offset(b, vstruct_end(i)) / sizeof(u64) +
- b->whiteout_u64s +
- b->uncompacted_whiteout_u64s;
- unsigned total = c->opts.btree_node_size << 6;
-
- EBUG_ON(used > total);
-
- if (bset_written(b, i))
- return 0;
-
- return total - used;
-}
-
/*
* write lock must be held on @b (else the dirty bset that we were going to
* insert into could be written out from under us)
diff --git a/libbcachefs/btree_update_leaf.c b/libbcachefs/btree_update_leaf.c
index 92fb5f61..cc41140f 100644
--- a/libbcachefs/btree_update_leaf.c
+++ b/libbcachefs/btree_update_leaf.c
@@ -108,7 +108,7 @@ static void __btree_node_flush(struct journal *j, struct journal_entry_pin *pin,
struct btree_write *w = container_of(pin, struct btree_write, journal);
struct btree *b = container_of(w, struct btree, writes[i]);
- six_lock_read(&b->lock);
+ btree_node_lock_type(c, b, SIX_LOCK_read);
bch2_btree_node_write_cond(c, b,
(btree_current_write(b) == w &&
w->journal.pin_list == journal_seq_pin(j, seq)));
diff --git a/libbcachefs/buckets.c b/libbcachefs/buckets.c
index 1f944cb8..5dda22c7 100644
--- a/libbcachefs/buckets.c
+++ b/libbcachefs/buckets.c
@@ -555,9 +555,9 @@ static void bch2_mark_pointer(struct bch_fs *c,
return;
}
- v = READ_ONCE(g->_mark.counter);
+ v = atomic64_read(&g->_mark.v);
do {
- new.counter = old.counter = v;
+ new.v.counter = old.v.counter = v;
saturated = 0;
/*
@@ -600,9 +600,9 @@ static void bch2_mark_pointer(struct bch_fs *c,
g->_mark = new;
break;
}
- } while ((v = cmpxchg(&g->_mark.counter,
- old.counter,
- new.counter)) != old.counter);
+ } while ((v = atomic64_cmpxchg(&g->_mark.v,
+ old.v.counter,
+ new.v.counter)) != old.v.counter);
bch2_dev_usage_update(c, ca, old, new);
@@ -957,7 +957,8 @@ void bch2_dev_buckets_free(struct bch_dev *ca)
kvpfree(ca->buckets_dirty,
BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long));
kvpfree(ca->oldest_gens, ca->mi.nbuckets * sizeof(u8));
- kvpfree(ca->buckets, sizeof(struct bucket_array) +
+ kvpfree(rcu_dereference_protected(ca->buckets, 1),
+ sizeof(struct bucket_array) +
ca->mi.nbuckets * sizeof(struct bucket));
free_percpu(ca->usage_percpu);
diff --git a/libbcachefs/buckets.h b/libbcachefs/buckets.h
index 01f0b314..aefe6027 100644
--- a/libbcachefs/buckets.h
+++ b/libbcachefs/buckets.h
@@ -16,15 +16,15 @@
#define bucket_cmpxchg(g, new, expr) \
({ \
- u64 _v = READ_ONCE((g)->_mark.counter); \
+ u64 _v = atomic64_read(&(g)->_mark.v); \
struct bucket_mark _old; \
\
do { \
- (new).counter = _old.counter = _v; \
+ (new).v.counter = _old.v.counter = _v; \
expr; \
- } while ((_v = cmpxchg(&(g)->_mark.counter, \
- _old.counter, \
- (new).counter)) != _old.counter);\
+ } while ((_v = atomic64_cmpxchg(&(g)->_mark.v, \
+ _old.v.counter, \
+ (new).v.counter)) != _old.v.counter);\
_old; \
})
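
Typical use of the macro above: expr edits the new copy and the cmpxchg loop publishes it, retrying on contention. A sketch, assuming the gen field of struct bucket_mark:

struct bucket_mark old, new;

/* bump the bucket's generation atomically: */
old = bucket_cmpxchg(g, new, new.gen++);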
diff --git a/libbcachefs/buckets_types.h b/libbcachefs/buckets_types.h
index 28bd2c59..10f00861 100644
--- a/libbcachefs/buckets_types.h
+++ b/libbcachefs/buckets_types.h
@@ -6,7 +6,7 @@
struct bucket_mark {
union {
struct {
- u64 counter;
+ atomic64_t v;
};
struct {
diff --git a/libbcachefs/chardev.c b/libbcachefs/chardev.c
index 8403bae6..5593b9a1 100644
--- a/libbcachefs/chardev.c
+++ b/libbcachefs/chardev.c
@@ -54,6 +54,7 @@ static struct bch_dev *bch2_device_lookup(struct bch_fs *c, u64 dev,
return ca;
}
+#if 0
static long bch2_ioctl_assemble(struct bch_ioctl_assemble __user *user_arg)
{
struct bch_ioctl_assemble arg;
@@ -127,14 +128,17 @@ static long bch2_ioctl_incremental(struct bch_ioctl_incremental __user *user_arg
return 0;
}
+#endif
static long bch2_global_ioctl(unsigned cmd, void __user *arg)
{
switch (cmd) {
+#if 0
case BCH_IOCTL_ASSEMBLE:
return bch2_ioctl_assemble(arg);
case BCH_IOCTL_INCREMENTAL:
return bch2_ioctl_incremental(arg);
+#endif
default:
return -ENOTTY;
}
@@ -148,6 +152,7 @@ static long bch2_ioctl_query_uuid(struct bch_fs *c,
sizeof(c->sb.user_uuid));
}
+#if 0
static long bch2_ioctl_start(struct bch_fs *c, struct bch_ioctl_start arg)
{
if (arg.flags || arg.pad)
@@ -161,6 +166,7 @@ static long bch2_ioctl_stop(struct bch_fs *c)
bch2_fs_stop(c);
return 0;
}
+#endif
static long bch2_ioctl_disk_add(struct bch_fs *c, struct bch_ioctl_disk arg)
{
@@ -294,18 +300,19 @@ static ssize_t bch2_data_job_read(struct file *file, char __user *buf,
{
struct bch_data_ctx *ctx = file->private_data;
struct bch_fs *c = ctx->c;
- struct bch_ioctl_data_progress p = {
- .data_type = ctx->stats.data_type,
- .btree_id = ctx->stats.iter.btree_id,
- .pos = ctx->stats.iter.pos,
- .sectors_done = atomic64_read(&ctx->stats.sectors_seen),
- .sectors_total = bch2_fs_sectors_used(c, bch2_fs_usage_read(c)),
+ struct bch_ioctl_data_event e = {
+ .type = BCH_DATA_EVENT_PROGRESS,
+ .p.data_type = ctx->stats.data_type,
+ .p.btree_id = ctx->stats.iter.btree_id,
+ .p.pos = ctx->stats.iter.pos,
+ .p.sectors_done = atomic64_read(&ctx->stats.sectors_seen),
+ .p.sectors_total = bch2_fs_sectors_used(c, bch2_fs_usage_read(c)),
};
- if (len != sizeof(p))
+ if (len < sizeof(e))
return -EINVAL;
- return copy_to_user(buf, &p, sizeof(p)) ?: sizeof(p);
+ return copy_to_user(buf, &e, sizeof(e)) ?: sizeof(e);
}
static const struct file_operations bcachefs_data_ops = {
@@ -419,7 +426,7 @@ static long bch2_ioctl_usage(struct bch_fs *c,
if (ca->dev_idx >= arg.nr_devices) {
percpu_ref_put(&ca->ref);
- return -ENOSPC;
+ return -ERANGE;
}
if (percpu_ref_tryget(&ca->io_ref)) {
@@ -539,10 +546,12 @@ long bch2_fs_ioctl(struct bch_fs *c, unsigned cmd, void __user *arg)
return -EPERM;
switch (cmd) {
+#if 0
case BCH_IOCTL_START:
BCH_IOCTL(start, struct bch_ioctl_start);
case BCH_IOCTL_STOP:
return bch2_ioctl_stop(c);
+#endif
case BCH_IOCTL_READ_SUPER:
BCH_IOCTL(read_super, struct bch_ioctl_read_super);
case BCH_IOCTL_DISK_GET_IDX:
diff --git a/libbcachefs/checksum.c b/libbcachefs/checksum.c
index 6d8543eb..28d086bc 100644
--- a/libbcachefs/checksum.c
+++ b/libbcachefs/checksum.c
@@ -421,7 +421,7 @@ static struct bch_csum bch2_checksum_merge(unsigned type,
BUG_ON(!bch2_checksum_mergeable(type));
while (b_len) {
- unsigned b = min(b_len, PAGE_SIZE);
+ unsigned b = min_t(unsigned, b_len, PAGE_SIZE);
a.lo = bch2_checksum_update(type, a.lo,
page_address(ZERO_PAGE(0)), b);
diff --git a/libbcachefs/clock.c b/libbcachefs/clock.c
index 650be8ce..c67376f9 100644
--- a/libbcachefs/clock.c
+++ b/libbcachefs/clock.c
@@ -42,7 +42,8 @@ void bch2_io_timer_del(struct io_clock *clock, struct io_timer *timer)
}
struct io_clock_wait {
- struct io_timer timer;
+ struct io_timer io_timer;
+ struct timer_list cpu_timer;
struct task_struct *task;
int expired;
};
@@ -50,7 +51,16 @@ struct io_clock_wait {
static void io_clock_wait_fn(struct io_timer *timer)
{
struct io_clock_wait *wait = container_of(timer,
- struct io_clock_wait, timer);
+ struct io_clock_wait, io_timer);
+
+ wait->expired = 1;
+ wake_up_process(wait->task);
+}
+
+static void io_clock_cpu_timeout(struct timer_list *timer)
+{
+ struct io_clock_wait *wait = container_of(timer,
+ struct io_clock_wait, cpu_timer);
wait->expired = 1;
wake_up_process(wait->task);
@@ -61,35 +71,38 @@ void bch2_io_clock_schedule_timeout(struct io_clock *clock, unsigned long until)
struct io_clock_wait wait;
/* XXX: calculate sleep time rigorously */
- wait.timer.expire = until;
- wait.timer.fn = io_clock_wait_fn;
+ wait.io_timer.expire = until;
+ wait.io_timer.fn = io_clock_wait_fn;
wait.task = current;
wait.expired = 0;
- bch2_io_timer_add(clock, &wait.timer);
+ bch2_io_timer_add(clock, &wait.io_timer);
schedule();
- bch2_io_timer_del(clock, &wait.timer);
+ bch2_io_timer_del(clock, &wait.io_timer);
}
-/*
- * _only_ to be used from a kthread
- */
void bch2_kthread_io_clock_wait(struct io_clock *clock,
- unsigned long until)
+ unsigned long io_until,
+ unsigned long cpu_timeout)
{
+ bool kthread = (current->flags & PF_KTHREAD) != 0;
struct io_clock_wait wait;
- /* XXX: calculate sleep time rigorously */
- wait.timer.expire = until;
- wait.timer.fn = io_clock_wait_fn;
+ wait.io_timer.expire = io_until;
+ wait.io_timer.fn = io_clock_wait_fn;
wait.task = current;
wait.expired = 0;
- bch2_io_timer_add(clock, &wait.timer);
+ bch2_io_timer_add(clock, &wait.io_timer);
+
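+	/* also bound the wait in cpu time, with an ordinary timer: */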
+ timer_setup_on_stack(&wait.cpu_timer, io_clock_cpu_timeout, 0);
+
+ if (cpu_timeout != MAX_SCHEDULE_TIMEOUT)
+ mod_timer(&wait.cpu_timer, cpu_timeout + jiffies);
while (1) {
set_current_state(TASK_INTERRUPTIBLE);
- if (kthread_should_stop())
+ if (kthread && kthread_should_stop())
break;
if (wait.expired)
@@ -100,7 +113,9 @@ void bch2_kthread_io_clock_wait(struct io_clock *clock,
}
__set_current_state(TASK_RUNNING);
- bch2_io_timer_del(clock, &wait.timer);
+ del_singleshot_timer_sync(&wait.cpu_timer);
+ destroy_timer_on_stack(&wait.cpu_timer);
+ bch2_io_timer_del(clock, &wait.io_timer);
}
static struct io_timer *get_expired_timer(struct io_clock *clock,
diff --git a/libbcachefs/clock.h b/libbcachefs/clock.h
index af6b2b39..1e2a7dea 100644
--- a/libbcachefs/clock.h
+++ b/libbcachefs/clock.h
@@ -3,7 +3,8 @@
void bch2_io_timer_add(struct io_clock *, struct io_timer *);
void bch2_io_timer_del(struct io_clock *, struct io_timer *);
-void bch2_kthread_io_clock_wait(struct io_clock *, unsigned long);
+void bch2_kthread_io_clock_wait(struct io_clock *, unsigned long,
+ unsigned long);
void bch2_increment_clock(struct bch_fs *, unsigned, int);
void bch2_io_clock_schedule_timeout(struct io_clock *, unsigned long);
diff --git a/libbcachefs/compress.c b/libbcachefs/compress.c
index 1af62621..6379905b 100644
--- a/libbcachefs/compress.c
+++ b/libbcachefs/compress.c
@@ -480,7 +480,7 @@ static const unsigned bch2_compression_opt_to_feature[] = {
#undef BCH_FEATURE_NONE
-int __bch2_check_set_has_compressed_data(struct bch_fs *c, u64 f)
+static int __bch2_check_set_has_compressed_data(struct bch_fs *c, u64 f)
{
int ret = 0;
@@ -529,26 +529,6 @@ void bch2_fs_compress_exit(struct bch_fs *c)
mempool_exit(&c->compression_bounce[READ]);
}
-static void *mempool_kvpmalloc(gfp_t gfp_mask, void *pool_data)
-{
- size_t size = (size_t)pool_data;
- return kvpmalloc(size, gfp_mask);
-}
-
-void mempool_kvpfree(void *element, void *pool_data)
-{
- size_t size = (size_t)pool_data;
- kvpfree(element, size);
-}
-
-static int mempool_init_kvpmalloc_pool(mempool_t *pool, int min_nr, size_t size)
-{
- return !mempool_initialized(pool)
- ? mempool_init(pool, min_nr, mempool_kvpmalloc,
- mempool_kvpfree, (void *) size)
- : 0;
-}
-
static int __bch2_fs_compress_init(struct bch_fs *c, u64 features)
{
size_t max_extent = c->sb.encoded_extent_max << 9;
@@ -611,6 +591,9 @@ have_compressed:
if (i->decompress_workspace)
decompress_workspace_needed = true;
+ if (mempool_initialized(&c->compress_workspace[i->type]))
+ continue;
+
ret = mempool_init_kvpmalloc_pool(
&c->compress_workspace[i->type],
1, i->compress_workspace);
diff --git a/libbcachefs/disk_groups.c b/libbcachefs/disk_groups.c
index c129a33e..cd200cbe 100644
--- a/libbcachefs/disk_groups.c
+++ b/libbcachefs/disk_groups.c
@@ -16,8 +16,8 @@ static int group_cmp(const void *_l, const void *_r)
strncmp(l->label, r->label, sizeof(l->label));
}
-const char *bch2_sb_disk_groups_validate(struct bch_sb *sb,
- struct bch_sb_field *f)
+static const char *bch2_sb_disk_groups_validate(struct bch_sb *sb,
+ struct bch_sb_field *f)
{
struct bch_sb_field_disk_groups *groups =
field_to_type(f, disk_groups);
@@ -162,7 +162,8 @@ int bch2_sb_disk_groups_to_cpu(struct bch_fs *c)
}
}
- old_g = c->disk_groups;
+ old_g = rcu_dereference_protected(c->disk_groups,
+ lockdep_is_held(&c->sb_lock));
rcu_assign_pointer(c->disk_groups, cpu_g);
if (old_g)
kfree_rcu(old_g, rcu);
@@ -193,6 +194,36 @@ const struct bch_devs_mask *bch2_target_to_mask(struct bch_fs *c, unsigned targe
}
}
+bool bch2_dev_in_target(struct bch_fs *c, unsigned dev, unsigned target)
+{
+ struct target t = target_decode(target);
+
+ switch (t.type) {
+ case TARGET_NULL:
+ return false;
+ case TARGET_DEV:
+ return dev == t.dev;
+ case TARGET_GROUP: {
+ struct bch_disk_groups_cpu *g;
+ const struct bch_devs_mask *m;
+ bool ret;
+
+ rcu_read_lock();
+ g = rcu_dereference(c->disk_groups);
+ m = t.group < g->nr && !g->entries[t.group].deleted
+ ? &g->entries[t.group].devs
+ : NULL;
+
+ ret = m ? test_bit(dev, m->d) : false;
+ rcu_read_unlock();
+
+ return ret;
+ }
+ default:
+ BUG();
+ }
+}
+
static int __bch2_disk_group_find(struct bch_sb_field_disk_groups *groups,
unsigned parent,
const char *name, unsigned namelen)
diff --git a/libbcachefs/disk_groups.h b/libbcachefs/disk_groups.h
index 9da9805a..e92c0dc5 100644
--- a/libbcachefs/disk_groups.h
+++ b/libbcachefs/disk_groups.h
@@ -53,34 +53,8 @@ static inline struct target target_decode(unsigned target)
return (struct target) { .type = TARGET_NULL };
}
-static inline bool dev_in_target(struct bch_dev *ca, unsigned target)
-{
- struct target t = target_decode(target);
-
- switch (t.type) {
- case TARGET_NULL:
- return false;
- case TARGET_DEV:
- return ca->dev_idx == t.dev;
- case TARGET_GROUP:
- return ca->mi.group && ca->mi.group - 1 == t.group;
- default:
- BUG();
- }
-}
-
-static inline bool dev_idx_in_target(struct bch_fs *c, unsigned dev, unsigned target)
-{
- bool ret;
-
- rcu_read_lock();
- ret = dev_in_target(rcu_dereference(c->devs[dev]), target);
- rcu_read_unlock();
-
- return ret;
-}
-
const struct bch_devs_mask *bch2_target_to_mask(struct bch_fs *, unsigned);
+bool bch2_dev_in_target(struct bch_fs *, unsigned, unsigned);
int bch2_disk_path_find(struct bch_sb_handle *, const char *);
int bch2_disk_path_find_or_create(struct bch_sb_handle *, const char *);
diff --git a/libbcachefs/extents.c b/libbcachefs/extents.c
index 9efaa1ff..b85af711 100644
--- a/libbcachefs/extents.c
+++ b/libbcachefs/extents.c
@@ -144,7 +144,7 @@ bch2_extent_has_group(struct bch_fs *c, struct bkey_s_c_extent e, unsigned group
const struct bch_extent_ptr *ptr;
extent_for_each_ptr(e, ptr) {
- struct bch_dev *ca = c->devs[ptr->dev];
+ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
if (ca->mi.group &&
ca->mi.group - 1 == group)
@@ -159,13 +159,11 @@ bch2_extent_has_target(struct bch_fs *c, struct bkey_s_c_extent e, unsigned targ
{
const struct bch_extent_ptr *ptr;
- extent_for_each_ptr(e, ptr) {
- struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
-
- if (dev_in_target(ca, target) &&
- (!ptr->cached || !ptr_stale(ca, ptr)))
+ extent_for_each_ptr(e, ptr)
+ if (bch2_dev_in_target(c, ptr->dev, target) &&
+ (!ptr->cached ||
+ !ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr)))
return ptr;
- }
return NULL;
}
@@ -732,7 +730,7 @@ err:
bch2_fs_bug(c, "%s btree pointer %s: bucket %zi "
"gen %i mark %08x",
err, buf, PTR_BUCKET_NR(ca, ptr),
- mark.gen, (unsigned) mark.counter);
+ mark.gen, (unsigned) mark.v.counter);
}
void bch2_btree_ptr_to_text(struct bch_fs *c, char *buf,
@@ -2024,7 +2022,7 @@ void bch2_extent_mark_replicas_cached(struct bch_fs *c,
int n = bch2_extent_ptr_durability(c, ptr);
if (n && n <= extra &&
- !dev_in_target(c->devs[ptr->dev], target)) {
+ !bch2_dev_in_target(c, ptr->dev, target)) {
ptr->cached = true;
extra -= n;
}
diff --git a/libbcachefs/extents.h b/libbcachefs/extents.h
index 338e9e01..08ad9647 100644
--- a/libbcachefs/extents.h
+++ b/libbcachefs/extents.h
@@ -278,24 +278,38 @@ bch2_extent_crc_unpack(const struct bkey *k, const union bch_extent_crc *crc)
.uncompressed_size = k->size,
.live_size = k->size,
};
- case BCH_EXTENT_CRC32:
- return (struct bch_extent_crc_unpacked) {
+ case BCH_EXTENT_CRC32: {
+ struct bch_extent_crc_unpacked ret = (struct bch_extent_crc_unpacked) {
common_fields(crc->crc32),
- .csum.lo = (__force __le64) crc->crc32.csum,
};
- case BCH_EXTENT_CRC64:
- return (struct bch_extent_crc_unpacked) {
+
+ *((__le32 *) &ret.csum.lo) = crc->crc32.csum;
+
+ memcpy(&ret.csum.lo, &crc->crc32.csum,
+ sizeof(crc->crc32.csum));
+
+ return ret;
+ }
+ case BCH_EXTENT_CRC64: {
+ struct bch_extent_crc_unpacked ret = (struct bch_extent_crc_unpacked) {
common_fields(crc->crc64),
.nonce = crc->crc64.nonce,
.csum.lo = (__force __le64) crc->crc64.csum_lo,
- .csum.hi = (__force __le64) crc->crc64.csum_hi,
};
- case BCH_EXTENT_CRC128:
- return (struct bch_extent_crc_unpacked) {
+
+ *((__le16 *) &ret.csum.hi) = crc->crc64.csum_hi;
+
+ return ret;
+ }
+ case BCH_EXTENT_CRC128: {
+ struct bch_extent_crc_unpacked ret = (struct bch_extent_crc_unpacked) {
common_fields(crc->crc128),
.nonce = crc->crc128.nonce,
.csum = crc->crc128.csum,
};
+
+ return ret;
+ }
default:
BUG();
}
diff --git a/libbcachefs/fs-io.c b/libbcachefs/fs-io.c
index a2455b42..1d9464af 100644
--- a/libbcachefs/fs-io.c
+++ b/libbcachefs/fs-io.c
@@ -678,7 +678,7 @@ static void bch2_clear_page_bits(struct page *page)
if (!PagePrivate(page))
return;
- s = xchg(page_state(page), (struct bch_page_state) { .v = 0 });
+ s.v = xchg(&page_state(page)->v, 0);
ClearPagePrivate(page);
if (s.dirty_sectors)
@@ -1020,12 +1020,12 @@ static void bchfs_read(struct bch_fs *c, struct btree_iter *iter,
if (bkey_extent_is_data(k.k)) {
struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
- const struct bch_extent_ptr *ptr;
struct bch_extent_crc_unpacked crc;
+ const union bch_extent_entry *i;
- extent_for_each_ptr_crc(e, ptr, crc)
- want_full_extent |= !!crc.csum_type |
- !!crc.compression_type;
+ extent_for_each_crc(e, crc, i)
+ want_full_extent |= ((crc.csum_type != 0) |
+ (crc.compression_type != 0));
}
readpage_bio_extend(readpages_iter,
@@ -1850,8 +1850,7 @@ err_wait_io:
dio->loop = true;
if (!dio->sync) {
- continue_at_noreturn(&dio->cl,
- bch2_dio_write_loop_async, NULL);
+ continue_at(&dio->cl, bch2_dio_write_loop_async, NULL);
return -EIOCBQUEUED;
}
diff --git a/libbcachefs/fsck.c b/libbcachefs/fsck.c
index 2991a0dd..c554a987 100644
--- a/libbcachefs/fsck.c
+++ b/libbcachefs/fsck.c
@@ -610,9 +610,10 @@ static inline bool inode_bitmap_test(struct inode_bitmap *b, size_t nr)
static inline int inode_bitmap_set(struct inode_bitmap *b, size_t nr)
{
if (nr >= b->size) {
- size_t new_size = max(max(PAGE_SIZE * 8,
- b->size * 2),
- nr + 1);
+ size_t new_size = max_t(size_t, max_t(size_t,
+ PAGE_SIZE * 8,
+ b->size * 2),
+ nr + 1);
void *n;
new_size = roundup_pow_of_two(new_size);
@@ -642,7 +643,7 @@ struct pathbuf {
static int path_down(struct pathbuf *p, u64 inum)
{
if (p->nr == p->size) {
- size_t new_size = max(256UL, p->size * 2);
+ size_t new_size = max_t(size_t, 256UL, p->size * 2);
void *n = krealloc(p->entries,
new_size * sizeof(p->entries[0]),
GFP_KERNEL);
diff --git a/libbcachefs/io.c b/libbcachefs/io.c
index bb656522..3762fb92 100644
--- a/libbcachefs/io.c
+++ b/libbcachefs/io.c
@@ -21,10 +21,10 @@
#include "journal.h"
#include "keylist.h"
#include "move.h"
+#include "rebalance.h"
#include "replicas.h"
#include "super.h"
#include "super-io.h"
-#include "tier.h"
#include <linux/blkdev.h>
#include <linux/random.h>
@@ -269,7 +269,7 @@ static void bch2_write_done(struct closure *cl)
percpu_ref_put(&c->writes);
bch2_keylist_free(&op->insert_keys, op->inline_keys);
- bch2_time_stats_update(&c->data_write_time, op->start_time);
+ bch2_time_stats_update(&c->times[BCH_TIME_data_write], op->start_time);
closure_return(cl);
}
@@ -842,20 +842,24 @@ again:
} while (ret);
continue_at(cl, bch2_write_index, index_update_wq(op));
+ return;
err:
op->error = ret;
continue_at(cl, !bch2_keylist_empty(&op->insert_keys)
? bch2_write_index
: bch2_write_done, index_update_wq(op));
+ return;
flush_io:
closure_sync(cl);
if (!bch2_keylist_empty(&op->insert_keys)) {
__bch2_write_index(op);
- if (op->error)
+ if (op->error) {
continue_at_nobarrier(cl, bch2_write_done, NULL);
+ return;
+ }
}
goto again;
@@ -901,6 +905,7 @@ void bch2_write(struct closure *cl)
if (!(op->flags & BCH_WRITE_NOPUT_RESERVATION))
bch2_disk_reservation_put(c, &op->res);
closure_return(cl);
+ return;
}
bch2_increment_clock(c, bio_sectors(&op->wbio.bio), WRITE);
@@ -974,7 +979,8 @@ static void promote_done(struct closure *cl)
container_of(cl, struct promote_op, cl);
struct bch_fs *c = op->write.op.c;
- bch2_time_stats_update(&c->data_promote_time, op->start_time);
+ bch2_time_stats_update(&c->times[BCH_TIME_data_promote],
+ op->start_time);
bch2_bio_free_pages_pool(c, &op->write.op.wbio.bio);
promote_free(c, op);
@@ -1048,7 +1054,7 @@ static struct promote_op *__promote_alloc(struct bch_fs *c,
(*rbio)->bio.bi_iter.bi_size = rbio_sectors << 9;
bch2_bio_map(&(*rbio)->bio, NULL);
- if (bio_alloc_pages(&(*rbio)->bio, GFP_NOIO))
+ if (bch2_bio_alloc_pages(&(*rbio)->bio, GFP_NOIO))
goto err;
(*rbio)->bounce = true;
@@ -1174,7 +1180,8 @@ static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio)
static void bch2_rbio_done(struct bch_read_bio *rbio)
{
- bch2_time_stats_update(&rbio->c->data_read_time, rbio->start_time);
+ bch2_time_stats_update(&rbio->c->times[BCH_TIME_data_read],
+ rbio->start_time);
bio_endio(&rbio->bio);
}
@@ -1486,7 +1493,7 @@ csum_err:
}
bch2_dev_io_error(ca,
- "data checksum error, inode %llu offset %llu: expected %0llx%0llx got %0llx%0llx (type %u)",
+ "data checksum error, inode %llu offset %llu: expected %0llx:%0llx got %0llx:%0llx (type %u)",
rbio->pos.inode, (u64) rbio->bvec_iter.bi_sector,
rbio->pick.crc.csum.hi, rbio->pick.crc.csum.lo,
csum.hi, csum.lo, crc.csum_type);
diff --git a/libbcachefs/journal.h b/libbcachefs/journal.h
index 4cec7bb5..6759810b 100644
--- a/libbcachefs/journal.h
+++ b/libbcachefs/journal.h
@@ -365,6 +365,8 @@ static inline void bch2_journal_set_replay_done(struct journal *j)
ssize_t bch2_journal_print_debug(struct journal *, char *);
ssize_t bch2_journal_print_pins(struct journal *, char *);
+int bch2_set_nr_journal_buckets(struct bch_fs *, struct bch_dev *,
+ unsigned nr);
int bch2_dev_journal_alloc(struct bch_dev *);
void bch2_dev_journal_stop(struct journal *, struct bch_dev *);
diff --git a/libbcachefs/journal_io.c b/libbcachefs/journal_io.c
index 2fd0d646..36ba6a4d 100644
--- a/libbcachefs/journal_io.c
+++ b/libbcachefs/journal_io.c
@@ -324,7 +324,7 @@ struct jset_entry_ops {
struct jset_entry *, int);
};
-const struct jset_entry_ops bch2_jset_entry_ops[] = {
+static const struct jset_entry_ops bch2_jset_entry_ops[] = {
#define x(f, nr) \
[BCH_JSET_ENTRY_##f] = (struct jset_entry_ops) { \
.validate = journal_entry_validate_##f, \
@@ -696,6 +696,7 @@ out:
kvpfree(buf.data, buf.size);
percpu_ref_put(&ca->io_ref);
closure_return(cl);
+ return;
err:
mutex_lock(&jlist->lock);
jlist->ret = ret;
@@ -716,19 +717,6 @@ void bch2_journal_entries_free(struct list_head *list)
}
}
-static inline bool journal_has_keys(struct list_head *list)
-{
- struct journal_replay *i;
- struct jset_entry *entry;
- struct bkey_i *k, *_n;
-
- list_for_each_entry(i, list, list)
- for_each_jset_key(k, _n, entry, &i->j)
- return true;
-
- return false;
-}
-
int bch2_journal_read(struct bch_fs *c, struct list_head *list)
{
struct journal *j = &c->journal;
@@ -737,8 +725,9 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
struct journal_entry_pin_list *p;
struct bch_dev *ca;
u64 cur_seq, end_seq, seq;
- unsigned iter, keys = 0, entries = 0;
- size_t nr;
+ unsigned iter;
+ size_t entries = 0;
+ u64 nr, keys = 0;
bool degraded = false;
int ret = 0;
@@ -772,9 +761,6 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
return BCH_FSCK_REPAIR_IMPOSSIBLE;
}
- fsck_err_on(c->sb.clean && journal_has_keys(list), c,
- "filesystem marked clean but journal has keys to replay");
-
list_for_each_entry(i, list, list) {
ret = jset_validate_entries(c, &i->j, READ);
if (ret)
@@ -797,15 +783,27 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
}
}
+ list_for_each_entry(i, list, list) {
+ struct jset_entry *entry;
+ struct bkey_i *k, *_n;
+
+ for_each_jset_key(k, _n, entry, &i->j)
+ keys++;
+ }
+
i = list_last_entry(list, struct journal_replay, list);
nr = le64_to_cpu(i->j.seq) - le64_to_cpu(i->j.last_seq) + 1;
+ fsck_err_on(c->sb.clean && (keys || nr > 1), c,
+ "filesystem marked clean but journal not empty (%llu keys in %llu entries)",
+ keys, nr);
+
if (nr > j->pin.size) {
free_fifo(&j->pin);
init_fifo(&j->pin, roundup_pow_of_two(nr), GFP_KERNEL);
if (!j->pin.data) {
- bch_err(c, "error reallocating journal fifo (%zu open entries)", nr);
+ bch_err(c, "error reallocating journal fifo (%llu open entries)", nr);
return -ENOMEM;
}
}
@@ -844,8 +842,6 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
struct journal_replay, list)->j.seq);
list_for_each_entry(i, list, list) {
- struct jset_entry *entry;
- struct bkey_i *k, *_n;
bool blacklisted;
mutex_lock(&j->blacklist_lock);
@@ -867,13 +863,10 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
journal_last_seq(j), end_seq);
cur_seq = le64_to_cpu(i->j.seq) + 1;
-
- for_each_jset_key(k, _n, entry, &i->j)
- keys++;
entries++;
}
- bch_info(c, "journal read done, %i keys in %i entries, seq %llu",
+ bch_info(c, "journal read done, %llu keys in %zu entries, seq %llu",
keys, entries, journal_cur_seq(j));
fsck_err:
return ret;
@@ -1361,6 +1354,7 @@ void bch2_journal_write(struct closure *cl)
bch_err(c, "Unable to allocate journal write");
bch2_fatal_error(c);
continue_at(cl, journal_write_done, system_highpri_wq);
+ return;
}
/*
@@ -1417,6 +1411,7 @@ no_io:
ptr->offset += sectors;
continue_at(cl, journal_write_done, system_highpri_wq);
+ return;
err:
bch2_inconsistent_error(c);
continue_at(cl, journal_write_done, system_highpri_wq);
diff --git a/libbcachefs/journal_seq_blacklist.c b/libbcachefs/journal_seq_blacklist.c
index b5301d96..567289e2 100644
--- a/libbcachefs/journal_seq_blacklist.c
+++ b/libbcachefs/journal_seq_blacklist.c
@@ -247,7 +247,7 @@ int bch2_journal_seq_should_ignore(struct bch_fs *c, u64 seq, struct btree *b)
if (!bl->nr_entries ||
is_power_of_2(bl->nr_entries)) {
n = krealloc(bl->entries,
- max(bl->nr_entries * 2, 8UL) * sizeof(*n),
+ max_t(size_t, bl->nr_entries * 2, 8) * sizeof(*n),
GFP_KERNEL);
if (!n) {
ret = -ENOMEM;
diff --git a/libbcachefs/keylist.h b/libbcachefs/keylist.h
index a8c8883b..3106759e 100644
--- a/libbcachefs/keylist.h
+++ b/libbcachefs/keylist.h
@@ -55,9 +55,6 @@ static inline struct bkey_i *bch2_keylist_front(struct keylist *l)
_k != (_keylist)->top; \
_k = bkey_next(_k))
-#define keylist_single(k) \
- ((struct keylist) { .keys = k, .top = bkey_next(k) })
-
static inline u64 keylist_sectors(struct keylist *keys)
{
struct bkey_i *k;
diff --git a/libbcachefs/move.c b/libbcachefs/move.c
index 0431fb81..3e52b7a2 100644
--- a/libbcachefs/move.c
+++ b/libbcachefs/move.c
@@ -306,16 +306,16 @@ static void move_write(struct closure *cl)
{
struct moving_io *io = container_of(cl, struct moving_io, cl);
- if (likely(!io->rbio.bio.bi_status &&
- !io->rbio.hole)) {
- bch2_migrate_read_done(&io->write, &io->rbio);
-
- atomic_add(io->write_sectors, &io->write.ctxt->write_sectors);
- closure_call(&io->write.op.cl, bch2_write, NULL, cl);
- continue_at(cl, move_write_done, NULL);
+ if (unlikely(io->rbio.bio.bi_status || io->rbio.hole)) {
+ closure_return_with_destructor(cl, move_free);
+ return;
}
- closure_return_with_destructor(cl, move_free);
+ bch2_migrate_read_done(&io->write, &io->rbio);
+
+ atomic_add(io->write_sectors, &io->write.ctxt->write_sectors);
+ closure_call(&io->write.op.cl, bch2_write, NULL, cl);
+ continue_at(cl, move_write_done, NULL);
}
static inline struct moving_io *next_pending_write(struct moving_context *ctxt)
@@ -411,7 +411,7 @@ static int bch2_move_extent(struct bch_fs *c,
io->write.op.wbio.bio.bi_iter.bi_size = sectors << 9;
bch2_bio_map(&io->write.op.wbio.bio, NULL);
- if (bio_alloc_pages(&io->write.op.wbio.bio, GFP_KERNEL))
+ if (bch2_bio_alloc_pages(&io->write.op.wbio.bio, GFP_KERNEL))
goto err_free;
io->rbio.opts = io_opts;
diff --git a/libbcachefs/move.h b/libbcachefs/move.h
index bc98f94b..bc87e067 100644
--- a/libbcachefs/move.h
+++ b/libbcachefs/move.h
@@ -4,6 +4,7 @@
#include "btree_iter.h"
#include "buckets.h"
#include "io_types.h"
+#include "move_types.h"
struct bch_read_bio;
struct moving_context;
@@ -48,16 +49,6 @@ typedef enum data_cmd (*move_pred_fn)(struct bch_fs *, void *,
enum bkey_type, struct bkey_s_c_extent,
struct bch_io_opts *, struct data_opts *);
-struct bch_move_stats {
- enum bch_data_type data_type;
- struct btree_iter iter;
-
- atomic64_t keys_moved;
- atomic64_t sectors_moved;
- atomic64_t sectors_seen;
- atomic64_t sectors_raced;
-};
-
int bch2_move_data(struct bch_fs *, struct bch_ratelimit *,
struct write_point_specifier,
struct bpos, struct bpos,
diff --git a/libbcachefs/move_types.h b/libbcachefs/move_types.h
new file mode 100644
index 00000000..832542a8
--- /dev/null
+++ b/libbcachefs/move_types.h
@@ -0,0 +1,14 @@
+#ifndef _BCACHEFS_MOVE_TYPES_H
+#define _BCACHEFS_MOVE_TYPES_H
+
+struct bch_move_stats {
+ enum bch_data_type data_type;
+ struct btree_iter iter;
+
+ atomic64_t keys_moved;
+ atomic64_t sectors_moved;
+ atomic64_t sectors_seen;
+ atomic64_t sectors_raced;
+};
+
+#endif /* _BCACHEFS_MOVE_TYPES_H */
diff --git a/libbcachefs/movinggc.c b/libbcachefs/movinggc.c
index 28dabca7..7bef4561 100644
--- a/libbcachefs/movinggc.c
+++ b/libbcachefs/movinggc.c
@@ -241,7 +241,8 @@ static int bch2_copygc_thread(void *arg)
ca->mi.bucket_size;
if (available > reserve) {
next = last + available - reserve;
- bch2_kthread_io_clock_wait(clock, next);
+ bch2_kthread_io_clock_wait(clock, next,
+ MAX_SCHEDULE_TIMEOUT);
continue;
}
@@ -252,7 +253,8 @@ static int bch2_copygc_thread(void *arg)
fragmented = usage.sectors_fragmented;
if (fragmented < reserve) {
next = last + reserve - fragmented;
- bch2_kthread_io_clock_wait(clock, next);
+ bch2_kthread_io_clock_wait(clock, next,
+ MAX_SCHEDULE_TIMEOUT);
continue;
}
diff --git a/libbcachefs/rebalance.c b/libbcachefs/rebalance.c
new file mode 100644
index 00000000..4154b1e9
--- /dev/null
+++ b/libbcachefs/rebalance.c
@@ -0,0 +1,341 @@
+
+#include "bcachefs.h"
+#include "alloc.h"
+#include "btree_iter.h"
+#include "buckets.h"
+#include "clock.h"
+#include "disk_groups.h"
+#include "extents.h"
+#include "io.h"
+#include "move.h"
+#include "rebalance.h"
+#include "super-io.h"
+
+#include <linux/freezer.h>
+#include <linux/kthread.h>
+#include <linux/sched/cputime.h>
+#include <trace/events/bcachefs.h>
+
+static inline bool rebalance_ptr_pred(struct bch_fs *c,
+ const struct bch_extent_ptr *ptr,
+ struct bch_extent_crc_unpacked crc,
+ struct bch_io_opts *io_opts)
+{
+ if (io_opts->background_target &&
+ !bch2_dev_in_target(c, ptr->dev, io_opts->background_target) &&
+ !ptr->cached)
+ return true;
+
+ if (io_opts->background_compression &&
+ crc.compression_type !=
+ bch2_compression_opt_to_type[io_opts->background_compression])
+ return true;
+
+ return false;
+}
+
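+/*
+ * Account pending rebalance work for each pointer of @k that needs moving;
+ * wake the rebalance thread when a device's work counter goes from zero to
+ * nonzero:
+ */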
+void bch2_rebalance_add_key(struct bch_fs *c,
+ struct bkey_s_c k,
+ struct bch_io_opts *io_opts)
+{
+ const struct bch_extent_ptr *ptr;
+ struct bch_extent_crc_unpacked crc;
+ struct bkey_s_c_extent e;
+
+ if (!bkey_extent_is_data(k.k))
+ return;
+
+ if (!io_opts->background_target &&
+ !io_opts->background_compression)
+ return;
+
+ e = bkey_s_c_to_extent(k);
+
+ extent_for_each_ptr_crc(e, ptr, crc)
+ if (rebalance_ptr_pred(c, ptr, crc, io_opts)) {
+ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+
+ if (atomic64_add_return(crc.compressed_size,
+ &ca->rebalance_work) ==
+ crc.compressed_size)
+ rebalance_wakeup(c);
+ }
+}
+
+void bch2_rebalance_add_work(struct bch_fs *c, u64 sectors)
+{
+ if (atomic64_add_return(sectors, &c->rebalance.work_unknown_dev) ==
+ sectors)
+ rebalance_wakeup(c);
+}
+
+static enum data_cmd rebalance_pred(struct bch_fs *c, void *arg,
+ enum bkey_type type,
+ struct bkey_s_c_extent e,
+ struct bch_io_opts *io_opts,
+ struct data_opts *data_opts)
+{
+ const struct bch_extent_ptr *ptr;
+ struct bch_extent_crc_unpacked crc;
+
+ /* Make sure we have room to add a new pointer: */
+ if (bkey_val_u64s(e.k) + BKEY_EXTENT_PTR_U64s_MAX >
+ BKEY_EXTENT_VAL_U64s_MAX)
+ return DATA_SKIP;
+
+ extent_for_each_ptr_crc(e, ptr, crc)
+ if (rebalance_ptr_pred(c, ptr, crc, io_opts))
+ goto found;
+
+ return DATA_SKIP;
+found:
+ data_opts->target = io_opts->background_target;
+ data_opts->btree_insert_flags = 0;
+ return DATA_ADD_REPLICAS;
+}
+
+struct rebalance_work {
+ int dev_most_full_idx;
+ unsigned dev_most_full_percent;
+ u64 dev_most_full_work;
+ u64 dev_most_full_capacity;
+ u64 total_work;
+};
+
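+/*
+ * Add one device's pending work to the running totals, saturating instead
+ * of wrapping on 64 bit overflow and clamping work to the device's capacity:
+ */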
+static void rebalance_work_accumulate(struct rebalance_work *w,
+ u64 dev_work, u64 unknown_dev, u64 capacity, int idx)
+{
+ unsigned percent_full;
+ u64 work = dev_work + unknown_dev;
+
+ if (work < dev_work || work < unknown_dev)
+ work = U64_MAX;
+ work = min(work, capacity);
+
+ percent_full = div_u64(work * 100, capacity);
+
+ if (percent_full >= w->dev_most_full_percent) {
+ w->dev_most_full_idx = idx;
+ w->dev_most_full_percent = percent_full;
+ w->dev_most_full_work = work;
+ w->dev_most_full_capacity = capacity;
+ }
+
+ if (w->total_work + dev_work >= w->total_work &&
+ w->total_work + dev_work >= dev_work)
+ w->total_work += dev_work;
+}
+
+static struct rebalance_work rebalance_work(struct bch_fs *c)
+{
+ struct bch_dev *ca;
+ struct rebalance_work ret = { .dev_most_full_idx = -1 };
+ u64 unknown_dev = atomic64_read(&c->rebalance.work_unknown_dev);
+ unsigned i;
+
+ for_each_online_member(ca, c, i)
+ rebalance_work_accumulate(&ret,
+ atomic64_read(&ca->rebalance_work),
+ unknown_dev,
+ bucket_to_sector(ca, ca->mi.nbuckets -
+ ca->mi.first_bucket),
+ i);
+
+ rebalance_work_accumulate(&ret,
+ unknown_dev, 0, c->capacity, -1);
+
+ return ret;
+}
+
+static void rebalance_work_reset(struct bch_fs *c)
+{
+ struct bch_dev *ca;
+ unsigned i;
+
+ for_each_online_member(ca, c, i)
+ atomic64_set(&ca->rebalance_work, 0);
+
+ atomic64_set(&c->rebalance.work_unknown_dev, 0);
+}
+
+static unsigned long curr_cputime(void)
+{
+ u64 utime, stime;
+
+ task_cputime_adjusted(current, &utime, &stime);
+ return nsecs_to_jiffies(utime + stime);
+}
+
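+/*
+ * Main loop: track wall clock and cpu time per iteration, sleep while
+ * there's no work, throttle cpu usage while the fullest device is under
+ * 20% full, otherwise scale the rate controller and walk all extents via
+ * bch2_move_data():
+ */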
+static int bch2_rebalance_thread(void *arg)
+{
+ struct bch_fs *c = arg;
+ struct bch_fs_rebalance *r = &c->rebalance;
+ struct io_clock *clock = &c->io_clock[WRITE];
+ struct rebalance_work w, p;
+ unsigned long start, prev_start;
+ unsigned long prev_run_time, prev_run_cputime;
+ unsigned long cputime, prev_cputime;
+ unsigned long io_start;
+ long throttle;
+
+ set_freezable();
+
+ io_start = atomic_long_read(&clock->now);
+ p = rebalance_work(c);
+ prev_start = jiffies;
+ prev_cputime = curr_cputime();
+
+ while (!kthread_wait_freezable(r->enabled)) {
+ start = jiffies;
+ cputime = curr_cputime();
+
+ prev_run_time = start - prev_start;
+ prev_run_cputime = cputime - prev_cputime;
+
+ w = rebalance_work(c);
+ BUG_ON(!w.dev_most_full_capacity);
+
+ if (!w.total_work) {
+ r->state = REBALANCE_WAITING;
+ kthread_wait_freezable(rebalance_work(c).total_work);
+ continue;
+ }
+
+ /*
+ * If there isn't much work to do, throttle cpu usage:
+ */
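+		/*
+		 * throttle > 0 when cpu time over the previous run exceeded
+		 * dev_most_full_percent% of wall clock time:
+		 */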
+ throttle = prev_run_cputime * 100 /
+ max(1U, w.dev_most_full_percent) -
+ prev_run_time;
+
+ if (w.dev_most_full_percent < 20 && throttle > 0) {
+ r->state = REBALANCE_THROTTLED;
+ r->throttled_until_iotime = io_start +
+ div_u64(w.dev_most_full_capacity *
+ (20 - w.dev_most_full_percent),
+ 50);
+ r->throttled_until_cputime = start + throttle;
+
+ bch2_kthread_io_clock_wait(clock,
+ r->throttled_until_iotime,
+ throttle);
+ continue;
+ }
+
+ /* minimum 1 mb/sec: */
+ r->pd.rate.rate =
+ max_t(u64, 1 << 11,
+ r->pd.rate.rate *
+ max(p.dev_most_full_percent, 1U) /
+ max(w.dev_most_full_percent, 1U));
+
+ io_start = atomic_long_read(&clock->now);
+ p = w;
+ prev_start = start;
+ prev_cputime = cputime;
+
+ r->state = REBALANCE_RUNNING;
+ memset(&r->move_stats, 0, sizeof(r->move_stats));
+ rebalance_work_reset(c);
+
+ bch2_move_data(c,
+ /* ratelimiting disabled for now */
+ NULL, /* &r->pd.rate, */
+ writepoint_ptr(&c->rebalance_write_point),
+ POS_MIN, POS_MAX,
+ rebalance_pred, NULL,
+ &r->move_stats);
+ }
+
+ return 0;
+}
+
+ssize_t bch2_rebalance_work_show(struct bch_fs *c, char *buf)
+{
+ char *out = buf, *end = out + PAGE_SIZE;
+ struct bch_fs_rebalance *r = &c->rebalance;
+ struct rebalance_work w = rebalance_work(c);
+ char h1[21], h2[21];
+
+ bch2_hprint(h1, w.dev_most_full_work << 9);
+ bch2_hprint(h2, w.dev_most_full_capacity << 9);
+ out += scnprintf(out, end - out,
+ "fullest_dev (%i):\t%s/%s\n",
+ w.dev_most_full_idx, h1, h2);
+
+ bch2_hprint(h1, w.total_work << 9);
+ bch2_hprint(h2, c->capacity << 9);
+ out += scnprintf(out, end - out,
+ "total work:\t\t%s/%s\n",
+ h1, h2);
+
+ out += scnprintf(out, end - out,
+ "rate:\t\t\t%u\n",
+ r->pd.rate.rate);
+
+ switch (r->state) {
+ case REBALANCE_WAITING:
+ out += scnprintf(out, end - out, "waiting\n");
+ break;
+ case REBALANCE_THROTTLED:
+ bch2_hprint(h1,
+ (r->throttled_until_iotime -
+ atomic_long_read(&c->io_clock[WRITE].now)) << 9);
+ out += scnprintf(out, end - out,
+ "throttled for %lu sec or %s io\n",
+ (r->throttled_until_cputime - jiffies) / HZ,
+ h1);
+ break;
+ case REBALANCE_RUNNING:
+ out += scnprintf(out, end - out, "running\n");
+ out += scnprintf(out, end - out, "pos %llu:%llu\n",
+ r->move_stats.iter.pos.inode,
+ r->move_stats.iter.pos.offset);
+ break;
+ }
+
+ return out - buf;
+}
+
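+/*
+ * Unthrottle the rate controller and clear the thread pointer before
+ * synchronize_rcu(), so rebalance_wakeup() can't wake a task we're about
+ * to stop:
+ */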
+void bch2_rebalance_stop(struct bch_fs *c)
+{
+ struct task_struct *p;
+
+ c->rebalance.pd.rate.rate = UINT_MAX;
+ bch2_ratelimit_reset(&c->rebalance.pd.rate);
+
+ p = rcu_dereference_protected(c->rebalance.thread, 1);
+ c->rebalance.thread = NULL;
+
+ if (p) {
+		/* for synchronizing with rebalance_wakeup() */
+ synchronize_rcu();
+
+ kthread_stop(p);
+ put_task_struct(p);
+ }
+}
+
+int bch2_rebalance_start(struct bch_fs *c)
+{
+ struct task_struct *p;
+
+ if (c->opts.nochanges)
+ return 0;
+
+ p = kthread_create(bch2_rebalance_thread, c, "bch_rebalance");
+ if (IS_ERR(p))
+ return PTR_ERR(p);
+
+ get_task_struct(p);
+ rcu_assign_pointer(c->rebalance.thread, p);
+ wake_up_process(p);
+ return 0;
+}
+
+void bch2_fs_rebalance_init(struct bch_fs *c)
+{
+ bch2_pd_controller_init(&c->rebalance.pd);
+
+ atomic64_set(&c->rebalance.work_unknown_dev, S64_MAX);
+}
diff --git a/libbcachefs/tier.h b/libbcachefs/rebalance.h
index 0c66dfea..2e6aa677 100644
--- a/libbcachefs/tier.h
+++ b/libbcachefs/rebalance.h
@@ -1,12 +1,14 @@
-#ifndef _BCACHEFS_TIER_H
-#define _BCACHEFS_TIER_H
+#ifndef _BCACHEFS_REBALANCE_H
+#define _BCACHEFS_REBALANCE_H
+
+#include "rebalance_types.h"
static inline void rebalance_wakeup(struct bch_fs *c)
{
struct task_struct *p;
rcu_read_lock();
- p = rcu_dereference(c->rebalance_thread);
+ p = rcu_dereference(c->rebalance.thread);
if (p)
wake_up_process(p);
rcu_read_unlock();
@@ -16,8 +18,10 @@ void bch2_rebalance_add_key(struct bch_fs *, struct bkey_s_c,
struct bch_io_opts *);
void bch2_rebalance_add_work(struct bch_fs *, u64);
+ssize_t bch2_rebalance_work_show(struct bch_fs *, char *);
+
void bch2_rebalance_stop(struct bch_fs *);
int bch2_rebalance_start(struct bch_fs *);
void bch2_fs_rebalance_init(struct bch_fs *);
-#endif /* _BCACHEFS_TIER_H */
+#endif /* _BCACHEFS_REBALANCE_H */
diff --git a/libbcachefs/rebalance_types.h b/libbcachefs/rebalance_types.h
new file mode 100644
index 00000000..aaf5b9ca
--- /dev/null
+++ b/libbcachefs/rebalance_types.h
@@ -0,0 +1,26 @@
+#ifndef _BCACHEFS_REBALANCE_TYPES_H
+#define _BCACHEFS_REBALANCE_TYPES_H
+
+#include "move_types.h"
+
+enum rebalance_state {
+ REBALANCE_WAITING,
+ REBALANCE_THROTTLED,
+ REBALANCE_RUNNING,
+};
+
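+/* Runtime state for the background rebalance thread: */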
+struct bch_fs_rebalance {
+ struct task_struct __rcu *thread;
+ struct bch_pd_controller pd;
+
+ atomic64_t work_unknown_dev;
+
+ enum rebalance_state state;
+ unsigned long throttled_until_iotime;
+ unsigned long throttled_until_cputime;
+ struct bch_move_stats move_stats;
+
+ unsigned enabled:1;
+};
+
+#endif /* _BCACHEFS_REBALANCE_TYPES_H */
diff --git a/libbcachefs/six.c b/libbcachefs/six.c
index f0ff8d41..afa59a47 100644
--- a/libbcachefs/six.c
+++ b/libbcachefs/six.c
@@ -146,6 +146,8 @@ struct six_lock_waiter {
/* This is probably up there with the more evil things I've done */
#define waitlist_bitnr(id) ilog2((((union six_lock_state) { .waiters = 1 << (id) }).l))
+#ifdef CONFIG_LOCK_SPIN_ON_OWNER
+
static inline int six_can_spin_on_owner(struct six_lock *lock)
{
struct task_struct *owner;
@@ -257,6 +259,15 @@ fail:
return false;
}
+#else /* CONFIG_LOCK_SPIN_ON_OWNER */
+
+static inline bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type type)
+{
+ return false;
+}
+
+#endif
+
noinline
static void __six_lock_type_slowpath(struct six_lock *lock, enum six_lock_type type)
{
diff --git a/libbcachefs/super-io.c b/libbcachefs/super-io.c
index a2b981a3..9772d597 100644
--- a/libbcachefs/super-io.c
+++ b/libbcachefs/super-io.c
@@ -624,7 +624,7 @@ static void write_one_super(struct bch_fs *c, struct bch_dev *ca, unsigned idx)
bio_set_dev(bio, ca->disk_sb.bdev);
bio->bi_iter.bi_sector = le64_to_cpu(sb->offset);
bio->bi_iter.bi_size =
- roundup(vstruct_bytes(sb),
+ roundup((size_t) vstruct_bytes(sb),
bdev_logical_block_size(ca->disk_sb.bdev));
bio->bi_end_io = write_super_endio;
bio->bi_private = ca;
diff --git a/libbcachefs/super-io.h b/libbcachefs/super-io.h
index f407c205..995b1c90 100644
--- a/libbcachefs/super-io.h
+++ b/libbcachefs/super-io.h
@@ -73,11 +73,6 @@ static inline __u64 jset_magic(struct bch_fs *c)
return __le64_to_cpu(bch2_sb_magic(c) ^ JSET_MAGIC);
}
-static inline __u64 pset_magic(struct bch_fs *c)
-{
- return __le64_to_cpu(bch2_sb_magic(c) ^ PSET_MAGIC);
-}
-
static inline __u64 bset_magic(struct bch_fs *c)
{
return __le64_to_cpu(bch2_sb_magic(c) ^ BSET_MAGIC);
@@ -136,4 +131,7 @@ static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi)
};
}
+size_t bch2_sb_field_to_text(char *, size_t, struct bch_sb *,
+ struct bch_sb_field *);
+
#endif /* _BCACHEFS_SUPER_IO_H */
diff --git a/libbcachefs/super.c b/libbcachefs/super.c
index 16b8cbfc..55da242c 100644
--- a/libbcachefs/super.c
+++ b/libbcachefs/super.c
@@ -33,11 +33,11 @@
#include "migrate.h"
#include "movinggc.h"
#include "quota.h"
+#include "rebalance.h"
#include "replicas.h"
#include "super.h"
#include "super-io.h"
#include "sysfs.h"
-#include "tier.h"
#include <linux/backing-dev.h>
#include <linux/blkdev.h>
@@ -398,10 +398,10 @@ err:
static void bch2_fs_free(struct bch_fs *c)
{
-#define BCH_TIME_STAT(name) \
- bch2_time_stats_exit(&c->name##_time);
- BCH_TIME_STATS()
-#undef BCH_TIME_STAT
+ unsigned i;
+
+ for (i = 0; i < BCH_TIME_STAT_NR; i++)
+ bch2_time_stats_exit(&c->times[i]);
bch2_fs_quota_exit(c);
bch2_fs_fsio_exit(c);
@@ -565,10 +565,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
init_rwsem(&c->gc_lock);
-#define BCH_TIME_STAT(name) \
- bch2_time_stats_init(&c->name##_time);
- BCH_TIME_STATS()
-#undef BCH_TIME_STAT
+ for (i = 0; i < BCH_TIME_STAT_NR; i++)
+ bch2_time_stats_init(&c->times[i]);
bch2_fs_allocator_init(c);
bch2_fs_rebalance_init(c);
@@ -592,14 +590,13 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
seqcount_init(&c->gc_pos_lock);
c->copy_gc_enabled = 1;
- c->rebalance_enabled = 1;
- c->rebalance_percent = 10;
+ c->rebalance.enabled = 1;
c->promote_whole_extents = true;
- c->journal.write_time = &c->journal_write_time;
- c->journal.delay_time = &c->journal_delay_time;
- c->journal.blocked_time = &c->journal_blocked_time;
- c->journal.flush_seq_time = &c->journal_flush_seq_time;
+ c->journal.write_time = &c->times[BCH_TIME_journal_write];
+ c->journal.delay_time = &c->times[BCH_TIME_journal_delay];
+ c->journal.blocked_time = &c->times[BCH_TIME_journal_blocked];
+ c->journal.flush_seq_time = &c->times[BCH_TIME_journal_flush_seq];
bch2_fs_btree_cache_init_early(&c->btree_cache);
@@ -647,7 +644,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
BIOSET_NEED_BVECS) ||
!(c->usage_percpu = alloc_percpu(struct bch_fs_usage)) ||
lg_lock_init(&c->usage_lock) ||
- mempool_init_vp_pool(&c->btree_bounce_pool, 1, btree_bytes(c)) ||
+ mempool_init_kvpmalloc_pool(&c->btree_bounce_pool, 1,
+ btree_bytes(c)) ||
bch2_io_clock_init(&c->io_clock[READ]) ||
bch2_io_clock_init(&c->io_clock[WRITE]) ||
bch2_fs_journal_init(&c->journal) ||
diff --git a/libbcachefs/sysfs.c b/libbcachefs/sysfs.c
index 65345d80..5e341a71 100644
--- a/libbcachefs/sysfs.c
+++ b/libbcachefs/sysfs.c
@@ -24,9 +24,9 @@
#include "keylist.h"
#include "move.h"
#include "opts.h"
+#include "rebalance.h"
#include "replicas.h"
#include "super-io.h"
-#include "tier.h"
#include <linux/blkdev.h>
#include <linux/sort.h>
@@ -183,8 +183,8 @@ rw_attribute(copy_gc_enabled);
sysfs_pd_controller_attribute(copy_gc);
rw_attribute(rebalance_enabled);
-rw_attribute(rebalance_percent);
sysfs_pd_controller_attribute(rebalance);
+read_attribute(rebalance_work);
rw_attribute(promote_whole_extents);
rw_attribute(pd_controllers_update_seconds);
@@ -198,11 +198,11 @@ read_attribute(data_replicas_have);
BCH_DEBUG_PARAMS()
#undef BCH_DEBUG_PARAM
-#define BCH_TIME_STAT(_name) \
+#define x(_name) \
static struct attribute sysfs_time_stat_##_name = \
{ .name = #_name, .mode = S_IRUGO };
BCH_TIME_STATS()
-#undef BCH_TIME_STAT
+#undef x
static struct attribute sysfs_state_rw = {
.name = "state",
@@ -340,9 +340,11 @@ SHOW(bch2_fs)
sysfs_print(pd_controllers_update_seconds,
c->pd_controllers_update_seconds);
- sysfs_printf(rebalance_enabled, "%i", c->rebalance_enabled);
- sysfs_print(rebalance_percent, c->rebalance_percent);
- sysfs_pd_controller_show(rebalance, &c->rebalance_pd); /* XXX */
+ sysfs_printf(rebalance_enabled, "%i", c->rebalance.enabled);
+ sysfs_pd_controller_show(rebalance, &c->rebalance.pd); /* XXX */
+
+ if (attr == &sysfs_rebalance_work)
+ return bch2_rebalance_work_show(c, buf);
sysfs_print(promote_whole_extents, c->promote_whole_extents);
@@ -404,7 +406,7 @@ STORE(__bch2_fs)
}
if (attr == &sysfs_rebalance_enabled) {
- ssize_t ret = strtoul_safe(buf, c->rebalance_enabled)
+ ssize_t ret = strtoul_safe(buf, c->rebalance.enabled)
?: (ssize_t) size;
rebalance_wakeup(c);
@@ -413,9 +415,7 @@ STORE(__bch2_fs)
sysfs_strtoul(pd_controllers_update_seconds,
c->pd_controllers_update_seconds);
-
- sysfs_strtoul(rebalance_percent, c->rebalance_percent);
- sysfs_pd_controller_store(rebalance, &c->rebalance_pd);
+ sysfs_pd_controller_store(rebalance, &c->rebalance.pd);
sysfs_strtoul(promote_whole_extents, c->promote_whole_extents);
@@ -474,7 +474,6 @@ struct attribute *bch2_fs_files[] = {
&sysfs_journal_write_delay_ms,
&sysfs_journal_reclaim_delay_ms,
- &sysfs_rebalance_percent,
&sysfs_promote_whole_extents,
&sysfs_compression_stats,
@@ -513,8 +512,11 @@ struct attribute *bch2_fs_internal_files[] = {
&sysfs_prune_cache,
&sysfs_copy_gc_enabled,
+
&sysfs_rebalance_enabled,
+ &sysfs_rebalance_work,
sysfs_pd_controller_files(rebalance),
+
&sysfs_internal_uuid,
#define BCH_DEBUG_PARAM(name, description) &sysfs_##name,
@@ -613,11 +615,12 @@ SHOW(bch2_fs_time_stats)
{
struct bch_fs *c = container_of(kobj, struct bch_fs, time_stats);
-#define BCH_TIME_STAT(name) \
+#define x(name) \
if (attr == &sysfs_time_stat_##name) \
- return bch2_time_stats_print(&c->name##_time, buf, PAGE_SIZE);
+ return bch2_time_stats_print(&c->times[BCH_TIME_##name],\
+ buf, PAGE_SIZE);
BCH_TIME_STATS()
-#undef BCH_TIME_STAT
+#undef x
return 0;
}
@@ -629,10 +632,10 @@ STORE(bch2_fs_time_stats)
SYSFS_OPS(bch2_fs_time_stats);
struct attribute *bch2_fs_time_stats_files[] = {
-#define BCH_TIME_STAT(name) \
+#define x(name) \
&sysfs_time_stat_##name,
BCH_TIME_STATS()
-#undef BCH_TIME_STAT
+#undef x
NULL
};
diff --git a/libbcachefs/tier.c b/libbcachefs/tier.c
deleted file mode 100644
index a15a0fa9..00000000
--- a/libbcachefs/tier.c
+++ /dev/null
@@ -1,259 +0,0 @@
-
-#include "bcachefs.h"
-#include "alloc.h"
-#include "btree_iter.h"
-#include "buckets.h"
-#include "clock.h"
-#include "disk_groups.h"
-#include "extents.h"
-#include "io.h"
-#include "move.h"
-#include "super-io.h"
-#include "tier.h"
-
-#include <linux/freezer.h>
-#include <linux/kthread.h>
-#include <linux/sched/cputime.h>
-#include <trace/events/bcachefs.h>
-
-static inline bool rebalance_ptr_pred(struct bch_fs *c,
- const struct bch_extent_ptr *ptr,
- struct bch_extent_crc_unpacked crc,
- struct bch_io_opts *io_opts)
-{
- struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
-
- if (io_opts->background_target &&
- !dev_in_target(ca, io_opts->background_target) &&
- !ptr->cached)
- return true;
-
- if (io_opts->background_compression &&
- crc.compression_type !=
- bch2_compression_opt_to_type[io_opts->background_compression])
- return true;
-
- return false;
-}
-
-void bch2_rebalance_add_key(struct bch_fs *c,
- struct bkey_s_c k,
- struct bch_io_opts *io_opts)
-{
- const struct bch_extent_ptr *ptr;
- struct bch_extent_crc_unpacked crc;
- struct bkey_s_c_extent e;
-
- if (!bkey_extent_is_data(k.k))
- return;
-
- if (!io_opts->background_target &&
- !io_opts->background_compression)
- return;
-
- e = bkey_s_c_to_extent(k);
-
- extent_for_each_ptr_crc(e, ptr, crc)
- if (rebalance_ptr_pred(c, ptr, crc, io_opts)) {
- struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
-
- if (!atomic64_add_return(crc.compressed_size,
- &ca->rebalance_work))
- rebalance_wakeup(c);
- }
-}
-
-void bch2_rebalance_add_work(struct bch_fs *c, u64 sectors)
-{
- if (!atomic64_add_return(sectors, &c->rebalance_work_unknown_dev))
- rebalance_wakeup(c);
-}
-
-static enum data_cmd rebalance_pred(struct bch_fs *c, void *arg,
- enum bkey_type type,
- struct bkey_s_c_extent e,
- struct bch_io_opts *io_opts,
- struct data_opts *data_opts)
-{
- const struct bch_extent_ptr *ptr;
- struct bch_extent_crc_unpacked crc;
-
- /* Make sure we have room to add a new pointer: */
- if (bkey_val_u64s(e.k) + BKEY_EXTENT_PTR_U64s_MAX >
- BKEY_EXTENT_VAL_U64s_MAX)
- return DATA_SKIP;
-
- extent_for_each_ptr_crc(e, ptr, crc)
- if (rebalance_ptr_pred(c, ptr, crc, io_opts))
- goto found;
-
- return DATA_SKIP;
-found:
- data_opts->target = io_opts->background_target;
- data_opts->btree_insert_flags = 0;
- return DATA_ADD_REPLICAS;
-}
-
-struct rebalance_work {
- unsigned dev_most_full_percent;
- u64 dev_most_full_work;
- u64 dev_most_full_capacity;
- u64 total_work;
-};
-
-static struct rebalance_work rebalance_work(struct bch_fs *c)
-{
- struct bch_dev *ca;
- struct rebalance_work ret = { 0 };
- unsigned i;
-
- for_each_online_member(ca, c, i) {
- u64 capacity = bucket_to_sector(ca, ca->mi.nbuckets -
- ca->mi.first_bucket);
- u64 work = atomic64_read(&ca->rebalance_work) +
- atomic64_read(&c->rebalance_work_unknown_dev);
- unsigned percent_full = div_u64(work * 100, capacity);
-
- if (percent_full > ret.dev_most_full_percent) {
- ret.dev_most_full_percent = percent_full;
- ret.dev_most_full_work = work;
- ret.dev_most_full_capacity = capacity;
- }
-
- ret.total_work += atomic64_read(&ca->rebalance_work);
- }
-
- ret.total_work += atomic64_read(&c->rebalance_work_unknown_dev);
-
- return ret;
-}
-
-static void rebalance_work_reset(struct bch_fs *c)
-{
- struct bch_dev *ca;
- unsigned i;
-
- for_each_online_member(ca, c, i)
- atomic64_set(&ca->rebalance_work, 0);
-
- atomic64_set(&c->rebalance_work_unknown_dev, 0);
-}
-
-static unsigned long curr_cputime(void)
-{
- u64 utime, stime;
-
- task_cputime_adjusted(current, &utime, &stime);
- return nsecs_to_jiffies(utime + stime);
-}
-
-static int bch2_rebalance_thread(void *arg)
-{
- struct bch_fs *c = arg;
- struct io_clock *clock = &c->io_clock[WRITE];
- struct rebalance_work w, p;
- unsigned long start, prev_start;
- unsigned long prev_run_time, prev_run_cputime;
- unsigned long cputime, prev_cputime;
-
- set_freezable();
-
- p = rebalance_work(c);
- prev_start = jiffies;
- prev_cputime = curr_cputime();
-
- while (!kthread_wait_freezable(c->rebalance_enabled)) {
- struct bch_move_stats move_stats = { 0 };
-
- w = rebalance_work(c);
- start = jiffies;
- cputime = curr_cputime();
-
- prev_run_time = start - prev_start;
- prev_run_cputime = cputime - prev_cputime;
-
- if (!w.total_work) {
- kthread_wait_freezable(rebalance_work(c).total_work);
- continue;
- }
-
- if (w.dev_most_full_percent < 20 &&
- prev_run_cputime * 5 > prev_run_time) {
- if (w.dev_most_full_capacity) {
- bch2_kthread_io_clock_wait(clock,
- atomic_long_read(&clock->now) +
- div_u64(w.dev_most_full_capacity, 5));
- } else {
-
- set_current_state(TASK_INTERRUPTIBLE);
- if (kthread_should_stop())
- break;
-
- schedule_timeout(prev_run_cputime * 5 -
- prev_run_time);
- continue;
- }
- }
-
- /* minimum 1 mb/sec: */
- c->rebalance_pd.rate.rate =
- max_t(u64, 1 << 11,
- c->rebalance_pd.rate.rate *
- max(p.dev_most_full_percent, 1U) /
- max(w.dev_most_full_percent, 1U));
-
- rebalance_work_reset(c);
-
- bch2_move_data(c, &c->rebalance_pd.rate,
- writepoint_ptr(&c->rebalance_write_point),
- POS_MIN, POS_MAX,
- rebalance_pred, NULL,
- &move_stats);
- }
-
- return 0;
-}
-
-void bch2_rebalance_stop(struct bch_fs *c)
-{
- struct task_struct *p;
-
- c->rebalance_pd.rate.rate = UINT_MAX;
- bch2_ratelimit_reset(&c->rebalance_pd.rate);
-
- p = c->rebalance_thread;
- c->rebalance_thread = NULL;
-
- if (p) {
- /* for sychronizing with rebalance_wakeup() */
- synchronize_rcu();
-
- kthread_stop(p);
- put_task_struct(p);
- }
-}
-
-int bch2_rebalance_start(struct bch_fs *c)
-{
- struct task_struct *p;
-
- if (c->opts.nochanges)
- return 0;
-
- p = kthread_create(bch2_rebalance_thread, c, "bch_rebalance");
- if (IS_ERR(p))
- return PTR_ERR(p);
-
- get_task_struct(p);
-
- rcu_assign_pointer(c->rebalance_thread, p);
- wake_up_process(c->rebalance_thread);
- return 0;
-}
-
-void bch2_fs_rebalance_init(struct bch_fs *c)
-{
- bch2_pd_controller_init(&c->rebalance_pd);
-
- atomic64_set(&c->rebalance_work_unknown_dev, S64_MAX);
-}
diff --git a/libbcachefs/util.c b/libbcachefs/util.c
index 1f2c23b9..60e1f1ff 100644
--- a/libbcachefs/util.c
+++ b/libbcachefs/util.c
@@ -203,7 +203,7 @@ bool bch2_is_zero(const void *_p, size_t n)
return true;
}
-void bch2_quantiles_update(struct quantiles *q, u64 v)
+static void bch2_quantiles_update(struct quantiles *q, u64 v)
{
unsigned i = 0;
@@ -569,6 +569,23 @@ start: bv->bv_len = min_t(size_t, PAGE_SIZE - bv->bv_offset,
}
}
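+/*
+ * Allocate a page for each segment of @bio; on failure, free the pages
+ * allocated so far and return -ENOMEM:
+ */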
+int bch2_bio_alloc_pages(struct bio *bio, gfp_t gfp_mask)
+{
+ int i;
+ struct bio_vec *bv;
+
+ bio_for_each_segment_all(bv, bio, i) {
+ bv->bv_page = alloc_page(gfp_mask);
+ if (!bv->bv_page) {
+ while (--bv >= bio->bi_io_vec)
+ __free_page(bv->bv_page);
+ return -ENOMEM;
+ }
+ }
+
+ return 0;
+}
+
size_t bch2_rand_range(size_t max)
{
size_t rand;
@@ -771,20 +788,28 @@ void sort_cmp_size(void *base, size_t num, size_t size,
}
}
-void mempool_free_vp(void *element, void *pool_data)
+static void mempool_free_vp(void *element, void *pool_data)
{
size_t size = (size_t) pool_data;
vpfree(element, size);
}
-void *mempool_alloc_vp(gfp_t gfp_mask, void *pool_data)
+static void *mempool_alloc_vp(gfp_t gfp_mask, void *pool_data)
{
size_t size = (size_t) pool_data;
return vpmalloc(size, gfp_mask);
}
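+/* Back sub-page pools with kmalloc, larger ones with vpmalloc: */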
+int mempool_init_kvpmalloc_pool(mempool_t *pool, int min_nr, size_t size)
+{
+ return size < PAGE_SIZE
+ ? mempool_init_kmalloc_pool(pool, min_nr, size)
+ : mempool_init(pool, min_nr, mempool_alloc_vp,
+ mempool_free_vp, (void *) size);
+}
+
#if 0
void eytzinger1_test(void)
{
diff --git a/libbcachefs/util.h b/libbcachefs/util.h
index 7c7264f4..18491559 100644
--- a/libbcachefs/util.h
+++ b/libbcachefs/util.h
@@ -68,9 +68,9 @@ struct closure;
#define __flatten
#endif
-#ifdef __LITTLE_ENDIAN
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
#define CPU_BIG_ENDIAN 0
-#else
+#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
#define CPU_BIG_ENDIAN 1
#endif
@@ -113,14 +113,7 @@ static inline void *kvpmalloc(size_t size, gfp_t gfp_mask)
: vpmalloc(size, gfp_mask);
}
-void mempool_free_vp(void *element, void *pool_data);
-void *mempool_alloc_vp(gfp_t gfp_mask, void *pool_data);
-
-static inline int mempool_init_vp_pool(mempool_t *pool, int min_nr, size_t size)
-{
- return mempool_init(pool, min_nr, mempool_alloc_vp,
- mempool_free_vp, (void *) size);
-}
+int mempool_init_kvpmalloc_pool(mempool_t *, int, size_t);
#define HEAP(type) \
struct { \
@@ -610,6 +603,7 @@ static inline unsigned fract_exp_two(unsigned x, unsigned fract_bits)
}
void bch2_bio_map(struct bio *bio, void *base);
+int bch2_bio_alloc_pages(struct bio *bio, gfp_t gfp_mask);
static inline sector_t bdev_sectors(struct block_device *bdev)
{
diff --git a/libbcachefs/xattr.c b/libbcachefs/xattr.c
index 79a98f75..c89c7200 100644
--- a/libbcachefs/xattr.c
+++ b/libbcachefs/xattr.c
@@ -5,8 +5,8 @@
#include "compress.h"
#include "extents.h"
#include "fs.h"
+#include "rebalance.h"
#include "str_hash.h"
-#include "tier.h"
#include "xattr.h"
#include <linux/dcache.h>
diff --git a/linux/sched.c b/linux/sched.c
index 2d61c480..de6eb142 100644
--- a/linux/sched.c
+++ b/linux/sched.c
@@ -40,14 +40,22 @@ void schedule(void)
v, NULL, NULL, 0);
}
-static void process_timeout(unsigned long __data)
+struct process_timer {
+ struct timer_list timer;
+ struct task_struct *task;
+};
+
+static void process_timeout(struct timer_list *t)
{
- wake_up_process((struct task_struct *)__data);
+ struct process_timer *timeout =
+ container_of(t, struct process_timer, timer);
+
+ wake_up_process(timeout->task);
}
long schedule_timeout(long timeout)
{
- struct timer_list timer;
+ struct process_timer timer;
unsigned long expire;
switch (timeout)
@@ -80,10 +88,11 @@ long schedule_timeout(long timeout)
expire = timeout + jiffies;
- setup_timer(&timer, process_timeout, (unsigned long)current);
- mod_timer(&timer, expire);
+ timer.task = current;
+ timer_setup_on_stack(&timer.timer, process_timeout, 0);
+ mod_timer(&timer.timer, expire);
schedule();
- del_timer_sync(&timer);
+ del_timer_sync(&timer.timer);
timeout = expire - jiffies;
out:
diff --git a/linux/timer.c b/linux/timer.c
index b67a54ac..dd5aba18 100644
--- a/linux/timer.c
+++ b/linux/timer.c
@@ -273,7 +273,7 @@ static int timer_thread(void *arg)
BUG_ON(!timer_running());
pthread_mutex_unlock(&timer_lock);
- timer->function(timer->data);
+ timer->function(timer);
pthread_mutex_lock(&timer_lock);
timer_seq++;
diff --git a/linux/workqueue.c b/linux/workqueue.c
index f5942772..4dfd6cd9 100644
--- a/linux/workqueue.c
+++ b/linux/workqueue.c
@@ -55,9 +55,10 @@ bool queue_work(struct workqueue_struct *wq, struct work_struct *work)
return ret;
}
-void delayed_work_timer_fn(unsigned long __data)
+void delayed_work_timer_fn(struct timer_list *timer)
{
- struct delayed_work *dwork = (struct delayed_work *) __data;
+ struct delayed_work *dwork =
+ container_of(timer, struct delayed_work, timer);
pthread_mutex_lock(&wq_lock);
__queue_work(dwork->wq, &dwork->work);
@@ -71,8 +72,7 @@ static void __queue_delayed_work(struct workqueue_struct *wq,
struct timer_list *timer = &dwork->timer;
struct work_struct *work = &dwork->work;
- BUG_ON(timer->function != delayed_work_timer_fn ||
- timer->data != (unsigned long)dwork);
+ BUG_ON(timer->function != delayed_work_timer_fn);
BUG_ON(timer_pending(timer));
BUG_ON(!list_empty(&work->entry));