-rw-r--r--   .bcachefs_revision          2
-rw-r--r--   libbcachefs/bkey.c         26
-rw-r--r--   libbcachefs/bkey_sort.c     8
-rw-r--r--   libbcachefs/bset.c        354
-rw-r--r--   libbcachefs/bset.h          4
-rw-r--r--   libbcachefs/btree_cache.c   8
-rw-r--r--   libbcachefs/btree_iter.c    7
-rw-r--r--   libbcachefs/buckets.c      26
-rw-r--r--   libbcachefs/clock.c         7
-rw-r--r--   libbcachefs/clock.h        13
-rw-r--r--   libbcachefs/error.c        13
-rw-r--r--   libbcachefs/error.h         1
-rw-r--r--   libbcachefs/extents.c      35
-rw-r--r--   libbcachefs/fs-io.c       173
-rw-r--r--   libbcachefs/fs-io.h         4
-rw-r--r--   libbcachefs/fs.c           59
-rw-r--r--   libbcachefs/fs.h           37
-rw-r--r--   libbcachefs/fsck.c          2
-rw-r--r--   libbcachefs/io.c           24
-rw-r--r--   libbcachefs/opts.h         11
-rw-r--r--   libbcachefs/reflink.c       4
21 files changed, 426 insertions, 392 deletions
diff --git a/.bcachefs_revision b/.bcachefs_revision
index 9676940a..e0172a41 100644
--- a/.bcachefs_revision
+++ b/.bcachefs_revision
@@ -1 +1 @@
-9e76e8d98c52c128641b0f916a1990a37d60d22e
+b1a4dc53be10a4c3132fccaaf604d73861a52d2d
diff --git a/libbcachefs/bkey.c b/libbcachefs/bkey.c
index ed7ca5b0..4d0c9129 100644
--- a/libbcachefs/bkey.c
+++ b/libbcachefs/bkey.c
@@ -1058,26 +1058,20 @@ int __bch2_bkey_cmp_packed(const struct bkey_packed *l,
const struct bkey_packed *r,
const struct btree *b)
{
- int packed = bkey_lr_packed(l, r);
+ struct bkey unpacked;
- if (likely(packed == BKEY_PACKED_BOTH))
+ if (likely(bkey_packed(l) && bkey_packed(r)))
return __bch2_bkey_cmp_packed_format_checked(l, r, b);
- switch (packed) {
- case BKEY_PACKED_NONE:
- return bkey_cmp(((struct bkey *) l)->p,
- ((struct bkey *) r)->p);
- case BKEY_PACKED_LEFT:
- return __bch2_bkey_cmp_left_packed_format_checked(b,
- (struct bkey_packed *) l,
- &((struct bkey *) r)->p);
- case BKEY_PACKED_RIGHT:
- return -__bch2_bkey_cmp_left_packed_format_checked(b,
- (struct bkey_packed *) r,
- &((struct bkey *) l)->p);
- default:
- unreachable();
+ if (bkey_packed(l)) {
+ __bkey_unpack_key_format_checked(b, &unpacked, l);
+ l = (void*) &unpacked;
+ } else if (bkey_packed(r)) {
+ __bkey_unpack_key_format_checked(b, &unpacked, r);
+ r = (void*) &unpacked;
}
+
+ return bkey_cmp(((struct bkey *) l)->p, ((struct bkey *) r)->p);
}
__pure __flatten
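
The rewrite above collapses the old four-way packed/unpacked switch: when exactly one side is packed, it is unpacked into a stack-allocated struct bkey so both sides can be compared as plain positions. A minimal, self-contained sketch of the same shape, using toy types rather than the real bcachefs structs:

    #include <stdio.h>

    struct pos { unsigned long long inode, offset; };

    /* toy packed key: positions squeezed into 32 bits each */
    struct pkey { unsigned int inode, offset; };

    static int pos_cmp(struct pos l, struct pos r)
    {
            if (l.inode != r.inode)
                    return l.inode < r.inode ? -1 : 1;
            return (l.offset > r.offset) - (l.offset < r.offset);
    }

    static struct pos unpack(const struct pkey *k)
    {
            return (struct pos) { k->inode, k->offset };
    }

    /*
     * Same shape as the new __bch2_bkey_cmp_packed(): unpack whichever
     * side is packed into a stack copy, then do one ordinary comparison
     * instead of a four-way switch.
     */
    static int cmp_mixed(const struct pkey *l_packed, const struct pos *r)
    {
            struct pos l = unpack(l_packed);

            return pos_cmp(l, *r);
    }

    int main(void)
    {
            struct pkey l = { 1, 100 };
            struct pos  r = { 1, 200 };

            printf("%d\n", cmp_mixed(&l, &r)); /* -1 */
            return 0;
    }
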
diff --git a/libbcachefs/bkey_sort.c b/libbcachefs/bkey_sort.c
index e32fad5a..2cac269b 100644
--- a/libbcachefs/bkey_sort.c
+++ b/libbcachefs/bkey_sort.c
@@ -418,7 +418,7 @@ bch2_sort_repack_merge(struct bch_fs *c,
struct bkey_packed *prev = NULL, *k_packed;
struct bkey_s k;
struct btree_nr_keys nr;
- BKEY_PADDED(k) tmp;
+ struct bkey unpacked;
memset(&nr, 0, sizeof(nr));
@@ -426,11 +426,7 @@ bch2_sort_repack_merge(struct bch_fs *c,
if (filter_whiteouts && bkey_whiteout(k_packed))
continue;
- EBUG_ON(bkeyp_val_u64s(&src->format, k_packed) >
- BKEY_EXTENT_VAL_U64s_MAX);
-
- bch2_bkey_unpack(src, &tmp.k, k_packed);
- k = bkey_i_to_s(&tmp.k);
+ k = __bkey_disassemble(src, k_packed, &unpacked);
if (filter_whiteouts &&
bch2_bkey_normalize(c, k))
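
bch2_sort_repack_merge() no longer needs a BKEY_PADDED stack buffer big enough for key plus value: __bkey_disassemble() produces a split view whose key half is the fixed-size unpacked struct bkey while the value bytes stay in the source bset. A toy illustration of the split-view idea (all names here are illustrative, not the real API):

    /* fixed-size unpacked key header */
    struct toy_bkey { unsigned long long inode, offset, size; };

    /* split view: unpacked key on the stack, value referenced in place */
    struct toy_bkey_s {
            struct toy_bkey *k;
            void            *v;
    };

    /*
     * "Disassemble": decode only the small, fixed-size key into caller
     * storage; alias the potentially large value where it already lives.
     */
    static struct toy_bkey_s toy_disassemble(const struct toy_bkey *packed_key,
                                             void *val, struct toy_bkey *storage)
    {
            *storage = *packed_key; /* real code decodes the packed format */

            return (struct toy_bkey_s) { storage, val };
    }

    int main(void)
    {
            struct toy_bkey packed = { 1, 8, 16 }, storage;
            unsigned char val[32] = { 0 };
            struct toy_bkey_s k = toy_disassemble(&packed, val, &storage);

            return k.k->size == 16 && k.v == val ? 0 : 1;
    }
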
diff --git a/libbcachefs/bset.c b/libbcachefs/bset.c
index 19f13b7e..b7618e2b 100644
--- a/libbcachefs/bset.c
+++ b/libbcachefs/bset.c
@@ -294,38 +294,23 @@ static inline void bch2_btree_node_iter_next_check(struct btree_node_iter *iter,
/* Auxiliary search trees */
-#define BFLOAT_FAILED_UNPACKED (U8_MAX - 0)
-#define BFLOAT_FAILED_PREV (U8_MAX - 1)
-#define BFLOAT_FAILED_OVERFLOW (U8_MAX - 2)
-#define BFLOAT_FAILED (U8_MAX - 2)
-
-#define KEY_WORDS BITS_TO_LONGS(1 << BKEY_EXPONENT_BITS)
+#define BFLOAT_FAILED_UNPACKED U8_MAX
+#define BFLOAT_FAILED U8_MAX
struct bkey_float {
u8 exponent;
u8 key_offset;
- union {
- u32 mantissa32;
- struct {
- u16 mantissa16;
- u16 _pad;
- };
- };
-} __packed;
-
-#define BFLOAT_32BIT_NR 32U
+ u16 mantissa;
+};
+#define BKEY_MANTISSA_BITS 16
static unsigned bkey_float_byte_offset(unsigned idx)
{
- int d = (idx - BFLOAT_32BIT_NR) << 1;
-
- d &= ~(d >> 31);
-
- return idx * 6 - d;
+ return idx * sizeof(struct bkey_float);
}
struct ro_aux_tree {
- struct bkey_float _d[0];
+ struct bkey_float f[0];
};
struct rw_aux_tree {
@@ -380,8 +365,8 @@ static unsigned bset_aux_tree_buf_end(const struct bset_tree *t)
return t->aux_data_offset;
case BSET_RO_AUX_TREE:
return t->aux_data_offset +
- DIV_ROUND_UP(bkey_float_byte_offset(t->size) +
- sizeof(u8) * t->size, 8);
+ DIV_ROUND_UP(t->size * sizeof(struct bkey_float) +
+ t->size * sizeof(u8), 8);
case BSET_RW_AUX_TREE:
return t->aux_data_offset +
DIV_ROUND_UP(sizeof(struct rw_aux_tree) * t->size, 8);
@@ -420,17 +405,11 @@ static u8 *ro_aux_tree_prev(const struct btree *b,
return __aux_tree_base(b, t) + bkey_float_byte_offset(t->size);
}
-static struct bkey_float *bkey_float_get(struct ro_aux_tree *b,
- unsigned idx)
-{
- return (void *) b + bkey_float_byte_offset(idx);
-}
-
static struct bkey_float *bkey_float(const struct btree *b,
const struct bset_tree *t,
unsigned idx)
{
- return bkey_float_get(ro_aux_tree_base(b, t), idx);
+ return ro_aux_tree_base(b, t)->f + idx;
}
static void bset_aux_tree_verify(struct btree *b)
@@ -669,21 +648,6 @@ static unsigned rw_aux_tree_bsearch(struct btree *b,
return idx;
}
-static inline unsigned bfloat_mantissa(const struct bkey_float *f,
- unsigned idx)
-{
- return idx < BFLOAT_32BIT_NR ? f->mantissa32 : f->mantissa16;
-}
-
-static inline void bfloat_mantissa_set(struct bkey_float *f,
- unsigned idx, unsigned mantissa)
-{
- if (idx < BFLOAT_32BIT_NR)
- f->mantissa32 = mantissa;
- else
- f->mantissa16 = mantissa;
-}
-
static inline unsigned bkey_mantissa(const struct bkey_packed *k,
const struct bkey_float *f,
unsigned idx)
@@ -703,9 +667,9 @@ static inline unsigned bkey_mantissa(const struct bkey_packed *k,
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
v >>= f->exponent & 7;
#else
- v >>= 64 - (f->exponent & 7) - (idx < BFLOAT_32BIT_NR ? 32 : 16);
+ v >>= 64 - (f->exponent & 7) - BKEY_MANTISSA_BITS;
#endif
- return idx < BFLOAT_32BIT_NR ? (u32) v : (u16) v;
+ return (u16) v;
}
static void make_bfloat(struct btree *b, struct bset_tree *t,
@@ -715,14 +679,10 @@ static void make_bfloat(struct btree *b, struct bset_tree *t,
{
struct bkey_float *f = bkey_float(b, t, j);
struct bkey_packed *m = tree_to_bkey(b, t, j);
- struct bkey_packed *p = tree_to_prev_bkey(b, t, j);
struct bkey_packed *l, *r;
- unsigned bits = j < BFLOAT_32BIT_NR ? 32 : 16;
unsigned mantissa;
int shift, exponent, high_bit;
- EBUG_ON(bkey_next(p) != m);
-
if (is_power_of_2(j)) {
l = min_key;
@@ -764,8 +724,7 @@ static void make_bfloat(struct btree *b, struct bset_tree *t,
* the original key.
*/
- if (!bkey_packed(l) || !bkey_packed(r) ||
- !bkey_packed(p) || !bkey_packed(m) ||
+ if (!bkey_packed(l) || !bkey_packed(r) || !bkey_packed(m) ||
!b->nr_key_bits) {
f->exponent = BFLOAT_FAILED_UNPACKED;
return;
@@ -782,8 +741,8 @@ static void make_bfloat(struct btree *b, struct bset_tree *t,
* of the key: we handle this later:
*/
high_bit = max(bch2_bkey_greatest_differing_bit(b, l, r),
- min_t(unsigned, bits, b->nr_key_bits) - 1);
- exponent = high_bit - (bits - 1);
+ min_t(unsigned, BKEY_MANTISSA_BITS, b->nr_key_bits) - 1);
+ exponent = high_bit - (BKEY_MANTISSA_BITS - 1);
/*
* Then we calculate the actual shift value, from the start of the key
@@ -792,12 +751,12 @@ static void make_bfloat(struct btree *b, struct bset_tree *t,
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
shift = (int) (b->format.key_u64s * 64 - b->nr_key_bits) + exponent;
- EBUG_ON(shift + bits > b->format.key_u64s * 64);
+ EBUG_ON(shift + BKEY_MANTISSA_BITS > b->format.key_u64s * 64);
#else
shift = high_bit_offset +
b->nr_key_bits -
exponent -
- bits;
+ BKEY_MANTISSA_BITS;
EBUG_ON(shift < KEY_PACKED_BITS_START);
#endif
@@ -813,37 +772,7 @@ static void make_bfloat(struct btree *b, struct bset_tree *t,
if (exponent < 0)
mantissa |= ~(~0U << -exponent);
- bfloat_mantissa_set(f, j, mantissa);
-
- /*
- * The bfloat must be able to tell its key apart from the previous key -
- * if its key and the previous key don't differ in the required bits,
- * flag as failed - unless the keys are actually equal, in which case
- * we aren't required to return a specific one:
- */
- if (exponent > 0 &&
- bfloat_mantissa(f, j) == bkey_mantissa(p, f, j) &&
- bkey_cmp_packed(b, p, m)) {
- f->exponent = BFLOAT_FAILED_PREV;
- return;
- }
-
- /*
- * f->mantissa must compare >= the original key - for transitivity with
- * the comparison in bset_search_tree. If we're dropping set bits,
- * increment it:
- */
- if (exponent > (int) bch2_bkey_ffs(b, m)) {
- if (j < BFLOAT_32BIT_NR
- ? f->mantissa32 == U32_MAX
- : f->mantissa16 == U16_MAX)
- f->exponent = BFLOAT_FAILED_OVERFLOW;
-
- if (j < BFLOAT_32BIT_NR)
- f->mantissa32++;
- else
- f->mantissa16++;
- }
+ f->mantissa = mantissa;
}
/* bytes remaining - only valid for last bset: */
@@ -856,14 +785,8 @@ static unsigned __bset_tree_capacity(struct btree *b, struct bset_tree *t)
static unsigned bset_ro_tree_capacity(struct btree *b, struct bset_tree *t)
{
- unsigned bytes = __bset_tree_capacity(b, t);
-
- if (bytes < 7 * BFLOAT_32BIT_NR)
- return bytes / 7;
-
- bytes -= 7 * BFLOAT_32BIT_NR;
-
- return BFLOAT_32BIT_NR + bytes / 5;
+ return __bset_tree_capacity(b, t) /
+ (sizeof(struct bkey_float) + sizeof(u8));
}
static unsigned bset_rw_tree_capacity(struct btree *b, struct bset_tree *t)
@@ -1333,14 +1256,38 @@ static struct bkey_packed *bset_search_write_set(const struct btree *b,
return rw_aux_to_bkey(b, t, l);
}
-noinline
-static int bset_search_tree_slowpath(const struct btree *b,
- struct bset_tree *t, struct bpos *search,
- const struct bkey_packed *packed_search,
- unsigned n)
+static inline void prefetch_four_cachelines(void *p)
+{
+#ifdef CONFIG_X86_64
+ asm(".intel_syntax noprefix;"
+ "prefetcht0 [%0 - 127 + 64 * 0];"
+ "prefetcht0 [%0 - 127 + 64 * 1];"
+ "prefetcht0 [%0 - 127 + 64 * 2];"
+ "prefetcht0 [%0 - 127 + 64 * 3];"
+ ".att_syntax prefix;"
+ :
+ : "r" (p + 127));
+#else
+ prefetch(p + L1_CACHE_BYTES * 0);
+ prefetch(p + L1_CACHE_BYTES * 1);
+ prefetch(p + L1_CACHE_BYTES * 2);
+ prefetch(p + L1_CACHE_BYTES * 3);
+#endif
+}
+
+static inline bool bkey_mantissa_bits_dropped(const struct btree *b,
+ const struct bkey_float *f,
+ unsigned idx)
{
- return bkey_cmp_p_or_unp(b, tree_to_bkey(b, t, n),
- packed_search, search) < 0;
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+ unsigned key_bits_start = b->format.key_u64s * 64 - b->nr_key_bits;
+
+ return f->exponent > key_bits_start;
+#else
+ unsigned key_bits_end = high_bit_offset + b->nr_key_bits;
+
+ return f->exponent + BKEY_MANTISSA_BITS < key_bits_end;
+#endif
}
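
bkey_mantissa_bits_dropped() above answers one question: did the 16-bit mantissa window cut off low-order key bits? If it did, two equal mantissas are inconclusive, since the keys may still differ below the window, so the search loop falls back to a full key comparison instead of trusting the tie. A small numeric demonstration with toy 64-bit keys:

    #include <stdint.h>
    #include <stdio.h>

    /* take a 16-bit window of a 64-bit key, starting "exponent" bits up */
    static uint16_t mantissa(uint64_t key, unsigned exponent)
    {
            return (uint16_t) (key >> exponent);
    }

    int main(void)
    {
            uint64_t a = 0x12340000, b = 0x12345678;
            unsigned exponent = 16; /* window covers bits 16..31 */

            /* both print 0x1234: the differing low 16 bits were dropped */
            printf("%x %x\n", mantissa(a, exponent), mantissa(b, exponent));

            /*
             * equal mantissas + dropped bits => inconclusive: a full key
             * compare is required, which is exactly the new slowpath in
             * bset_search_tree().
             */
            return 0;
    }
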
__flatten
@@ -1350,44 +1297,37 @@ static struct bkey_packed *bset_search_tree(const struct btree *b,
const struct bkey_packed *packed_search)
{
struct ro_aux_tree *base = ro_aux_tree_base(b, t);
- struct bkey_float *f = bkey_float_get(base, 1);
- void *p;
- unsigned inorder, n = 1;
+ struct bkey_float *f;
+ struct bkey_packed *k;
+ unsigned inorder, n = 1, l, r;
+ int cmp;
- while (1) {
- if (likely(n << 4 < t->size)) {
- p = bkey_float_get(base, n << 4);
- prefetch(p);
- } else if (n << 3 < t->size) {
- inorder = __eytzinger1_to_inorder(n, t->size, t->extra);
- p = bset_cacheline(b, t, inorder);
-#ifdef CONFIG_X86_64
- asm(".intel_syntax noprefix;"
- "prefetcht0 [%0 - 127 + 64 * 0];"
- "prefetcht0 [%0 - 127 + 64 * 1];"
- "prefetcht0 [%0 - 127 + 64 * 2];"
- "prefetcht0 [%0 - 127 + 64 * 3];"
- ".att_syntax prefix;"
- :
- : "r" (p + 127));
-#else
- prefetch(p + L1_CACHE_BYTES * 0);
- prefetch(p + L1_CACHE_BYTES * 1);
- prefetch(p + L1_CACHE_BYTES * 2);
- prefetch(p + L1_CACHE_BYTES * 3);
-#endif
- } else if (n >= t->size)
- break;
+ do {
+ if (likely(n << 4 < t->size))
+ prefetch(&base->f[n << 4]);
- f = bkey_float_get(base, n);
+ f = &base->f[n];
- if (packed_search &&
- likely(f->exponent < BFLOAT_FAILED))
- n = n * 2 + (bfloat_mantissa(f, n) <
- bkey_mantissa(packed_search, f, n));
- else
- n = n * 2 + bset_search_tree_slowpath(b, t,
- search, packed_search, n);
+ if (!unlikely(packed_search))
+ goto slowpath;
+ if (unlikely(f->exponent >= BFLOAT_FAILED))
+ goto slowpath;
+
+ l = f->mantissa;
+ r = bkey_mantissa(packed_search, f, n);
+
+ if (unlikely(l == r) && bkey_mantissa_bits_dropped(b, f, n))
+ goto slowpath;
+
+ n = n * 2 + (l < r);
+ continue;
+slowpath:
+ k = tree_to_bkey(b, t, n);
+ cmp = bkey_cmp_p_or_unp(b, k, packed_search, search);
+ if (!cmp)
+ return k;
+
+ n = n * 2 + (cmp < 0);
} while (n < t->size);
inorder = __eytzinger1_to_inorder(n >> 1, t->size, t->extra);
@@ -1396,29 +1336,23 @@ static struct bkey_packed *bset_search_tree(const struct btree *b,
* n would have been the node we recursed to - the low bit tells us if
* we recursed left or recursed right.
*/
- if (n & 1) {
- return cacheline_to_bkey(b, t, inorder, f->key_offset);
- } else {
- if (--inorder) {
- n = eytzinger1_prev(n >> 1, t->size);
- f = bkey_float_get(base, n);
- return cacheline_to_bkey(b, t, inorder, f->key_offset);
- } else
+ if (likely(!(n & 1))) {
+ --inorder;
+ if (unlikely(!inorder))
return btree_bkey_first(b, t);
+
+ f = &base->f[eytzinger1_prev(n >> 1, t->size)];
}
+
+ return cacheline_to_bkey(b, t, inorder, f->key_offset);
}
-/*
- * Returns the first key greater than or equal to @search
- */
-__always_inline __flatten
-static struct bkey_packed *bch2_bset_search(struct btree *b,
+static __always_inline __flatten
+struct bkey_packed *__bch2_bset_search(struct btree *b,
struct bset_tree *t,
struct bpos *search,
- struct bkey_packed *packed_search,
const struct bkey_packed *lossy_packed_search)
{
- struct bkey_packed *m;
/*
* First, we search for a cacheline, then lastly we do a linear search
@@ -1437,11 +1371,9 @@ static struct bkey_packed *bch2_bset_search(struct btree *b,
switch (bset_aux_tree_type(t)) {
case BSET_NO_AUX_TREE:
- m = btree_bkey_first(b, t);
- break;
+ return btree_bkey_first(b, t);
case BSET_RW_AUX_TREE:
- m = bset_search_write_set(b, t, search, lossy_packed_search);
- break;
+ return bset_search_write_set(b, t, search, lossy_packed_search);
case BSET_RO_AUX_TREE:
/*
* Each node in the auxiliary search tree covers a certain range
@@ -1453,10 +1385,20 @@ static struct bkey_packed *bch2_bset_search(struct btree *b,
if (bkey_cmp(*search, t->max_key) > 0)
return btree_bkey_last(b, t);
- m = bset_search_tree(b, t, search, lossy_packed_search);
- break;
+ return bset_search_tree(b, t, search, lossy_packed_search);
+ default:
+ unreachable();
}
+}
+static __always_inline __flatten
+struct bkey_packed *bch2_bset_search_linear(struct btree *b,
+ struct bset_tree *t,
+ struct bpos *search,
+ struct bkey_packed *packed_search,
+ const struct bkey_packed *lossy_packed_search,
+ struct bkey_packed *m)
+{
if (lossy_packed_search)
while (m != btree_bkey_last(b, t) &&
bkey_iter_cmp_p_or_unp(b, search, lossy_packed_search,
@@ -1479,6 +1421,23 @@ static struct bkey_packed *bch2_bset_search(struct btree *b,
return m;
}
+/*
+ * Returns the first key greater than or equal to @search
+ */
+static __always_inline __flatten
+struct bkey_packed *bch2_bset_search(struct btree *b,
+ struct bset_tree *t,
+ struct bpos *search,
+ struct bkey_packed *packed_search,
+ const struct bkey_packed *lossy_packed_search)
+{
+ struct bkey_packed *m = __bch2_bset_search(b, t, search,
+ lossy_packed_search);
+
+ return bch2_bset_search_linear(b, t, search,
+ packed_search, lossy_packed_search, m);
+}
+
/* Btree node iterator */
static inline void __bch2_btree_node_iter_push(struct btree_node_iter *iter,
@@ -1569,9 +1528,10 @@ __flatten
void bch2_btree_node_iter_init(struct btree_node_iter *iter,
struct btree *b, struct bpos *search)
{
- struct bset_tree *t;
struct bkey_packed p, *packed_search = NULL;
struct btree_node_iter_set *pos = iter->data;
+ struct bkey_packed *k[MAX_BSETS];
+ unsigned i;
EBUG_ON(bkey_cmp(*search, b->data->min_key) < 0);
bset_aux_tree_verify(b);
@@ -1590,14 +1550,20 @@ void bch2_btree_node_iter_init(struct btree_node_iter *iter,
return;
}
- for_each_bset(b, t) {
- struct bkey_packed *k = bch2_bset_search(b, t, search,
- packed_search, &p);
+ for (i = 0; i < b->nsets; i++) {
+ k[i] = __bch2_bset_search(b, b->set + i, search, &p);
+ prefetch_four_cachelines(k[i]);
+ }
+
+ for (i = 0; i < b->nsets; i++) {
+ struct bset_tree *t = b->set + i;
struct bkey_packed *end = btree_bkey_last(b, t);
- if (k != end)
+ k[i] = bch2_bset_search_linear(b, t, search,
+ packed_search, &p, k[i]);
+ if (k[i] != end)
*pos++ = (struct btree_node_iter_set) {
- __btree_node_key_to_offset(b, k),
+ __btree_node_key_to_offset(b, k[i]),
__btree_node_key_to_offset(b, end)
};
}
@@ -1794,17 +1760,9 @@ void bch2_btree_keys_stats(struct btree *b, struct bset_stats *stats)
stats->floats += t->size - 1;
for (j = 1; j < t->size; j++)
- switch (bkey_float(b, t, j)->exponent) {
- case BFLOAT_FAILED_UNPACKED:
- stats->failed_unpacked++;
- break;
- case BFLOAT_FAILED_PREV:
- stats->failed_prev++;
- break;
- case BFLOAT_FAILED_OVERFLOW:
- stats->failed_overflow++;
- break;
- }
+ stats->failed +=
+ bkey_float(b, t, j)->exponent ==
+ BFLOAT_FAILED;
}
}
}
@@ -1813,9 +1771,7 @@ void bch2_bfloat_to_text(struct printbuf *out, struct btree *b,
struct bkey_packed *k)
{
struct bset_tree *t = bch2_bkey_to_bset(b, k);
- struct bkey_packed *l, *r, *p;
- struct bkey uk, up;
- char buf1[200], buf2[200];
+ struct bkey uk;
unsigned j, inorder;
if (out->pos != out->end)
@@ -1833,7 +1789,7 @@ void bch2_bfloat_to_text(struct printbuf *out, struct btree *b,
return;
switch (bkey_float(b, t, j)->exponent) {
- case BFLOAT_FAILED_UNPACKED:
+ case BFLOAT_FAILED:
uk = bkey_unpack_key(b, k);
pr_buf(out,
" failed unpacked at depth %u\n"
@@ -1841,41 +1797,5 @@ void bch2_bfloat_to_text(struct printbuf *out, struct btree *b,
ilog2(j),
uk.p.inode, uk.p.offset);
break;
- case BFLOAT_FAILED_PREV:
- p = tree_to_prev_bkey(b, t, j);
- l = is_power_of_2(j)
- ? btree_bkey_first(b, t)
- : tree_to_prev_bkey(b, t, j >> ffs(j));
- r = is_power_of_2(j + 1)
- ? bch2_bkey_prev_all(b, t, btree_bkey_last(b, t))
- : tree_to_bkey(b, t, j >> (ffz(j) + 1));
-
- up = bkey_unpack_key(b, p);
- uk = bkey_unpack_key(b, k);
- bch2_to_binary(buf1, high_word(&b->format, p), b->nr_key_bits);
- bch2_to_binary(buf2, high_word(&b->format, k), b->nr_key_bits);
-
- pr_buf(out,
- " failed prev at depth %u\n"
- "\tkey starts at bit %u but first differing bit at %u\n"
- "\t%llu:%llu\n"
- "\t%llu:%llu\n"
- "\t%s\n"
- "\t%s\n",
- ilog2(j),
- bch2_bkey_greatest_differing_bit(b, l, r),
- bch2_bkey_greatest_differing_bit(b, p, k),
- uk.p.inode, uk.p.offset,
- up.p.inode, up.p.offset,
- buf1, buf2);
- break;
- case BFLOAT_FAILED_OVERFLOW:
- uk = bkey_unpack_key(b, k);
- pr_buf(out,
- " failed overflow at depth %u\n"
- "\t%llu:%llu\n",
- ilog2(j),
- uk.p.inode, uk.p.offset);
- break;
}
}
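
The auxiliary tree nodes live in an eytzinger (1-indexed, breadth-first) layout, which is what makes the simplified prefetch cheap: the sixteen descendants four levels below node n occupy the contiguous slots n*16 .. n*16+15, and with bkey_float now a fixed 4 bytes they fill exactly one 64-byte cache line, hence the lone prefetch(&base->f[n << 4]). A self-contained sketch of searching through an eytzinger layout (the standard technique, not the bcachefs code itself):

    #include <stdio.h>
    #include <strings.h>

    #define N 15

    static int eytz[N + 1]; /* 1-indexed */

    /* fill eytz[] from a sorted array by in-order traversal */
    static int build(const int *sorted, int i, int k)
    {
            if (k <= N) {
                    i = build(sorted, i, 2 * k);
                    eytz[k] = sorted[i++];
                    i = build(sorted, i, 2 * k + 1);
            }
            return i;
    }

    /* index (in eytz[]) of the first element >= x, 0 if none */
    static int lower_bound(int x)
    {
            int k = 1;

            while (k <= N)
                    k = 2 * k + (eytz[k] < x); /* left if >=, right if < */
            return k >> ffs(~k);
    }

    int main(void)
    {
            int sorted[N];
            int i;

            for (i = 0; i < N; i++)
                    sorted[i] = 2 * i; /* 0, 2, 4, ... 28 */
            build(sorted, 0, 1);

            printf("%d\n", eytz[lower_bound(7)]); /* 8 */
            return 0;
    }
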
diff --git a/libbcachefs/bset.h b/libbcachefs/bset.h
index 643bd9e8..ccc0866d 100644
--- a/libbcachefs/bset.h
+++ b/libbcachefs/bset.h
@@ -582,9 +582,7 @@ struct bset_stats {
} sets[BSET_TREE_NR_TYPES];
size_t floats;
- size_t failed_unpacked;
- size_t failed_prev;
- size_t failed_overflow;
+ size_t failed;
};
void bch2_btree_keys_stats(struct btree *, struct bset_stats *);
diff --git a/libbcachefs/btree_cache.c b/libbcachefs/btree_cache.c
index 41694951..5d3acba5 100644
--- a/libbcachefs/btree_cache.c
+++ b/libbcachefs/btree_cache.c
@@ -909,9 +909,7 @@ void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c,
" nr packed keys %u\n"
" nr unpacked keys %u\n"
" floats %zu\n"
- " failed unpacked %zu\n"
- " failed prev %zu\n"
- " failed overflow %zu\n",
+ " failed unpacked %zu\n",
f->key_u64s,
f->bits_per_field[0],
f->bits_per_field[1],
@@ -928,7 +926,5 @@ void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c,
b->nr.packed_keys,
b->nr.unpacked_keys,
stats.floats,
- stats.failed_unpacked,
- stats.failed_prev,
- stats.failed_overflow);
+ stats.failed);
}
diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c
index 5d4a2cb8..a4180124 100644
--- a/libbcachefs/btree_iter.c
+++ b/libbcachefs/btree_iter.c
@@ -1096,7 +1096,12 @@ static int btree_iter_traverse_one(struct btree_iter *iter)
if (unlikely(iter->level >= BTREE_MAX_DEPTH))
return 0;
- if (iter->uptodate == BTREE_ITER_NEED_RELOCK)
+ /*
+ * if we need interior nodes locked, call btree_iter_relock() to make
+ * sure we walk back up enough that we lock them:
+ */
+ if (iter->uptodate == BTREE_ITER_NEED_RELOCK ||
+ iter->locks_want > 1)
bch2_btree_iter_relock(iter, false);
if (iter->uptodate < BTREE_ITER_NEED_RELOCK)
diff --git a/libbcachefs/buckets.c b/libbcachefs/buckets.c
index c4183982..8d223aa2 100644
--- a/libbcachefs/buckets.c
+++ b/libbcachefs/buckets.c
@@ -1464,7 +1464,7 @@ static int bch2_trans_mark_pointer(struct btree_trans *trans,
struct bkey_s_c k;
struct bkey_alloc_unpacked u;
struct bkey_i_alloc *a;
- unsigned old;
+ u16 *dst_sectors;
bool overflow;
int ret;
@@ -1519,22 +1519,24 @@ static int bch2_trans_mark_pointer(struct btree_trans *trans,
goto out;
}
- if (!p.ptr.cached) {
- old = u.dirty_sectors;
- overflow = checked_add(u.dirty_sectors, sectors);
- } else {
- old = u.cached_sectors;
- overflow = checked_add(u.cached_sectors, sectors);
+ dst_sectors = !p.ptr.cached
+ ? &u.dirty_sectors
+ : &u.cached_sectors;
+
+ overflow = checked_add(*dst_sectors, sectors);
+
+ if (overflow) {
+ bch2_fs_inconsistent(c,
+ "bucket sector count overflow: %u + %lli > U16_MAX",
+ *dst_sectors, sectors);
+ /* return an error indicating that we need full fsck */
+ ret = -EIO;
+ goto out;
}
u.data_type = u.dirty_sectors || u.cached_sectors
? data_type : 0;
- bch2_fs_inconsistent_on(overflow, c,
- "bucket sector count overflow: %u + %lli > U16_MAX",
- old, sectors);
- BUG_ON(overflow);
-
a = trans_update_key(trans, iter, BKEY_ALLOC_U64s_MAX);
ret = PTR_ERR_OR_ZERO(a);
if (ret)
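
The overflow handling above changes in two ways: the dirty/cached branch becomes a pointer select, and an overflowing bucket count now reports an inconsistency and returns -EIO instead of BUG_ON()ing the kernel. A userspace sketch of the same shape, with a stand-in for checked_add() (the real macro lives in the bcachefs headers):

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    #define U16_MAX 0xffff

    typedef uint16_t u16;
    typedef int64_t  s64;

    struct alloc { u16 dirty_sectors, cached_sectors; };

    /* stand-in for checked_add(): add and report overflow past U16_MAX */
    static bool checked_add(u16 *dst, s64 sectors)
    {
            s64 res = (s64) *dst + sectors;

            *dst = (u16) res;
            return res > U16_MAX || res < 0;
    }

    static int mark_pointer(struct alloc *u, bool cached, s64 sectors)
    {
            u16 *dst_sectors = !cached ? &u->dirty_sectors
                                       : &u->cached_sectors;

            if (checked_add(dst_sectors, sectors)) {
                    fprintf(stderr,
                            "bucket sector count overflow: need full fsck\n");
                    return -5; /* -EIO in the kernel */
            }
            return 0;
    }

    int main(void)
    {
            struct alloc u = { .dirty_sectors = 0xfff0 };

            printf("%d\n", mark_pointer(&u, false, 0x20)); /* overflow: -5 */
            return 0;
    }
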
diff --git a/libbcachefs/clock.c b/libbcachefs/clock.c
index 8ac6990c..f1826633 100644
--- a/libbcachefs/clock.c
+++ b/libbcachefs/clock.c
@@ -135,17 +135,16 @@ static struct io_timer *get_expired_timer(struct io_clock *clock,
return ret;
}
-void bch2_increment_clock(struct bch_fs *c, unsigned sectors, int rw)
+void __bch2_increment_clock(struct io_clock *clock)
{
- struct io_clock *clock = &c->io_clock[rw];
struct io_timer *timer;
unsigned long now;
+ unsigned sectors;
/* Buffer up one megabyte worth of IO in the percpu counter */
preempt_disable();
- if (likely(this_cpu_add_return(*clock->pcpu_buf, sectors) <
- IO_CLOCK_PCPU_SECTORS)) {
+ if (this_cpu_read(*clock->pcpu_buf) < IO_CLOCK_PCPU_SECTORS) {
preempt_enable();
return;
}
diff --git a/libbcachefs/clock.h b/libbcachefs/clock.h
index 5cb043c5..bfbbca8a 100644
--- a/libbcachefs/clock.h
+++ b/libbcachefs/clock.h
@@ -6,7 +6,18 @@ void bch2_io_timer_add(struct io_clock *, struct io_timer *);
void bch2_io_timer_del(struct io_clock *, struct io_timer *);
void bch2_kthread_io_clock_wait(struct io_clock *, unsigned long,
unsigned long);
-void bch2_increment_clock(struct bch_fs *, unsigned, int);
+
+void __bch2_increment_clock(struct io_clock *);
+
+static inline void bch2_increment_clock(struct bch_fs *c, unsigned sectors,
+ int rw)
+{
+ struct io_clock *clock = &c->io_clock[rw];
+
+ if (unlikely(this_cpu_add_return(*clock->pcpu_buf, sectors) >=
+ IO_CLOCK_PCPU_SECTORS))
+ __bch2_increment_clock(clock);
+}
void bch2_io_clock_schedule_timeout(struct io_clock *, unsigned long);
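
Splitting __bch2_increment_clock() out of line lets the common case stay in this header: each CPU accumulates sectors in a local buffer and only takes the slow path once the buffer crosses IO_CLOCK_PCPU_SECTORS. A thread-local userspace analogue of the batching pattern (C11 atomics standing in for the kernel percpu API):

    #include <stdatomic.h>
    #include <stdio.h>

    #define PCPU_BATCH 2048 /* stand-in for IO_CLOCK_PCPU_SECTORS */

    static atomic_ulong clock_now;
    static _Thread_local unsigned long pcpu_buf;

    /* out-of-line slow path: flush this thread's buffer into the clock */
    static void __increment_clock(void)
    {
            atomic_fetch_add(&clock_now, pcpu_buf);
            pcpu_buf = 0;
    }

    /* inline fast path: one thread-local add, no shared-cacheline traffic */
    static inline void increment_clock(unsigned sectors)
    {
            pcpu_buf += sectors;
            if (pcpu_buf >= PCPU_BATCH)
                    __increment_clock();
    }

    int main(void)
    {
            for (int i = 0; i < 10000; i++)
                    increment_clock(1);
            __increment_clock(); /* final flush */
            printf("%lu\n", atomic_load(&clock_now)); /* 10000 */
            return 0;
    }
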
diff --git a/libbcachefs/error.c b/libbcachefs/error.c
index 304ff925..5a5cfee6 100644
--- a/libbcachefs/error.c
+++ b/libbcachefs/error.c
@@ -64,7 +64,7 @@ void bch2_io_error(struct bch_dev *ca)
enum fsck_err_ret bch2_fsck_err(struct bch_fs *c, unsigned flags,
const char *fmt, ...)
{
- struct fsck_err_state *s;
+ struct fsck_err_state *s = NULL;
va_list args;
bool fix = false, print = true, suppressing = false;
char _buf[sizeof(s->buf)], *buf = _buf;
@@ -99,8 +99,13 @@ enum fsck_err_ret bch2_fsck_err(struct bch_fs *c, unsigned flags,
found:
list_move(&s->list, &c->fsck_errors);
s->nr++;
- suppressing = s->nr == FSCK_ERR_RATELIMIT_NR;
- print = s->nr <= FSCK_ERR_RATELIMIT_NR;
+ if (c->opts.ratelimit_errors &&
+ s->nr >= FSCK_ERR_RATELIMIT_NR) {
+ if (s->nr == FSCK_ERR_RATELIMIT_NR)
+ suppressing = true;
+ else
+ print = false;
+ }
buf = s->buf;
print:
va_start(args, fmt);
@@ -156,7 +161,7 @@ void bch2_flush_fsck_errs(struct bch_fs *c)
mutex_lock(&c->fsck_error_lock);
list_for_each_entry_safe(s, n, &c->fsck_errors, list) {
- if (s->nr > FSCK_ERR_RATELIMIT_NR)
+ if (s->ratelimited)
bch_err(c, "Saw %llu errors like:\n %s", s->nr, s->buf);
list_del(&s->list);
diff --git a/libbcachefs/error.h b/libbcachefs/error.h
index 2591e123..7dcb0f65 100644
--- a/libbcachefs/error.h
+++ b/libbcachefs/error.h
@@ -114,6 +114,7 @@ struct fsck_err_state {
struct list_head list;
const char *fmt;
u64 nr;
+ bool ratelimited;
char buf[512];
};
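
Together with the error.c hunk above, this implements per-format ratelimiting: when opts.ratelimit_errors is set, the Nth occurrence of a message prints a suppression notice, later occurrences are only counted, and the final flush prints a summary. A compact sketch of that policy (simplified: the summary here keys off the count rather than the new ratelimited flag):

    #include <stdbool.h>
    #include <stdio.h>

    #define RATELIMIT_NR 10

    struct err_state { const char *fmt; unsigned long long nr; };

    static void fsck_err(struct err_state *s, bool ratelimit_errors)
    {
            bool suppressing = false, print = true;

            s->nr++;
            if (ratelimit_errors && s->nr >= RATELIMIT_NR) {
                    if (s->nr == RATELIMIT_NR)
                            suppressing = true;
                    else
                            print = false;
            }

            if (print)
                    printf("%s\n", s->fmt);
            if (suppressing)
                    printf("Ratelimiting new instances of previous error\n");
    }

    static void flush_errs(struct err_state *s)
    {
            if (s->nr >= RATELIMIT_NR)
                    printf("Saw %llu errors like:\n  %s\n", s->nr, s->fmt);
    }

    int main(void)
    {
            struct err_state s = { "bad extent", 0 };

            for (int i = 0; i < 25; i++)
                    fsck_err(&s, true);
            flush_errs(&s);
            return 0;
    }
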
diff --git a/libbcachefs/extents.c b/libbcachefs/extents.c
index 4cc2a4b1..b9c69792 100644
--- a/libbcachefs/extents.c
+++ b/libbcachefs/extents.c
@@ -1218,7 +1218,6 @@ void bch2_insert_fixup_extent(struct btree_trans *trans,
struct bkey_i whiteout = *insert;
struct bkey_packed *_k;
struct bkey unpacked;
- BKEY_PADDED(k) tmp;
EBUG_ON(iter->level);
EBUG_ON(!insert->k.size);
@@ -1292,25 +1291,23 @@ next:
bch2_btree_iter_set_pos_same_leaf(iter, insert->k.p);
if (update_btree) {
- bkey_copy(&tmp.k, insert);
-
if (deleting)
- tmp.k.k.type = KEY_TYPE_discard;
+ insert->k.type = KEY_TYPE_discard;
- EBUG_ON(bkey_deleted(&tmp.k.k) || !tmp.k.k.size);
+ EBUG_ON(bkey_deleted(&insert->k) || !insert->k.size);
- extent_bset_insert(c, iter, &tmp.k);
+ extent_bset_insert(c, iter, insert);
}
if (update_journal) {
- bkey_copy(&tmp.k, !deleting ? insert : &whiteout);
+ struct bkey_i *k = !deleting ? insert : &whiteout;
if (deleting)
- tmp.k.k.type = KEY_TYPE_discard;
+ k->k.type = KEY_TYPE_discard;
- EBUG_ON(bkey_deleted(&tmp.k.k) || !tmp.k.k.size);
+ EBUG_ON(bkey_deleted(&k->k) || !k->k.size);
- bch2_btree_journal_key(trans, iter, &tmp.k);
+ bch2_btree_journal_key(trans, iter, k);
}
bch2_cut_front(insert->k.p, insert);
@@ -1390,16 +1387,18 @@ static unsigned bch2_crc_field_size_max[] = {
};
static void bch2_extent_crc_pack(union bch_extent_crc *dst,
- struct bch_extent_crc_unpacked src)
+ struct bch_extent_crc_unpacked src,
+ enum bch_extent_entry_type type)
{
#define set_common_fields(_dst, _src) \
+ _dst.type = 1 << type; \
_dst.csum_type = _src.csum_type, \
_dst.compression_type = _src.compression_type, \
_dst._compressed_size = _src.compressed_size - 1, \
_dst._uncompressed_size = _src.uncompressed_size - 1, \
_dst.offset = _src.offset
- switch (extent_entry_type(to_entry(dst))) {
+ switch (type) {
case BCH_EXTENT_ENTRY_crc32:
set_common_fields(dst->crc32, src);
dst->crc32.csum = *((__le32 *) &src.csum.lo);
@@ -1426,23 +1425,24 @@ void bch2_extent_crc_append(struct bkey_i *k,
{
struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k));
union bch_extent_crc *crc = (void *) ptrs.end;
+ enum bch_extent_entry_type type;
if (bch_crc_bytes[new.csum_type] <= 4 &&
new.uncompressed_size - 1 <= CRC32_SIZE_MAX &&
new.nonce <= CRC32_NONCE_MAX)
- crc->type = 1 << BCH_EXTENT_ENTRY_crc32;
+ type = BCH_EXTENT_ENTRY_crc32;
else if (bch_crc_bytes[new.csum_type] <= 10 &&
new.uncompressed_size - 1 <= CRC64_SIZE_MAX &&
new.nonce <= CRC64_NONCE_MAX)
- crc->type = 1 << BCH_EXTENT_ENTRY_crc64;
+ type = BCH_EXTENT_ENTRY_crc64;
else if (bch_crc_bytes[new.csum_type] <= 16 &&
new.uncompressed_size - 1 <= CRC128_SIZE_MAX &&
new.nonce <= CRC128_NONCE_MAX)
- crc->type = 1 << BCH_EXTENT_ENTRY_crc128;
+ type = BCH_EXTENT_ENTRY_crc128;
else
BUG();
- bch2_extent_crc_pack(crc, new);
+ bch2_extent_crc_pack(crc, new, type);
k->k.u64s += extent_entry_u64s(ptrs.end);
@@ -1645,7 +1645,8 @@ enum merge_result bch2_extent_merge(struct bch_fs *c,
crc_l.uncompressed_size += crc_r.uncompressed_size;
crc_l.compressed_size += crc_r.compressed_size;
- bch2_extent_crc_pack(entry_to_crc(en_l), crc_l);
+ bch2_extent_crc_pack(entry_to_crc(en_l), crc_l,
+ extent_entry_type(en_l));
}
bch2_key_resize(l.k, l.k->size + r.k->size);
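
bch2_extent_crc_append() picks the smallest crc entry variant whose fields can hold the checksum, sizes, and nonce, and now passes that choice into bch2_extent_crc_pack() rather than having the packer re-read the type from the not-yet-initialized destination. The selection pattern, sketched with made-up field limits (the real CRC32_SIZE_MAX etc. differ):

    #include <stdio.h>

    enum variant { VAR32, VAR64, VAR128 };

    #define SIZE32_MAX ((1U << 7) - 1)
    #define SIZE64_MAX ((1U << 9) - 1)

    /* choose the smallest on-disk encoding whose fields fit */
    static enum variant pick_variant(unsigned csum_bytes, unsigned size)
    {
            if (csum_bytes <= 4 && size - 1 <= SIZE32_MAX)
                    return VAR32;
            if (csum_bytes <= 10 && size - 1 <= SIZE64_MAX)
                    return VAR64;
            return VAR128;
    }

    int main(void)
    {
            /* a 4-byte checksum over a small extent packs into the 32-bit form */
            printf("%d\n", pick_variant(4, 100)); /* 0 (VAR32) */
            return 0;
    }
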
diff --git a/libbcachefs/fs-io.c b/libbcachefs/fs-io.c
index 90a9bfa4..fd6eb00e 100644
--- a/libbcachefs/fs-io.c
+++ b/libbcachefs/fs-io.c
@@ -507,12 +507,25 @@ static void bch2_set_page_dirty(struct bch_fs *c,
__set_page_dirty_nobuffers(page);
}
+vm_fault_t bch2_page_fault(struct vm_fault *vmf)
+{
+ struct file *file = vmf->vma->vm_file;
+ struct bch_inode_info *inode = file_bch_inode(file);
+ int ret;
+
+ bch2_pagecache_add_get(&inode->ei_pagecache_lock);
+ ret = filemap_fault(vmf);
+ bch2_pagecache_add_put(&inode->ei_pagecache_lock);
+
+ return ret;
+}
+
vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf)
{
struct page *page = vmf->page;
struct file *file = vmf->vma->vm_file;
struct bch_inode_info *inode = file_bch_inode(file);
- struct address_space *mapping = inode->v.i_mapping;
+ struct address_space *mapping = file->f_mapping;
struct bch_fs *c = inode->v.i_sb->s_fs_info;
struct bch2_page_reservation res;
unsigned len;
@@ -530,8 +543,7 @@ vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf)
* a write_invalidate_inode_pages_range() that works without dropping
* page lock before invalidating page
*/
- if (current->pagecache_lock != &mapping->add_lock)
- pagecache_add_get(&mapping->add_lock);
+ bch2_pagecache_add_get(&inode->ei_pagecache_lock);
lock_page(page);
isize = i_size_read(&inode->v);
@@ -551,14 +563,13 @@ vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf)
}
bch2_set_page_dirty(c, inode, page, &res, 0, len);
+ bch2_page_reservation_put(c, inode, &res);
+
wait_for_stable_page(page);
out:
- if (current->pagecache_lock != &mapping->add_lock)
- pagecache_add_put(&mapping->add_lock);
+ bch2_pagecache_add_put(&inode->ei_pagecache_lock);
sb_end_pagefault(inode->v.i_sb);
- bch2_page_reservation_put(c, inode, &res);
-
return ret;
}
@@ -888,8 +899,7 @@ int bch2_readpages(struct file *file, struct address_space *mapping,
iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, POS_MIN,
BTREE_ITER_SLOTS);
- if (current->pagecache_lock != &mapping->add_lock)
- pagecache_add_get(&mapping->add_lock);
+ bch2_pagecache_add_get(&inode->ei_pagecache_lock);
while ((page = readpage_iter_next(&readpages_iter))) {
pgoff_t index = readpages_iter.offset + readpages_iter.idx;
@@ -912,8 +922,7 @@ int bch2_readpages(struct file *file, struct address_space *mapping,
&readpages_iter);
}
- if (current->pagecache_lock != &mapping->add_lock)
- pagecache_add_put(&mapping->add_lock);
+ bch2_pagecache_add_put(&inode->ei_pagecache_lock);
bch2_trans_exit(&trans);
kfree(readpages_iter.pages);
@@ -1294,8 +1303,7 @@ int bch2_write_begin(struct file *file, struct address_space *mapping,
bch2_page_reservation_init(c, inode, res);
*fsdata = res;
- /* Not strictly necessary - same reason as mkwrite(): */
- pagecache_add_get(&mapping->add_lock);
+ bch2_pagecache_add_get(&inode->ei_pagecache_lock);
page = grab_cache_page_write_begin(mapping, index, flags);
if (!page)
@@ -1347,7 +1355,7 @@ err:
put_page(page);
*pagep = NULL;
err_unlock:
- pagecache_add_put(&mapping->add_lock);
+ bch2_pagecache_add_put(&inode->ei_pagecache_lock);
kfree(res);
*fsdata = NULL;
return ret;
@@ -1391,7 +1399,7 @@ int bch2_write_end(struct file *file, struct address_space *mapping,
unlock_page(page);
put_page(page);
- pagecache_add_put(&mapping->add_lock);
+ bch2_pagecache_add_put(&inode->ei_pagecache_lock);
bch2_page_reservation_put(c, inode, res);
kfree(res);
@@ -1549,7 +1557,7 @@ static ssize_t bch2_buffered_write(struct kiocb *iocb, struct iov_iter *iter)
ssize_t written = 0;
int ret = 0;
- pagecache_add_get(&mapping->add_lock);
+ bch2_pagecache_add_get(&inode->ei_pagecache_lock);
do {
unsigned offset = pos & (PAGE_SIZE - 1);
@@ -1606,7 +1614,7 @@ again:
balance_dirty_pages_ratelimited(mapping);
} while (iov_iter_count(iter));
- pagecache_add_put(&mapping->add_lock);
+ bch2_pagecache_add_put(&inode->ei_pagecache_lock);
return written ? written : ret;
}
@@ -1730,6 +1738,43 @@ start:
}
}
+ssize_t bch2_read_iter(struct kiocb *iocb, struct iov_iter *iter)
+{
+ struct file *file = iocb->ki_filp;
+ struct bch_inode_info *inode = file_bch_inode(file);
+ struct address_space *mapping = file->f_mapping;
+ size_t count = iov_iter_count(iter);
+ ssize_t ret;
+
+ if (!count)
+ return 0; /* skip atime */
+
+ if (iocb->ki_flags & IOCB_DIRECT) {
+ struct blk_plug plug;
+
+ ret = filemap_write_and_wait_range(mapping,
+ iocb->ki_pos,
+ iocb->ki_pos + count - 1);
+ if (ret < 0)
+ return ret;
+
+ file_accessed(file);
+
+ blk_start_plug(&plug);
+ ret = bch2_direct_IO_read(iocb, iter);
+ blk_finish_plug(&plug);
+
+ if (ret >= 0)
+ iocb->ki_pos += ret;
+ } else {
+ bch2_pagecache_add_get(&inode->ei_pagecache_lock);
+ ret = generic_file_read_iter(iocb, iter);
+ bch2_pagecache_add_put(&inode->ei_pagecache_lock);
+ }
+
+ return ret;
+}
+
/* O_DIRECT writes */
static long bch2_dio_write_loop(struct dio_write *dio)
@@ -1744,34 +1789,23 @@ static long bch2_dio_write_loop(struct dio_write *dio)
struct bio_vec *bv;
unsigned unaligned;
u64 new_i_size;
- loff_t offset;
bool sync;
long ret;
if (dio->loop)
goto loop;
- /* Write and invalidate pagecache range that we're writing to: */
- offset = req->ki_pos + (dio->op.written << 9);
- ret = write_invalidate_inode_pages_range(mapping,
- offset,
- offset + iov_iter_count(&dio->iter) - 1);
- if (unlikely(ret))
- goto err;
-
while (1) {
- offset = req->ki_pos + (dio->op.written << 9);
-
- BUG_ON(current->pagecache_lock);
- current->pagecache_lock = &mapping->add_lock;
if (kthread)
use_mm(dio->mm);
+ BUG_ON(current->faults_disabled_mapping);
+ current->faults_disabled_mapping = mapping;
ret = bio_iov_iter_get_pages(bio, &dio->iter);
+ current->faults_disabled_mapping = NULL;
if (kthread)
unuse_mm(dio->mm);
- current->pagecache_lock = NULL;
if (unlikely(ret < 0))
goto err;
@@ -1791,14 +1825,8 @@ static long bch2_dio_write_loop(struct dio_write *dio)
goto err;
}
- /* gup might have faulted pages back in: */
- ret = write_invalidate_inode_pages_range(mapping,
- offset,
- offset + bio->bi_iter.bi_size - 1);
- if (unlikely(ret))
- goto err;
-
- dio->op.pos = POS(inode->v.i_ino, offset >> 9);
+ dio->op.pos = POS(inode->v.i_ino,
+ (req->ki_pos >> 9) + dio->op.written);
task_io_account_write(bio->bi_iter.bi_size);
@@ -1850,7 +1878,7 @@ loop:
ret = dio->op.error ?: ((long) dio->op.written << 9);
err:
- __pagecache_block_put(&mapping->add_lock);
+ bch2_pagecache_block_put(&inode->ei_pagecache_lock);
bch2_disk_reservation_put(c, &dio->op.res);
bch2_quota_reservation_put(c, inode, &dio->quota_res);
@@ -1916,7 +1944,7 @@ ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter)
goto err;
inode_dio_begin(&inode->v);
- __pagecache_block_get(&mapping->add_lock);
+ bch2_pagecache_block_get(&inode->ei_pagecache_lock);
extending = req->ki_pos + iter->count > inode->v.i_size;
if (!extending) {
@@ -1964,6 +1992,12 @@ ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter)
dio->op.opts.data_replicas))
goto err_put_bio;
+ ret = write_invalidate_inode_pages_range(mapping,
+ req->ki_pos,
+ req->ki_pos + iter->count - 1);
+ if (unlikely(ret))
+ goto err_put_bio;
+
ret = bch2_dio_write_loop(dio);
err:
if (locked)
@@ -1972,7 +2006,7 @@ err:
req->ki_pos += ret;
return ret;
err_put_bio:
- __pagecache_block_put(&mapping->add_lock);
+ bch2_pagecache_block_put(&inode->ei_pagecache_lock);
bch2_disk_reservation_put(c, &dio->op.res);
bch2_quota_reservation_put(c, inode, &dio->quota_res);
bio_put(bio);
@@ -1980,21 +2014,6 @@ err_put_bio:
goto err;
}
-ssize_t bch2_direct_IO(struct kiocb *req, struct iov_iter *iter)
-{
- struct blk_plug plug;
- ssize_t ret;
-
- if (iov_iter_rw(iter) == WRITE)
- return -EINVAL;
-
- blk_start_plug(&plug);
- ret = bch2_direct_IO_read(req, iter);
- blk_finish_plug(&plug);
-
- return ret;
-}
-
ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
@@ -2236,7 +2255,7 @@ int bch2_truncate(struct bch_inode_info *inode, struct iattr *iattr)
int ret = 0;
inode_dio_wait(&inode->v);
- pagecache_block_get(&mapping->add_lock);
+ bch2_pagecache_block_get(&inode->ei_pagecache_lock);
/*
* fetch current on disk i_size: inode is locked, i_size can only
@@ -2307,7 +2326,7 @@ int bch2_truncate(struct bch_inode_info *inode, struct iattr *iattr)
ATTR_MTIME|ATTR_CTIME);
mutex_unlock(&inode->ei_update_lock);
err:
- pagecache_block_put(&mapping->add_lock);
+ bch2_pagecache_block_put(&inode->ei_pagecache_lock);
return ret;
}
@@ -2316,14 +2335,13 @@ err:
static long bchfs_fpunch(struct bch_inode_info *inode, loff_t offset, loff_t len)
{
struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct address_space *mapping = inode->v.i_mapping;
u64 discard_start = round_up(offset, block_bytes(c)) >> 9;
u64 discard_end = round_down(offset + len, block_bytes(c)) >> 9;
int ret = 0;
inode_lock(&inode->v);
inode_dio_wait(&inode->v);
- pagecache_block_get(&mapping->add_lock);
+ bch2_pagecache_block_get(&inode->ei_pagecache_lock);
ret = __bch2_truncate_page(inode,
offset >> PAGE_SHIFT,
@@ -2352,7 +2370,7 @@ static long bchfs_fpunch(struct bch_inode_info *inode, loff_t offset, loff_t len
i_sectors_acct(c, inode, NULL, i_sectors_delta);
}
err:
- pagecache_block_put(&mapping->add_lock);
+ bch2_pagecache_block_put(&inode->ei_pagecache_lock);
inode_unlock(&inode->v);
return ret;
@@ -2383,7 +2401,7 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode,
*/
inode_lock(&inode->v);
inode_dio_wait(&inode->v);
- pagecache_block_get(&mapping->add_lock);
+ bch2_pagecache_block_get(&inode->ei_pagecache_lock);
if (insert) {
ret = -EFBIG;
@@ -2570,7 +2588,7 @@ bkey_err:
}
err:
bch2_trans_exit(&trans);
- pagecache_block_put(&mapping->add_lock);
+ bch2_pagecache_block_put(&inode->ei_pagecache_lock);
inode_unlock(&inode->v);
return ret;
}
@@ -2594,7 +2612,7 @@ static long bchfs_fallocate(struct bch_inode_info *inode, int mode,
inode_lock(&inode->v);
inode_dio_wait(&inode->v);
- pagecache_block_get(&mapping->add_lock);
+ bch2_pagecache_block_get(&inode->ei_pagecache_lock);
if (!(mode & FALLOC_FL_KEEP_SIZE) && end > inode->v.i_size) {
ret = inode_newsize_ok(&inode->v, end);
@@ -2737,7 +2755,7 @@ bkey_err:
}
err:
bch2_trans_exit(&trans);
- pagecache_block_put(&mapping->add_lock);
+ bch2_pagecache_block_put(&inode->ei_pagecache_lock);
inode_unlock(&inode->v);
return ret;
}
@@ -2813,8 +2831,8 @@ loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src,
struct bch_inode_info *dst = file_bch_inode(file_dst);
struct bch_fs *c = src->v.i_sb->s_fs_info;
s64 i_sectors_delta = 0;
+ u64 aligned_len;
loff_t ret = 0;
- loff_t aligned_len;
if (remap_flags & ~(REMAP_FILE_DEDUP|REMAP_FILE_ADVISORY))
return -EINVAL;
@@ -2830,26 +2848,23 @@ loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src,
abs(pos_src - pos_dst) < len)
return -EINVAL;
- bch2_lock_inodes(INODE_LOCK, src, dst);
+ bch2_lock_inodes(INODE_LOCK|INODE_PAGECACHE_BLOCK, src, dst);
file_update_time(file_dst);
inode_dio_wait(&src->v);
inode_dio_wait(&dst->v);
- __pagecache_block_get(&src->v.i_mapping->add_lock);
- __pagecache_block_get(&dst->v.i_mapping->add_lock);
-
ret = generic_remap_file_range_prep(file_src, pos_src,
file_dst, pos_dst,
&len, remap_flags);
if (ret < 0 || len == 0)
goto err;
- aligned_len = round_up(len, block_bytes(c));
+ aligned_len = round_up((u64) len, block_bytes(c));
ret = write_invalidate_inode_pages_range(dst->v.i_mapping,
- pos_dst, pos_dst + aligned_len);
+ pos_dst, pos_dst + len - 1);
if (ret)
goto err;
@@ -2864,24 +2879,20 @@ loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src,
if (ret < 0)
goto err;
- ret <<= 9;
/*
* due to alignment, we might have remapped slightly more than requested

*/
- ret = min(ret, len);
+ ret = min((u64) ret << 9, (u64) len);
/* XXX get a quota reservation */
i_sectors_acct(c, dst, NULL, i_sectors_delta);
spin_lock(&dst->v.i_lock);
- if (pos_dst + len > dst->v.i_size)
- i_size_write(&dst->v, pos_dst + len);
+ if (pos_dst + ret > dst->v.i_size)
+ i_size_write(&dst->v, pos_dst + ret);
spin_unlock(&dst->v.i_lock);
err:
- __pagecache_block_put(&dst->v.i_mapping->add_lock);
- __pagecache_block_put(&src->v.i_mapping->add_lock);
-
- bch2_unlock_inodes(INODE_LOCK, src, dst);
+ bch2_unlock_inodes(INODE_LOCK|INODE_PAGECACHE_BLOCK, src, dst);
return ret;
}
diff --git a/libbcachefs/fs-io.h b/libbcachefs/fs-io.h
index ae171a29..7063556d 100644
--- a/libbcachefs/fs-io.h
+++ b/libbcachefs/fs-io.h
@@ -27,8 +27,7 @@ int bch2_write_begin(struct file *, struct address_space *, loff_t,
int bch2_write_end(struct file *, struct address_space *, loff_t,
unsigned, unsigned, struct page *, void *);
-ssize_t bch2_direct_IO(struct kiocb *, struct iov_iter *);
-
+ssize_t bch2_read_iter(struct kiocb *, struct iov_iter *);
ssize_t bch2_write_iter(struct kiocb *, struct iov_iter *);
int bch2_fsync(struct file *, loff_t, loff_t, int);
@@ -41,6 +40,7 @@ loff_t bch2_remap_file_range(struct file *, loff_t, struct file *,
loff_t bch2_llseek(struct file *, loff_t, int);
+vm_fault_t bch2_page_fault(struct vm_fault *);
vm_fault_t bch2_page_mkwrite(struct vm_fault *);
void bch2_invalidatepage(struct page *, unsigned int, unsigned int);
int bch2_releasepage(struct page *, gfp_t);
diff --git a/libbcachefs/fs.c b/libbcachefs/fs.c
index f9b3650b..cd3540d0 100644
--- a/libbcachefs/fs.c
+++ b/libbcachefs/fs.c
@@ -49,6 +49,53 @@ static void journal_seq_copy(struct bch_inode_info *dst,
} while ((v = cmpxchg(&dst->ei_journal_seq, old, journal_seq)) != old);
}
+static void __pagecache_lock_put(struct pagecache_lock *lock, long i)
+{
+ BUG_ON(atomic_long_read(&lock->v) == 0);
+
+ if (atomic_long_sub_return_release(i, &lock->v) == 0)
+ wake_up_all(&lock->wait);
+}
+
+static bool __pagecache_lock_tryget(struct pagecache_lock *lock, long i)
+{
+ long v = atomic_long_read(&lock->v), old;
+
+ do {
+ old = v;
+
+ if (i > 0 ? v < 0 : v > 0)
+ return false;
+ } while ((v = atomic_long_cmpxchg_acquire(&lock->v,
+ old, old + i)) != old);
+ return true;
+}
+
+static void __pagecache_lock_get(struct pagecache_lock *lock, long i)
+{
+ wait_event(lock->wait, __pagecache_lock_tryget(lock, i));
+}
+
+void bch2_pagecache_add_put(struct pagecache_lock *lock)
+{
+ __pagecache_lock_put(lock, 1);
+}
+
+void bch2_pagecache_add_get(struct pagecache_lock *lock)
+{
+ __pagecache_lock_get(lock, 1);
+}
+
+void bch2_pagecache_block_put(struct pagecache_lock *lock)
+{
+ __pagecache_lock_put(lock, -1);
+}
+
+void bch2_pagecache_block_get(struct pagecache_lock *lock)
+{
+ __pagecache_lock_get(lock, -1);
+}
+
void bch2_inode_update_after_write(struct bch_fs *c,
struct bch_inode_info *inode,
struct bch_inode_unpacked *bi,
@@ -706,10 +753,15 @@ static int bch2_getattr(const struct path *path, struct kstat *stat,
if (inode->ei_inode.bi_flags & BCH_INODE_IMMUTABLE)
stat->attributes |= STATX_ATTR_IMMUTABLE;
+ stat->attributes_mask |= STATX_ATTR_IMMUTABLE;
+
if (inode->ei_inode.bi_flags & BCH_INODE_APPEND)
stat->attributes |= STATX_ATTR_APPEND;
+ stat->attributes_mask |= STATX_ATTR_APPEND;
+
if (inode->ei_inode.bi_flags & BCH_INODE_NODUMP)
stat->attributes |= STATX_ATTR_NODUMP;
+ stat->attributes_mask |= STATX_ATTR_NODUMP;
return 0;
}
@@ -872,7 +924,7 @@ retry:
}
static const struct vm_operations_struct bch_vm_ops = {
- .fault = filemap_fault,
+ .fault = bch2_page_fault,
.map_pages = filemap_map_pages,
.page_mkwrite = bch2_page_mkwrite,
};
@@ -906,7 +958,7 @@ static int bch2_vfs_readdir(struct file *file, struct dir_context *ctx)
static const struct file_operations bch_file_operations = {
.llseek = bch2_llseek,
- .read_iter = generic_file_read_iter,
+ .read_iter = bch2_read_iter,
.write_iter = bch2_write_iter,
.mmap = bch2_mmap,
.open = generic_file_open,
@@ -994,7 +1046,7 @@ static const struct address_space_operations bch_address_space_operations = {
.write_end = bch2_write_end,
.invalidatepage = bch2_invalidatepage,
.releasepage = bch2_releasepage,
- .direct_IO = bch2_direct_IO,
+ .direct_IO = noop_direct_IO,
#ifdef CONFIG_MIGRATION
.migratepage = bch2_migrate_page,
#endif
@@ -1090,6 +1142,7 @@ static struct inode *bch2_alloc_inode(struct super_block *sb)
inode_init_once(&inode->v);
mutex_init(&inode->ei_update_lock);
+ pagecache_lock_init(&inode->ei_pagecache_lock);
mutex_init(&inode->ei_quota_lock);
inode->ei_journal_seq = 0;
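
The pagecache_lock added above is a two-state shared lock: a positive count means "add" holders (page-cache inserts allowed), a negative count means "block" holders (inserts excluded); holders of the same state share, opposite states conflict. A userspace analogue with C11 atomics, busy-waiting here where the kernel version sleeps on lock->wait:

    #include <sched.h>
    #include <stdatomic.h>
    #include <stdbool.h>

    struct pagecache_lock { atomic_long v; };

    /* i = +1 for "add", -1 for "block"; same sign shares, opposite conflicts */
    static bool pagecache_lock_tryget(struct pagecache_lock *l, long i)
    {
            long v = atomic_load(&l->v);

            do {
                    if (i > 0 ? v < 0 : v > 0)
                            return false;
            } while (!atomic_compare_exchange_weak(&l->v, &v, v + i));

            return true;
    }

    static void pagecache_lock_get(struct pagecache_lock *l, long i)
    {
            while (!pagecache_lock_tryget(l, i))
                    sched_yield(); /* kernel version waits on l->wait */
    }

    static void pagecache_lock_put(struct pagecache_lock *l, long i)
    {
            atomic_fetch_sub(&l->v, i); /* kernel version wakes waiters at 0 */
    }

    static void add_get(struct pagecache_lock *l)   { pagecache_lock_get(l, +1); }
    static void add_put(struct pagecache_lock *l)   { pagecache_lock_put(l, +1); }
    static void block_get(struct pagecache_lock *l) { pagecache_lock_get(l, -1); }
    static void block_put(struct pagecache_lock *l) { pagecache_lock_put(l, -1); }

    int main(void)
    {
            struct pagecache_lock l = { 0 };

            add_get(&l);
            add_get(&l);    /* holders of the same state share */
            add_put(&l);
            add_put(&l);
            block_get(&l);  /* would have conflicted with "add" */
            block_put(&l);
            return 0;
    }
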
diff --git a/libbcachefs/fs.h b/libbcachefs/fs.h
index 40605666..eda903a4 100644
--- a/libbcachefs/fs.h
+++ b/libbcachefs/fs.h
@@ -10,6 +10,26 @@
#include <linux/seqlock.h>
#include <linux/stat.h>
+/*
+ * Two-state lock - can be taken for add or block - both states are shared,
+ * like read side of rwsem, but conflict with other state:
+ */
+struct pagecache_lock {
+ atomic_long_t v;
+ wait_queue_head_t wait;
+};
+
+static inline void pagecache_lock_init(struct pagecache_lock *lock)
+{
+ atomic_long_set(&lock->v, 0);
+ init_waitqueue_head(&lock->wait);
+}
+
+void bch2_pagecache_add_put(struct pagecache_lock *);
+void bch2_pagecache_add_get(struct pagecache_lock *);
+void bch2_pagecache_block_put(struct pagecache_lock *);
+void bch2_pagecache_block_get(struct pagecache_lock *);
+
struct bch_inode_info {
struct inode v;
@@ -18,6 +38,8 @@ struct bch_inode_info {
u64 ei_quota_reserved;
unsigned long ei_last_dirtied;
+ struct pagecache_lock ei_pagecache_lock;
+
struct mutex ei_quota_lock;
struct bch_qid ei_qid;
@@ -37,7 +59,8 @@ static inline int ptrcmp(void *l, void *r)
enum bch_inode_lock_op {
INODE_LOCK = (1U << 0),
- INODE_UPDATE_LOCK = (1U << 1),
+ INODE_PAGECACHE_BLOCK = (1U << 1),
+ INODE_UPDATE_LOCK = (1U << 2),
};
#define bch2_lock_inodes(_locks, ...) \
@@ -49,9 +72,11 @@ do { \
\
for (i = 1; i < ARRAY_SIZE(a); i++) \
if (a[i] != a[i - 1]) { \
- if (_locks & INODE_LOCK) \
+ if ((_locks) & INODE_LOCK) \
down_write_nested(&a[i]->v.i_rwsem, i); \
- if (_locks & INODE_UPDATE_LOCK) \
+ if ((_locks) & INODE_PAGECACHE_BLOCK) \
+ bch2_pagecache_block_get(&a[i]->ei_pagecache_lock);\
+ if ((_locks) & INODE_UPDATE_LOCK) \
mutex_lock_nested(&a[i]->ei_update_lock, i);\
} \
} while (0)
@@ -65,9 +90,11 @@ do { \
\
for (i = 1; i < ARRAY_SIZE(a); i++) \
if (a[i] != a[i - 1]) { \
- if (_locks & INODE_LOCK) \
+ if ((_locks) & INODE_LOCK) \
up_write(&a[i]->v.i_rwsem); \
- if (_locks & INODE_UPDATE_LOCK) \
+ if ((_locks) & INODE_PAGECACHE_BLOCK) \
+ bch2_pagecache_block_put(&a[i]->ei_pagecache_lock);\
+ if ((_locks) & INODE_UPDATE_LOCK) \
mutex_unlock(&a[i]->ei_update_lock); \
} \
} while (0)
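
bch2_lock_inodes() now also takes the pagecache-block state, and both macros rely on the usual deadlock-avoidance rule: the inode array is sorted (ptrcmp above), so every caller acquires the locks in one global order. The idea in miniature, with pthreads:

    #include <pthread.h>
    #include <stdio.h>

    struct inode { pthread_mutex_t lock; };

    /* always lock two inodes in ascending address order */
    static void lock_pair(struct inode *a, struct inode *b)
    {
            if (a > b) { struct inode *t = a; a = b; b = t; }

            pthread_mutex_lock(&a->lock);
            if (b != a)
                    pthread_mutex_lock(&b->lock);
    }

    static void unlock_pair(struct inode *a, struct inode *b)
    {
            if (b != a)
                    pthread_mutex_unlock(&b->lock);
            pthread_mutex_unlock(&a->lock);
    }

    int main(void)
    {
            struct inode x = { PTHREAD_MUTEX_INITIALIZER };
            struct inode y = { PTHREAD_MUTEX_INITIALIZER };

            lock_pair(&x, &y);   /* same acquisition order as... */
            unlock_pair(&x, &y);
            lock_pair(&y, &x);   /* ...this call: no ABBA deadlock possible */
            unlock_pair(&y, &x);
            puts("ok");
            return 0;
    }
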
diff --git a/libbcachefs/fsck.c b/libbcachefs/fsck.c
index 3cced2b9..0f2308e5 100644
--- a/libbcachefs/fsck.c
+++ b/libbcachefs/fsck.c
@@ -797,7 +797,7 @@ create_lostfound:
bch2_create_trans(&trans,
BCACHEFS_ROOT_INO, root_inode,
lostfound_inode, &lostfound,
- 0, 0, S_IFDIR|0755, 0, NULL, NULL));
+ 0, 0, S_IFDIR|0700, 0, NULL, NULL));
if (ret)
bch_err(c, "error creating lost+found: %i", ret);
diff --git a/libbcachefs/io.c b/libbcachefs/io.c
index 836004b1..e3ef662e 100644
--- a/libbcachefs/io.c
+++ b/libbcachefs/io.c
@@ -1270,7 +1270,6 @@ static void promote_start(struct promote_op *op, struct bch_read_bio *rbio)
closure_return_with_destructor(cl, promote_done);
}
-noinline
static struct promote_op *__promote_alloc(struct bch_fs *c,
enum btree_id btree_id,
struct bpos pos,
@@ -1344,7 +1343,8 @@ err:
return NULL;
}
-static inline struct promote_op *promote_alloc(struct bch_fs *c,
+noinline
+static struct promote_op *promote_alloc(struct bch_fs *c,
struct bvec_iter iter,
struct bkey_s_c k,
struct extent_ptr_decoded *pick,
@@ -1908,7 +1908,7 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig,
if (narrow_crcs && (flags & BCH_READ_USER_MAPPED))
flags |= BCH_READ_MUST_BOUNCE;
- BUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size);
+ EBUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size);
if (pick.crc.compression_type != BCH_COMPRESSION_NONE ||
(pick.crc.csum_type != BCH_CSUM_NONE &&
@@ -1920,8 +1920,9 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig,
bounce = true;
}
- promote = promote_alloc(c, iter, k, &pick, orig->opts, flags,
- &rbio, &bounce, &read_full);
+ if (orig->opts.promote_target)
+ promote = promote_alloc(c, iter, k, &pick, orig->opts, flags,
+ &rbio, &bounce, &read_full);
if (!read_full) {
EBUG_ON(pick.crc.compression_type);
@@ -1949,7 +1950,7 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig,
* data in the write path, but we're not going to use it all
* here:
*/
- BUG_ON(rbio->bio.bi_iter.bi_size <
+ EBUG_ON(rbio->bio.bi_iter.bi_size <
pick.crc.compressed_size << 9);
rbio->bio.bi_iter.bi_size =
pick.crc.compressed_size << 9;
@@ -1982,10 +1983,10 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig,
noclone:
rbio = orig;
rbio->bio.bi_iter = iter;
- BUG_ON(bio_flagged(&rbio->bio, BIO_CHAIN));
+ EBUG_ON(bio_flagged(&rbio->bio, BIO_CHAIN));
}
- BUG_ON(bio_sectors(&rbio->bio) != pick.crc.compressed_size);
+ EBUG_ON(bio_sectors(&rbio->bio) != pick.crc.compressed_size);
rbio->c = c;
rbio->submit_time = local_clock();
@@ -2001,6 +2002,7 @@ noclone:
rbio->hole = 0;
rbio->retry = 0;
rbio->context = 0;
+ /* XXX: only initialize this if needed */
rbio->devs_have = bch2_bkey_devs(k);
rbio->pick = pick;
rbio->pos = pos;
@@ -2017,11 +2019,11 @@ noclone:
bch2_increment_clock(c, bio_sectors(&rbio->bio), READ);
- percpu_down_read(&c->mark_lock);
+ rcu_read_lock();
bucket_io_clock_reset(c, ca, PTR_BUCKET_NR(ca, &pick.ptr), READ);
- percpu_up_read(&c->mark_lock);
+ rcu_read_unlock();
- if (likely(!(flags & (BCH_READ_IN_RETRY|BCH_READ_LAST_FRAGMENT)))) {
+ if (!(flags & (BCH_READ_IN_RETRY|BCH_READ_LAST_FRAGMENT))) {
bio_inc_remaining(&orig->bio);
trace_read_split(&orig->bio);
}
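
promote_alloc() flips from an inline function with an internal check to a noinline one guarded at the call site by the cheap opts.promote_target test, shrinking the hot read path. The pattern in isolation:

    #include <stdio.h>

    #define noinline    __attribute__((noinline))
    #define unlikely(x) __builtin_expect(!!(x), 0)

    static noinline void promote_slowpath(int target)
    {
            /* rare, heavyweight work lives out of line */
            printf("promoting to target %d\n", target);
    }

    static inline void maybe_promote(int promote_target)
    {
            /* the hot path pays only for one predictable branch */
            if (unlikely(promote_target))
                    promote_slowpath(promote_target);
    }

    int main(void)
    {
            maybe_promote(0); /* common case: nothing */
            maybe_promote(3);
            return 0;
    }
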
diff --git a/libbcachefs/opts.h b/libbcachefs/opts.h
index bd2058f1..0ec0999a 100644
--- a/libbcachefs/opts.h
+++ b/libbcachefs/opts.h
@@ -68,6 +68,12 @@ enum opt_type {
* - helptext
*/
+#ifdef __KERNEL__
+#define RATELIMIT_ERRORS true
+#else
+#define RATELIMIT_ERRORS false
+#endif
+
#define BCH_OPTS() \
x(block_size, u16, \
OPT_FORMAT, \
@@ -227,6 +233,11 @@ enum opt_type {
OPT_BOOL(), \
NO_SB_OPT, false, \
NULL, "Fix errors during fsck without asking") \
+ x(ratelimit_errors, u8, \
+ OPT_MOUNT, \
+ OPT_BOOL(), \
+ NO_SB_OPT, RATELIMIT_ERRORS, \
+ NULL, "Ratelimit error messages during fsck") \
x(nochanges, u8, \
OPT_MOUNT, \
OPT_BOOL(), \
diff --git a/libbcachefs/reflink.c b/libbcachefs/reflink.c
index 6d45ae24..6e71c5e8 100644
--- a/libbcachefs/reflink.c
+++ b/libbcachefs/reflink.c
@@ -290,10 +290,12 @@ err:
ret2 = PTR_ERR_OR_ZERO(inode_iter);
if (!ret2 &&
- inode_u.bi_size < new_i_size)
+ inode_u.bi_size < new_i_size) {
+ inode_u.bi_size = new_i_size;
ret2 = bch2_inode_write(&trans, inode_iter, &inode_u) ?:
bch2_trans_commit(&trans, NULL, journal_seq,
BTREE_INSERT_ATOMIC);
+ }
} while (ret2 == -EINTR);
ret = bch2_trans_exit(&trans) ?: ret;