1 // SPDX-License-Identifier: GPL-2.0
4 #include "btree_cache.h"
6 #include "btree_journal_iter.h"
7 #include "btree_node_scan.h"
8 #include "btree_update_interior.h"
11 #include "journal_io.h"
12 #include "recovery_passes.h"
14 #include <linux/kthread.h>
15 #include <linux/min_heap.h>
16 #include <linux/sort.h>
/*
 * Per-worker context handed to read_btree_nodes_worker() as its thread
 * argument; read_btree_nodes() allocates one for each device it scans.
 * NOTE(review): only the f member is visible in this listing; the worker
 * also reads ->ca from this structure.
 */
18 struct find_btree_nodes_worker {
/* Shared scan state; users recover the owning bch_fs from it with container_of(). */
20 struct find_btree_nodes *f;
/*
 * Append a textual description of one found btree node to @out: btree id
 * and level, seq / journal_seq / cookie, the min and max key positions, a
 * " range updated" marker, and each of the node's nr_ptrs extent pointers.
 */
24 static void found_btree_node_to_text(struct printbuf *out, struct bch_fs *c, const struct found_btree_node *n)
26 bch2_btree_id_level_to_text(out, n->btree_id, n->level);
27 prt_printf(out, " seq=%u journal_seq=%llu cookie=%llx ",
28 n->seq, n->journal_seq, n->cookie);
29 bch2_bpos_to_text(out, n->min_key);
31 bch2_bpos_to_text(out, n->max_key);
/* NOTE(review): the condition guarding this marker is not visible here; presumably n->range_updated - confirm. */
34 prt_str(out, " range updated");
/* One entry per pointer recorded for this node. */
36 for (unsigned i = 0; i < n->nr_ptrs; i++) {
38 bch2_extent_ptr_to_text(out, c, n->ptrs + i);
/*
 * Print every node in @nodes to @out via found_btree_node_to_text(), with the
 * printbuf indentation raised by two columns for the duration of the list.
 */
42 static void found_btree_nodes_to_text(struct printbuf *out, struct bch_fs *c, found_btree_nodes nodes)
44 printbuf_indent_add(out, 2);
45 darray_for_each(nodes, i) {
46 found_btree_node_to_text(out, c, i);
/* Undo the indentation added above. */
49 printbuf_indent_sub(out, 2);
/*
 * Build a btree_ptr_v2 key in @k that describes the found node @f.
 *
 * @k must have room for the btree_ptr_v2 header plus f->nr_ptrs pointers;
 * callers in this file use a buffer padded to BKEY_BTREE_PTR_VAL_U64s_MAX.
 */
52 static void found_btree_node_to_key(struct bkey_i *k, const struct found_btree_node *f)
54 struct bkey_i_btree_ptr_v2 *bp = bkey_btree_ptr_v2_init(k);
/* Value size in u64s: fixed header plus one u64 per pointer (assumes a bch_extent_ptr is 8 bytes - TODO confirm). */
56 set_bkey_val_u64s(&bp->k, sizeof(struct bch_btree_ptr_v2) / sizeof(u64) + f->nr_ptrs);
/* The key's seq field carries the node cookie, not found_btree_node.seq. */
58 bp->v.seq = cpu_to_le64(f->cookie);
/* NOTE(review): this zero store is overwritten by the assignment that follows. */
59 bp->v.sectors_written = 0;
61 bp->v.sectors_written = cpu_to_le16(f->sectors_written);
62 bp->v.min_key = f->min_key;
63 SET_BTREE_PTR_RANGE_UPDATED(&bp->v, f->range_updated);
/* Pointers are copied verbatim after the fixed part of the value. */
64 memcpy(bp->v.start, f->ptrs, sizeof(struct bch_extent_ptr) * f->nr_ptrs);
/*
 * Return the journal sequence number embedded in key @k, for key types that
 * carry one. Only the inode_v3 case is visible in this listing.
 * NOTE(review): the result for other key types is not shown here - confirm.
 */
67 static inline u64 bkey_journal_seq(struct bkey_s_c k)
70 case KEY_TYPE_inode_v3:
71 return le64_to_cpu(bkey_s_c_to_inode_v3(k).v->bi_journal_seq);
/*
 * Check whether the candidate node @f can be loaded through the normal btree
 * node read path, and on success fill in details learned from the node.
 *
 * A temporary btree_ptr_v2 key is built from @f and passed to
 * bch2_btree_node_get_noiter(); the result is true when that returns neither
 * NULL nor an error pointer. From the loaded node, f->sectors_written is taken
 * from b->written and f->journal_seq becomes the largest of the bset journal
 * seq and the per-key journal seqs reported by bkey_journal_seq().
 *
 * After the read lock is dropped, the node is evicted from the cache unless
 * it is the current btree root, so that a later read sees any updated range.
 */
77 static bool found_btree_node_is_readable(struct btree_trans *trans,
78 struct found_btree_node *f)
/* Stack buffer large enough for the largest btree pointer value. */
80 struct { __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX); } tmp;
82 found_btree_node_to_key(&tmp.k, f);
84 struct btree *b = bch2_btree_node_get_noiter(trans, &tmp.k, f->btree_id, f->level, false);
85 bool ret = !IS_ERR_OR_NULL(b);
89 f->sectors_written = b->written;
90 f->journal_seq = le64_to_cpu(b->data->keys.journal_seq);
94 struct btree_node_iter iter;
95 for_each_btree_node_key_unpack(b, k, &iter, &unpacked)
96 f->journal_seq = max(f->journal_seq, bkey_journal_seq(k));
98 six_unlock_read(&b->c.lock);
101 * We might update this node's range; if that happens, we need the node
102 * to be re-read so the read path can trim keys that are no longer in
105 if (b != btree_node_root(trans->c, b))
106 bch2_btree_node_evict(trans, &tmp.k);
/*
 * sort() comparator ordering found nodes by btree id, then level, then
 * cookie. bch2_scan_for_btree_nodes() sorts with it so that entries sharing
 * a cookie end up adjacent and can be merged into one multi-pointer entry.
 */
110 static int found_btree_node_cmp_cookie(const void *_l, const void *_r)
112 const struct found_btree_node *l = _l;
113 const struct found_btree_node *r = _r;
/* Each term is consulted only when the previous ones compare equal. */
115 return cmp_int(l->btree_id, r->btree_id) ?:
116 cmp_int(l->level, r->level) ?:
117 cmp_int(l->cookie, r->cookie);
121 * Given two found btree nodes, if their sequence numbers are equal, take the
122 * one that's readable:
/*
 * Age comparison for two found nodes: compares seq first and uses
 * journal_seq as the tie-break. A positive result means @l has the larger
 * (seq, journal_seq) pair.
 */
124 static int found_btree_node_cmp_time(const struct found_btree_node *l,
125 const struct found_btree_node *r)
127 return cmp_int(l->seq, r->seq) ?:
128 cmp_int(l->journal_seq, r->journal_seq);
/*
 * Positional comparator: btree id ascending, level descending (the level
 * comparison is negated), min_key ascending, and finally the negated
 * found_btree_node_cmp_time() result so that, among nodes starting at the
 * same position, the one with the larger seq / journal_seq sorts first.
 */
131 static int found_btree_node_cmp_pos(const void *_l, const void *_r)
133 const struct found_btree_node *l = _l;
134 const struct found_btree_node *r = _r;
136 return cmp_int(l->btree_id, r->btree_id) ?:
137 -cmp_int(l->level, r->level) ?:
138 bpos_cmp(l->min_key, r->min_key) ?:
139 -found_btree_node_cmp_time(l, r);
/*
 * min_heap "less" callback: true when @l sorts strictly before @r according
 * to found_btree_node_cmp_pos(). @arg is unused.
 */
142 static inline bool found_btree_node_cmp_pos_less(const void *l, const void *r, void *arg)
144 return found_btree_node_cmp_pos(l, r) < 0;
/*
 * min_heap swap callback for found_btree_node elements; installed as .swp in
 * found_btree_node_heap_cbs. @arg is unused in the visible lines.
 * NOTE(review): the statement that performs the exchange is not visible in
 * this listing; presumably it swaps *l and *r - confirm.
 */
147 static inline void found_btree_node_swap(void *_l, void *_r, void *arg)
149 struct found_btree_node *l = _l;
150 struct found_btree_node *r = _r;
/*
 * Callback table passed to the min_heap helpers in this file; it orders the
 * heap by found_btree_node_cmp_pos().
 */
155 static const struct min_heap_callbacks found_btree_node_heap_cbs = {
156 .less = found_btree_node_cmp_pos_less,
157 .swp = found_btree_node_swap,
/*
 * Probe sector @offset on device @ca for a btree node header and, if the
 * node turns out to be readable, append a found_btree_node describing it to
 * f->nodes.
 *
 * @bio is supplied by the caller and reset for each probe; @bn is the
 * caller's buffer that receives the PAGE_SIZE bytes mapped for the read.
 * NOTE(review): the statements executed when one of the checks below fails
 * are not visible in this listing.
 */
160 static void try_read_btree_node(struct find_btree_nodes *f, struct bch_dev *ca,
161 struct bio *bio, struct btree_node *bn, u64 offset)
/* find_btree_nodes is embedded in bch_fs as the found_btree_nodes member. */
163 struct bch_fs *c = container_of(f, struct bch_fs, found_btree_nodes);
165 bio_reset(bio, ca->disk_sb.bdev, REQ_OP_READ);
166 bio->bi_iter.bi_sector = offset;
167 bch2_bio_map(bio, bn, PAGE_SIZE);
/* Synchronous read; bi_status is examined once it completes. */
169 submit_bio_wait(bio);
170 if (bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_read,
171 "IO error in try_read_btree_node() at %llu: %s",
172 offset, bch2_blk_status_to_str(bio->bi_status)))
/* The on-disk magic is compared with this filesystem's bset magic. */
175 if (le64_to_cpu(bn->magic) != bset_magic(c))
/* Encrypted nodes: the header bytes from ->flags up to ->keys are passed through bch2_encrypt() in place. */
178 if (bch2_csum_type_is_encryption(BSET_CSUM_TYPE(&bn->keys))) {
182 struct nonce nonce = btree_nonce(&bn->keys, 0);
183 unsigned bytes = (void *) &bn->keys - (void *) &bn->flags;
185 bch2_encrypt(c, BSET_CSUM_TYPE(&bn->keys), nonce, &bn->flags, bytes);
/* Header checks: allocation btrees, then bounds on level and btree id. */
188 if (btree_id_is_alloc(BTREE_NODE_ID(bn)))
191 if (BTREE_NODE_LEVEL(bn) >= BTREE_MAX_DEPTH)
194 if (BTREE_NODE_ID(bn) >= BTREE_ID_NR_MAX)
/* Candidate entry built from the header, with one pointer for the location just read. */
198 struct found_btree_node n = {
199 .btree_id = BTREE_NODE_ID(bn),
200 .level = BTREE_NODE_LEVEL(bn),
201 .seq = BTREE_NODE_SEQ(bn),
202 .cookie = le64_to_cpu(bn->keys.seq),
203 .min_key = bn->min_key,
204 .max_key = bn->max_key,
206 .ptrs[0].type = 1 << BCH_EXTENT_ENTRY_ptr,
207 .ptrs[0].offset = offset,
208 .ptrs[0].dev = ca->dev_idx,
/* Generation looked up for the bucket that contains @offset. */
209 .ptrs[0].gen = bucket_gen_get(ca, sector_to_bucket(ca, offset)),
/* Only nodes that the regular read path can load are considered further. */
213 if (bch2_trans_run(c, found_btree_node_is_readable(trans, &n))) {
/* f->nodes is shared between the per-device workers, hence the lock. */
214 mutex_lock(&f->lock);
215 if (BSET_BIG_ENDIAN(&bn->keys) != CPU_BIG_ENDIAN) {
216 bch_err(c, "try_read_btree_node() can't handle endian conversion");
/* darray_push() returns nonzero on failure; the handling is not visible here. */
221 if (darray_push(&f->nodes, n))
224 mutex_unlock(&f->lock);
/*
 * Kthread body that scans one device for btree nodes.
 *
 * @p is the struct find_btree_nodes_worker set up by read_btree_nodes().
 * Every btree-node-sized slot of every bucket from mi.first_bucket up to
 * mi.nbuckets is passed to try_read_btree_node(), using one page buffer and
 * one single-vec bio allocated here and reused for all reads. Progress is
 * logged at most once every 30 seconds.
 *
 * read_btree_nodes() takes a reference on ca->io_ref before starting this
 * thread; that reference is released here during teardown.
 */
228 static int read_btree_nodes_worker(void *p)
230 struct find_btree_nodes_worker *w = p;
231 struct bch_fs *c = container_of(w->f, struct bch_fs, found_btree_nodes);
232 struct bch_dev *ca = w->ca;
233 void *buf = (void *) __get_free_page(GFP_KERNEL);
234 struct bio *bio = bio_alloc(NULL, 1, 0, GFP_KERNEL);
235 unsigned long last_print = jiffies;
/* Reported when the page or the bio could not be allocated. */
238 bch_err(c, "read_btree_nodes_worker: error allocating bio/buf");
/* Outer loop: buckets; inner loop: btree-node-sized slots inside a bucket. */
243 for (u64 bucket = ca->mi.first_bucket; bucket < ca->mi.nbuckets; bucket++)
244 for (unsigned bucket_offset = 0;
245 bucket_offset + btree_sectors(c) <= ca->mi.bucket_size;
246 bucket_offset += btree_sectors(c)) {
/* Rate-limited progress report, as a percentage of the device's sectors. */
247 if (time_after(jiffies, last_print + HZ * 30)) {
248 u64 cur_sector = bucket * ca->mi.bucket_size + bucket_offset;
249 u64 end_sector = ca->mi.nbuckets * ca->mi.bucket_size;
251 bch_info(ca, "%s: %2u%% done", __func__,
252 (unsigned) div64_u64(cur_sector * 100, end_sector));
253 last_print = jiffies;
256 u64 sector = bucket * ca->mi.bucket_size + bucket_offset;
/* With a complete btree bitmap, sectors not marked as holding btree data are tested here (the action taken is not visible in this listing). */
258 if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_mi_btree_bitmap &&
259 !bch2_dev_btree_bitmap_marked_sectors(ca, sector, btree_sectors(c)))
262 try_read_btree_node(w->f, ca, bio, buf, sector);
266 free_page((unsigned long) buf);
/* Release (not acquire) the io_ref taken for this worker by read_btree_nodes(). */
267 percpu_ref_put(&ca->io_ref);
/*
 * Start one read_btree_nodes_worker() kthread per online member device that
 * is allowed to hold btree data.
 *
 * Returns f->ret if it is nonzero, otherwise the local error code.
 * NOTE(review): the closure calls that wait for the workers and the
 * assignments that fill in *w are not visible in this listing.
 */
273 static int read_btree_nodes(struct find_btree_nodes *f)
275 struct bch_fs *c = container_of(f, struct bch_fs, found_btree_nodes);
279 closure_init_stack(&cl);
281 for_each_online_member(c, ca) {
/* Tests whether this device may hold btree data. */
282 if (!(ca->mi.data_allowed & BIT(BCH_DATA_btree)))
285 struct find_btree_nodes_worker *w = kmalloc(sizeof(*w), GFP_KERNEL);
286 struct task_struct *t;
/* NOTE(review): presumably the allocation-failure path, dropping the reference held by the member iterator before leaving the loop - confirm. */
289 percpu_ref_put(&ca->io_ref);
/* Extra reference on behalf of the worker thread about to be started. */
294 percpu_ref_get(&ca->io_ref);
300 t = kthread_run(read_btree_nodes_worker, w, "read_btree_nodes/%s", ca->name);
301 ret = PTR_ERR_OR_ZERO(t);
/* The thread never started, so the reference taken for it is dropped here. */
303 percpu_ref_put(&ca->io_ref);
306 bch_err(c, "error starting kthread: %i", ret);
/* An error recorded in f->ret takes precedence over the local one. */
312 return f->ret ?: ret;
/*
 * True when @l and @r belong to the same btree and level and the max_key of
 * @l lies beyond the min_key of @r. Only that one-sided test is made; the
 * callers in this file pass @l as the node that sorts first by position.
 */
315 static bool nodes_overlap(const struct found_btree_node *l,
316 const struct found_btree_node *r)
318 return (l->btree_id == r->btree_id &&
319 l->level == r->level &&
320 bpos_gt(l->max_key, r->min_key));
/*
 * Resolve overlaps between @l and the nodes at the top of @nodes_heap.
 *
 * While the heap minimum @r overlaps @l (see nodes_overlap()), the two are
 * compared with found_btree_node_cmp_time() and one of them is trimmed or
 * dropped so that the ranges no longer overlap. Any node whose range is
 * trimmed gets range_updated set.
 *
 * NOTE(review): the line opening the first branch and the one opening the
 * last branch are not visible in this listing; the comments below assume
 * they are "cmp > 0" and a plain else respectively - confirm.
 */
323 static int handle_overwrites(struct bch_fs *c,
324 struct found_btree_node *l,
325 found_btree_nodes *nodes_heap)
327 struct found_btree_node *r;
329 while ((r = min_heap_peek(nodes_heap)) &&
330 nodes_overlap(l, r)) {
331 int cmp = found_btree_node_cmp_time(l, r);
/* @r ends no later than @l: it is removed from the heap. */
334 if (bpos_cmp(l->max_key, r->max_key) >= 0)
335 min_heap_pop(nodes_heap, &found_btree_node_heap_cbs, NULL);
/* Otherwise @r is trimmed to start just after @l and re-sifted, since its sort position changed. */
337 r->range_updated = true;
338 r->min_key = bpos_successor(l->max_key);
339 r->range_updated = true;
340 min_heap_sift_down(nodes_heap, 0, &found_btree_node_heap_cbs, NULL);
/* @l compares older than @r: @l is cut short to end just before @r begins. */
342 } else if (cmp < 0) {
/* Equal start positions would leave @l with an empty range. */
343 BUG_ON(bpos_eq(l->min_key, r->min_key));
345 l->max_key = bpos_predecessor(r->min_key);
346 l->range_updated = true;
/* Same age and @r is an interior node (nonzero level): @r is removed. */
347 } else if (r->level) {
348 min_heap_pop(nodes_heap, &found_btree_node_heap_cbs, NULL);
/* Remaining case: same handling of @r as in the first branch. */
350 if (bpos_cmp(l->max_key, r->max_key) >= 0)
351 min_heap_pop(nodes_heap, &found_btree_node_heap_cbs, NULL);
353 r->range_updated = true;
354 r->min_key = bpos_successor(l->max_key);
355 r->range_updated = true;
356 min_heap_sift_down(nodes_heap, 0, &found_btree_node_heap_cbs, NULL);
/*
 * Scan the member devices for btree nodes and leave the result in
 * c->found_btree_nodes.nodes, in eytzinger order, ready for range lookups.
 *
 * Visible steps: read candidates from the devices, merge entries that share
 * a cookie into one multi-pointer entry, sort by position, then move the
 * entries through a min-heap while handle_overwrites() removes overlaps
 * between nodes.
 * NOTE(review): loop headers, error checks and the return statement are not
 * visible in this listing.
 */
364 int bch2_scan_for_btree_nodes(struct bch_fs *c)
366 struct find_btree_nodes *f = &c->found_btree_nodes;
367 struct printbuf buf = PRINTBUF;
368 found_btree_nodes nodes_heap = {};
375 mutex_init(&f->lock);
/* Starts the per-device worker threads that fill f->nodes. */
377 ret = read_btree_nodes(f);
382 bch_err(c, "%s: no btree nodes found", __func__);
/* Debug dump, disabled by the constant 0 in the condition. */
387 if (0 && c->opts.verbose) {
388 printbuf_reset(&buf);
389 prt_printf(&buf, "%s: nodes found:\n", __func__);
390 found_btree_nodes_to_text(&buf, c, f->nodes);
391 bch2_print_string_as_lines(KERN_INFO, buf.buf);
/* Sorting by (btree, level, cookie) puts entries with the same cookie next to each other. */
394 sort(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_cookie, NULL);
/* In-place compaction: dst counts the entries kept so far, prev is the last kept entry. */
397 darray_for_each(f->nodes, i) {
398 struct found_btree_node *prev = dst ? f->nodes.data + dst - 1 : NULL;
401 prev->cookie == i->cookie) {
/* The pointer array of an entry has a fixed capacity. */
402 if (prev->nr_ptrs == ARRAY_SIZE(prev->ptrs)) {
403 bch_err(c, "%s: found too many replicas for btree node", __func__);
/* Same node found again: only its location is added to the kept entry. */
407 prev->ptrs[prev->nr_ptrs++] = i->ptrs[0];
409 f->nodes.data[dst++] = *i;
414 sort(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_pos, NULL);
/* Debug dump, disabled by the constant 0 in the condition. */
416 if (0 && c->opts.verbose) {
417 printbuf_reset(&buf);
418 prt_printf(&buf, "%s: nodes after merging replicas:\n", __func__);
419 found_btree_nodes_to_text(&buf, c, f->nodes);
420 bch2_print_string_as_lines(KERN_INFO, buf.buf);
/* The merged entries become the heap; f->nodes takes over the empty darray and is refilled below. */
423 swap(nodes_heap, f->nodes);
426 /* darray must have same layout as a heap */
427 min_heap_char real_heap;
428 BUILD_BUG_ON(sizeof(nodes_heap.nr) != sizeof(real_heap.nr));
429 BUILD_BUG_ON(sizeof(nodes_heap.size) != sizeof(real_heap.size));
430 BUILD_BUG_ON(offsetof(found_btree_nodes, nr) != offsetof(min_heap_char, nr));
431 BUILD_BUG_ON(offsetof(found_btree_nodes, size) != offsetof(min_heap_char, size));
434 min_heapify_all(&nodes_heap, &found_btree_node_heap_cbs, NULL);
/* The heap minimum is copied to the output list and then removed from the heap. */
437 ret = darray_push(&f->nodes, *min_heap_peek(&nodes_heap));
441 min_heap_pop(&nodes_heap, &found_btree_node_heap_cbs, NULL);
/* The newest output entry is reconciled with whatever still overlaps it in the heap. */
445 ret = handle_overwrites(c, &darray_last(f->nodes), &nodes_heap);
452 ret = darray_push(&f->nodes, *min_heap_peek(&nodes_heap));
456 min_heap_pop(&nodes_heap, &found_btree_node_heap_cbs, NULL);
/* Sanity check: adjacent output entries must not overlap. */
459 for (struct found_btree_node *n = f->nodes.data; n < &darray_last(f->nodes); n++)
460 BUG_ON(nodes_overlap(n, n + 1));
/* Debug dump, disabled by the constant 0 in the condition. */
462 if (0 && c->opts.verbose) {
463 printbuf_reset(&buf);
464 prt_printf(&buf, "%s: nodes found after overwrites:\n", __func__);
465 found_btree_nodes_to_text(&buf, c, f->nodes);
466 bch2_print_string_as_lines(KERN_INFO, buf.buf);
468 bch_info(c, "btree node scan found %zu nodes after overwrites", f->nodes.nr);
/* Eytzinger layout is what for_each_found_btree_node_in_range() searches. */
471 eytzinger0_sort(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_pos, NULL);
473 darray_exit(&nodes_heap);
/*
 * Search comparator used by for_each_found_btree_node_in_range(): orders by
 * btree id ascending and level descending, then compares the max_key of @_l
 * with the min_key of @_r.
 * NOTE(review): which argument is the search key and which is the array
 * element depends on eytzinger0_find_gt(), which is not shown here.
 */
478 static int found_btree_node_range_start_cmp(const void *_l, const void *_r)
480 const struct found_btree_node *l = _l;
481 const struct found_btree_node *r = _r;
483 return cmp_int(l->btree_id, r->btree_id) ?:
484 -cmp_int(l->level, r->level) ?:
485 bpos_cmp(l->max_key, r->min_key);
/*
 * Iterate over the entries of (_f)->nodes, which must be in eytzinger order,
 * that share _search's btree id and level and whose min_key is below
 * _search.max_key. The starting index comes from eytzinger0_find_gt() with
 * found_btree_node_range_start_cmp(); _idx is declared by the macro as a
 * size_t index into (_f)->nodes.data.
 *
 * _search must be an lvalue of type struct found_btree_node. The search key
 * is taken from the _search argument itself, so the variable passed by the
 * caller may have any name.
 */
488 #define for_each_found_btree_node_in_range(_f, _search, _idx) \
489 for (size_t _idx = eytzinger0_find_gt((_f)->nodes.data, (_f)->nodes.nr, \
490 sizeof((_f)->nodes.data[0]), \
491 found_btree_node_range_start_cmp, &(_search)); \
492 _idx < (_f)->nodes.nr && \
493 (_f)->nodes.data[_idx].btree_id == _search.btree_id && \
494 (_f)->nodes.data[_idx].level == _search.level && \
495 bpos_lt((_f)->nodes.data[_idx].min_key, _search.max_key); \
496 _idx = eytzinger0_next(_idx, (_f)->nodes.nr))
/*
 * Check btree node @b against the scan results: the found nodes covering
 * the range of @b in the same btree are examined for one whose seq is
 * greater than the seq of @b.
 * NOTE(review): the return statements and the level member of the search
 * key are not visible in this listing.
 */
498 bool bch2_btree_node_is_stale(struct bch_fs *c, struct btree *b)
500 struct find_btree_nodes *f = &c->found_btree_nodes;
/* Search key describing the range covered by @b. */
502 struct found_btree_node search = {
503 .btree_id = b->c.btree_id,
505 .min_key = b->data->min_key,
506 .max_key = b->key.k.p,
509 for_each_found_btree_node_in_range(f, search, idx)
/* A found node with a larger seq than @b. */
510 if (f->nodes.data[idx].seq > BTREE_NODE_SEQ(b->data))
/*
 * Query the scan results for btree @btree by iterating the found nodes that
 * match a search key.
 * NOTE(review): the fields of the search key and the return statements are
 * not visible in this listing; presumably this reports whether any node was
 * found for @btree - confirm.
 */
515 bool bch2_btree_has_scanned_nodes(struct bch_fs *c, enum btree_id btree)
517 struct found_btree_node search = {
524 for_each_found_btree_node_in_range(&c->found_btree_nodes, search, idx)
/*
 * Insert keys for scanned btree nodes that cover [@node_min, @node_max] of
 * btree @btree at level @level into the journal keys.
 *
 * The scan_for_btree_nodes recovery pass is requested first. Each matching
 * found node is clamped to the requested range, converted to a btree_ptr_v2
 * key, validated, and inserted with bch2_journal_key_insert() at
 * @level + 1.
 * NOTE(review): the search key fields, the error checks and the return
 * statements are not visible in this listing.
 */
529 int bch2_get_scanned_nodes(struct bch_fs *c, enum btree_id btree,
530 unsigned level, struct bpos node_min, struct bpos node_max)
/* Allocation btrees are tested for up front (the action taken is not visible here). */
532 if (btree_id_is_alloc(btree))
535 struct find_btree_nodes *f = &c->found_btree_nodes;
537 int ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes);
/* Verbose log line naming the btree, level and requested range. */
541 if (c->opts.verbose) {
542 struct printbuf buf = PRINTBUF;
544 prt_str(&buf, "recovery ");
545 bch2_btree_id_level_to_text(&buf, btree, level);
547 bch2_bpos_to_text(&buf, node_min);
548 prt_str(&buf, " - ");
549 bch2_bpos_to_text(&buf, node_max);
551 bch_info(c, "%s(): %s", __func__, buf.buf);
555 struct found_btree_node search = {
562 for_each_found_btree_node_in_range(f, search, idx) {
/* Work on a copy so the stored scan result is left unmodified. */
563 struct found_btree_node n = f->nodes.data[idx];
/* Clamp to the requested range; range_updated records that clamping happened. */
565 n.range_updated |= bpos_lt(n.min_key, node_min);
566 n.min_key = bpos_max(n.min_key, node_min);
568 n.range_updated |= bpos_gt(n.max_key, node_max);
569 n.max_key = bpos_min(n.max_key, node_max);
571 struct { __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX); } tmp;
573 found_btree_node_to_key(&tmp.k, &n);
575 struct printbuf buf = PRINTBUF;
576 bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&tmp.k));
577 bch_verbose(c, "%s(): recovering %s", __func__, buf.buf);
/* A key built here that fails validation is treated as a fatal bug. */
580 BUG_ON(bch2_bkey_validate(c, bkey_i_to_s_c(&tmp.k),
581 (struct bkey_validate_context) {
582 .from = BKEY_VALIDATE_btree_node,
/* The pointer to a node at @level is inserted one level above it. */
587 ret = bch2_journal_key_insert(c, btree, level + 1, &tmp.k);
/*
 * Tear down the scan state by releasing the f->nodes darray.
 */
595 void bch2_find_btree_nodes_exit(struct find_btree_nodes *f)
597 darray_exit(&f->nodes);