]> Git Repo - linux.git/blob - fs/bcachefs/btree_node_scan.c
Linux 6.14-rc3
[linux.git] / fs / bcachefs / btree_node_scan.c
1 // SPDX-License-Identifier: GPL-2.0
2
3 #include "bcachefs.h"
4 #include "btree_cache.h"
5 #include "btree_io.h"
6 #include "btree_journal_iter.h"
7 #include "btree_node_scan.h"
8 #include "btree_update_interior.h"
9 #include "buckets.h"
10 #include "error.h"
11 #include "journal_io.h"
12 #include "recovery_passes.h"
13
14 #include <linux/kthread.h>
15 #include <linux/min_heap.h>
16 #include <linux/sort.h>
17
18 struct find_btree_nodes_worker {
19         struct closure          *cl;
20         struct find_btree_nodes *f;
21         struct bch_dev          *ca;
22 };
23
24 static void found_btree_node_to_text(struct printbuf *out, struct bch_fs *c, const struct found_btree_node *n)
25 {
26         bch2_btree_id_level_to_text(out, n->btree_id, n->level);
27         prt_printf(out, " seq=%u journal_seq=%llu cookie=%llx ",
28                    n->seq, n->journal_seq, n->cookie);
29         bch2_bpos_to_text(out, n->min_key);
30         prt_str(out, "-");
31         bch2_bpos_to_text(out, n->max_key);
32
33         if (n->range_updated)
34                 prt_str(out, " range updated");
35
36         for (unsigned i = 0; i < n->nr_ptrs; i++) {
37                 prt_char(out, ' ');
38                 bch2_extent_ptr_to_text(out, c, n->ptrs + i);
39         }
40 }
41
42 static void found_btree_nodes_to_text(struct printbuf *out, struct bch_fs *c, found_btree_nodes nodes)
43 {
44         printbuf_indent_add(out, 2);
45         darray_for_each(nodes, i) {
46                 found_btree_node_to_text(out, c, i);
47                 prt_newline(out);
48         }
49         printbuf_indent_sub(out, 2);
50 }
51
52 static void found_btree_node_to_key(struct bkey_i *k, const struct found_btree_node *f)
53 {
54         struct bkey_i_btree_ptr_v2 *bp = bkey_btree_ptr_v2_init(k);
55
56         set_bkey_val_u64s(&bp->k, sizeof(struct bch_btree_ptr_v2) / sizeof(u64) + f->nr_ptrs);
57         bp->k.p                 = f->max_key;
58         bp->v.seq               = cpu_to_le64(f->cookie);
59         bp->v.sectors_written   = 0;
60         bp->v.flags             = 0;
61         bp->v.sectors_written   = cpu_to_le16(f->sectors_written);
62         bp->v.min_key           = f->min_key;
63         SET_BTREE_PTR_RANGE_UPDATED(&bp->v, f->range_updated);
64         memcpy(bp->v.start, f->ptrs, sizeof(struct bch_extent_ptr) * f->nr_ptrs);
65 }
66
67 static inline u64 bkey_journal_seq(struct bkey_s_c k)
68 {
69         switch (k.k->type) {
70         case KEY_TYPE_inode_v3:
71                 return le64_to_cpu(bkey_s_c_to_inode_v3(k).v->bi_journal_seq);
72         default:
73                 return 0;
74         }
75 }
76
77 static bool found_btree_node_is_readable(struct btree_trans *trans,
78                                          struct found_btree_node *f)
79 {
80         struct { __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX); } tmp;
81
82         found_btree_node_to_key(&tmp.k, f);
83
84         struct btree *b = bch2_btree_node_get_noiter(trans, &tmp.k, f->btree_id, f->level, false);
85         bool ret = !IS_ERR_OR_NULL(b);
86         if (!ret)
87                 return ret;
88
89         f->sectors_written = b->written;
90         f->journal_seq = le64_to_cpu(b->data->keys.journal_seq);
91
92         struct bkey_s_c k;
93         struct bkey unpacked;
94         struct btree_node_iter iter;
95         for_each_btree_node_key_unpack(b, k, &iter, &unpacked)
96                 f->journal_seq = max(f->journal_seq, bkey_journal_seq(k));
97
98         six_unlock_read(&b->c.lock);
99
100         /*
101          * We might update this node's range; if that happens, we need the node
102          * to be re-read so the read path can trim keys that are no longer in
103          * this node
104          */
105         if (b != btree_node_root(trans->c, b))
106                 bch2_btree_node_evict(trans, &tmp.k);
107         return ret;
108 }
109
110 static int found_btree_node_cmp_cookie(const void *_l, const void *_r)
111 {
112         const struct found_btree_node *l = _l;
113         const struct found_btree_node *r = _r;
114
115         return  cmp_int(l->btree_id,    r->btree_id) ?:
116                 cmp_int(l->level,       r->level) ?:
117                 cmp_int(l->cookie,      r->cookie);
118 }
119
120 /*
121  * Given two found btree nodes, if their sequence numbers are equal, take the
122  * one that's readable:
123  */
124 static int found_btree_node_cmp_time(const struct found_btree_node *l,
125                                      const struct found_btree_node *r)
126 {
127         return  cmp_int(l->seq, r->seq) ?:
128                 cmp_int(l->journal_seq, r->journal_seq);
129 }
130
131 static int found_btree_node_cmp_pos(const void *_l, const void *_r)
132 {
133         const struct found_btree_node *l = _l;
134         const struct found_btree_node *r = _r;
135
136         return  cmp_int(l->btree_id,    r->btree_id) ?:
137                -cmp_int(l->level,       r->level) ?:
138                 bpos_cmp(l->min_key,    r->min_key) ?:
139                -found_btree_node_cmp_time(l, r);
140 }
141
142 static inline bool found_btree_node_cmp_pos_less(const void *l, const void *r, void *arg)
143 {
144         return found_btree_node_cmp_pos(l, r) < 0;
145 }
146
147 static inline void found_btree_node_swap(void *_l, void *_r, void *arg)
148 {
149         struct found_btree_node *l = _l;
150         struct found_btree_node *r = _r;
151
152         swap(*l, *r);
153 }
154
155 static const struct min_heap_callbacks found_btree_node_heap_cbs = {
156         .less = found_btree_node_cmp_pos_less,
157         .swp = found_btree_node_swap,
158 };
159
160 static void try_read_btree_node(struct find_btree_nodes *f, struct bch_dev *ca,
161                                 struct bio *bio, struct btree_node *bn, u64 offset)
162 {
163         struct bch_fs *c = container_of(f, struct bch_fs, found_btree_nodes);
164
165         bio_reset(bio, ca->disk_sb.bdev, REQ_OP_READ);
166         bio->bi_iter.bi_sector  = offset;
167         bch2_bio_map(bio, bn, PAGE_SIZE);
168
169         submit_bio_wait(bio);
170         if (bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_read,
171                                "IO error in try_read_btree_node() at %llu: %s",
172                                offset, bch2_blk_status_to_str(bio->bi_status)))
173                 return;
174
175         if (le64_to_cpu(bn->magic) != bset_magic(c))
176                 return;
177
178         if (bch2_csum_type_is_encryption(BSET_CSUM_TYPE(&bn->keys))) {
179                 if (!c->chacha20)
180                         return;
181
182                 struct nonce nonce = btree_nonce(&bn->keys, 0);
183                 unsigned bytes = (void *) &bn->keys - (void *) &bn->flags;
184
185                 bch2_encrypt(c, BSET_CSUM_TYPE(&bn->keys), nonce, &bn->flags, bytes);
186         }
187
188         if (btree_id_is_alloc(BTREE_NODE_ID(bn)))
189                 return;
190
191         if (BTREE_NODE_LEVEL(bn) >= BTREE_MAX_DEPTH)
192                 return;
193
194         if (BTREE_NODE_ID(bn) >= BTREE_ID_NR_MAX)
195                 return;
196
197         rcu_read_lock();
198         struct found_btree_node n = {
199                 .btree_id       = BTREE_NODE_ID(bn),
200                 .level          = BTREE_NODE_LEVEL(bn),
201                 .seq            = BTREE_NODE_SEQ(bn),
202                 .cookie         = le64_to_cpu(bn->keys.seq),
203                 .min_key        = bn->min_key,
204                 .max_key        = bn->max_key,
205                 .nr_ptrs        = 1,
206                 .ptrs[0].type   = 1 << BCH_EXTENT_ENTRY_ptr,
207                 .ptrs[0].offset = offset,
208                 .ptrs[0].dev    = ca->dev_idx,
209                 .ptrs[0].gen    = bucket_gen_get(ca, sector_to_bucket(ca, offset)),
210         };
211         rcu_read_unlock();
212
213         if (bch2_trans_run(c, found_btree_node_is_readable(trans, &n))) {
214                 mutex_lock(&f->lock);
215                 if (BSET_BIG_ENDIAN(&bn->keys) != CPU_BIG_ENDIAN) {
216                         bch_err(c, "try_read_btree_node() can't handle endian conversion");
217                         f->ret = -EINVAL;
218                         goto unlock;
219                 }
220
221                 if (darray_push(&f->nodes, n))
222                         f->ret = -ENOMEM;
223 unlock:
224                 mutex_unlock(&f->lock);
225         }
226 }
227
228 static int read_btree_nodes_worker(void *p)
229 {
230         struct find_btree_nodes_worker *w = p;
231         struct bch_fs *c = container_of(w->f, struct bch_fs, found_btree_nodes);
232         struct bch_dev *ca = w->ca;
233         void *buf = (void *) __get_free_page(GFP_KERNEL);
234         struct bio *bio = bio_alloc(NULL, 1, 0, GFP_KERNEL);
235         unsigned long last_print = jiffies;
236
237         if (!buf || !bio) {
238                 bch_err(c, "read_btree_nodes_worker: error allocating bio/buf");
239                 w->f->ret = -ENOMEM;
240                 goto err;
241         }
242
243         for (u64 bucket = ca->mi.first_bucket; bucket < ca->mi.nbuckets; bucket++)
244                 for (unsigned bucket_offset = 0;
245                      bucket_offset + btree_sectors(c) <= ca->mi.bucket_size;
246                      bucket_offset += btree_sectors(c)) {
247                         if (time_after(jiffies, last_print + HZ * 30)) {
248                                 u64 cur_sector = bucket * ca->mi.bucket_size + bucket_offset;
249                                 u64 end_sector = ca->mi.nbuckets * ca->mi.bucket_size;
250
251                                 bch_info(ca, "%s: %2u%% done", __func__,
252                                          (unsigned) div64_u64(cur_sector * 100, end_sector));
253                                 last_print = jiffies;
254                         }
255
256                         u64 sector = bucket * ca->mi.bucket_size + bucket_offset;
257
258                         if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_mi_btree_bitmap &&
259                             !bch2_dev_btree_bitmap_marked_sectors(ca, sector, btree_sectors(c)))
260                                 continue;
261
262                         try_read_btree_node(w->f, ca, bio, buf, sector);
263                 }
264 err:
265         bio_put(bio);
266         free_page((unsigned long) buf);
267         percpu_ref_get(&ca->io_ref);
268         closure_put(w->cl);
269         kfree(w);
270         return 0;
271 }
272
273 static int read_btree_nodes(struct find_btree_nodes *f)
274 {
275         struct bch_fs *c = container_of(f, struct bch_fs, found_btree_nodes);
276         struct closure cl;
277         int ret = 0;
278
279         closure_init_stack(&cl);
280
281         for_each_online_member(c, ca) {
282                 if (!(ca->mi.data_allowed & BIT(BCH_DATA_btree)))
283                         continue;
284
285                 struct find_btree_nodes_worker *w = kmalloc(sizeof(*w), GFP_KERNEL);
286                 struct task_struct *t;
287
288                 if (!w) {
289                         percpu_ref_put(&ca->io_ref);
290                         ret = -ENOMEM;
291                         goto err;
292                 }
293
294                 percpu_ref_get(&ca->io_ref);
295                 closure_get(&cl);
296                 w->cl           = &cl;
297                 w->f            = f;
298                 w->ca           = ca;
299
300                 t = kthread_run(read_btree_nodes_worker, w, "read_btree_nodes/%s", ca->name);
301                 ret = PTR_ERR_OR_ZERO(t);
302                 if (ret) {
303                         percpu_ref_put(&ca->io_ref);
304                         closure_put(&cl);
305                         f->ret = ret;
306                         bch_err(c, "error starting kthread: %i", ret);
307                         break;
308                 }
309         }
310 err:
311         closure_sync(&cl);
312         return f->ret ?: ret;
313 }
314
315 static bool nodes_overlap(const struct found_btree_node *l,
316                           const struct found_btree_node *r)
317 {
318         return (l->btree_id     == r->btree_id &&
319                 l->level        == r->level &&
320                 bpos_gt(l->max_key, r->min_key));
321 }
322
323 static int handle_overwrites(struct bch_fs *c,
324                              struct found_btree_node *l,
325                              found_btree_nodes *nodes_heap)
326 {
327         struct found_btree_node *r;
328
329         while ((r = min_heap_peek(nodes_heap)) &&
330                nodes_overlap(l, r)) {
331                 int cmp = found_btree_node_cmp_time(l, r);
332
333                 if (cmp > 0) {
334                         if (bpos_cmp(l->max_key, r->max_key) >= 0)
335                                 min_heap_pop(nodes_heap, &found_btree_node_heap_cbs, NULL);
336                         else {
337                                 r->range_updated = true;
338                                 r->min_key = bpos_successor(l->max_key);
339                                 r->range_updated = true;
340                                 min_heap_sift_down(nodes_heap, 0, &found_btree_node_heap_cbs, NULL);
341                         }
342                 } else if (cmp < 0) {
343                         BUG_ON(bpos_eq(l->min_key, r->min_key));
344
345                         l->max_key = bpos_predecessor(r->min_key);
346                         l->range_updated = true;
347                 } else if (r->level) {
348                         min_heap_pop(nodes_heap, &found_btree_node_heap_cbs, NULL);
349                 } else {
350                         if (bpos_cmp(l->max_key, r->max_key) >= 0)
351                                 min_heap_pop(nodes_heap, &found_btree_node_heap_cbs, NULL);
352                         else {
353                                 r->range_updated = true;
354                                 r->min_key = bpos_successor(l->max_key);
355                                 r->range_updated = true;
356                                 min_heap_sift_down(nodes_heap, 0, &found_btree_node_heap_cbs, NULL);
357                         }
358                 }
359         }
360
361         return 0;
362 }
363
364 int bch2_scan_for_btree_nodes(struct bch_fs *c)
365 {
366         struct find_btree_nodes *f = &c->found_btree_nodes;
367         struct printbuf buf = PRINTBUF;
368         found_btree_nodes nodes_heap = {};
369         size_t dst;
370         int ret = 0;
371
372         if (f->nodes.nr)
373                 return 0;
374
375         mutex_init(&f->lock);
376
377         ret = read_btree_nodes(f);
378         if (ret)
379                 return ret;
380
381         if (!f->nodes.nr) {
382                 bch_err(c, "%s: no btree nodes found", __func__);
383                 ret = -EINVAL;
384                 goto err;
385         }
386
387         if (0 && c->opts.verbose) {
388                 printbuf_reset(&buf);
389                 prt_printf(&buf, "%s: nodes found:\n", __func__);
390                 found_btree_nodes_to_text(&buf, c, f->nodes);
391                 bch2_print_string_as_lines(KERN_INFO, buf.buf);
392         }
393
394         sort(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_cookie, NULL);
395
396         dst = 0;
397         darray_for_each(f->nodes, i) {
398                 struct found_btree_node *prev = dst ? f->nodes.data + dst - 1 : NULL;
399
400                 if (prev &&
401                     prev->cookie == i->cookie) {
402                         if (prev->nr_ptrs == ARRAY_SIZE(prev->ptrs)) {
403                                 bch_err(c, "%s: found too many replicas for btree node", __func__);
404                                 ret = -EINVAL;
405                                 goto err;
406                         }
407                         prev->ptrs[prev->nr_ptrs++] = i->ptrs[0];
408                 } else {
409                         f->nodes.data[dst++] = *i;
410                 }
411         }
412         f->nodes.nr = dst;
413
414         sort(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_pos, NULL);
415
416         if (0 && c->opts.verbose) {
417                 printbuf_reset(&buf);
418                 prt_printf(&buf, "%s: nodes after merging replicas:\n", __func__);
419                 found_btree_nodes_to_text(&buf, c, f->nodes);
420                 bch2_print_string_as_lines(KERN_INFO, buf.buf);
421         }
422
423         swap(nodes_heap, f->nodes);
424
425         {
426                 /* darray must have same layout as a heap */
427                 min_heap_char real_heap;
428                 BUILD_BUG_ON(sizeof(nodes_heap.nr)      != sizeof(real_heap.nr));
429                 BUILD_BUG_ON(sizeof(nodes_heap.size)    != sizeof(real_heap.size));
430                 BUILD_BUG_ON(offsetof(found_btree_nodes, nr)    != offsetof(min_heap_char, nr));
431                 BUILD_BUG_ON(offsetof(found_btree_nodes, size)  != offsetof(min_heap_char, size));
432         }
433
434         min_heapify_all(&nodes_heap, &found_btree_node_heap_cbs, NULL);
435
436         if (nodes_heap.nr) {
437                 ret = darray_push(&f->nodes, *min_heap_peek(&nodes_heap));
438                 if (ret)
439                         goto err;
440
441                 min_heap_pop(&nodes_heap, &found_btree_node_heap_cbs, NULL);
442         }
443
444         while (true) {
445                 ret = handle_overwrites(c, &darray_last(f->nodes), &nodes_heap);
446                 if (ret)
447                         goto err;
448
449                 if (!nodes_heap.nr)
450                         break;
451
452                 ret = darray_push(&f->nodes, *min_heap_peek(&nodes_heap));
453                 if (ret)
454                         goto err;
455
456                 min_heap_pop(&nodes_heap, &found_btree_node_heap_cbs, NULL);
457         }
458
459         for (struct found_btree_node *n = f->nodes.data; n < &darray_last(f->nodes); n++)
460                 BUG_ON(nodes_overlap(n, n + 1));
461
462         if (0 && c->opts.verbose) {
463                 printbuf_reset(&buf);
464                 prt_printf(&buf, "%s: nodes found after overwrites:\n", __func__);
465                 found_btree_nodes_to_text(&buf, c, f->nodes);
466                 bch2_print_string_as_lines(KERN_INFO, buf.buf);
467         } else {
468                 bch_info(c, "btree node scan found %zu nodes after overwrites", f->nodes.nr);
469         }
470
471         eytzinger0_sort(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_pos, NULL);
472 err:
473         darray_exit(&nodes_heap);
474         printbuf_exit(&buf);
475         return ret;
476 }
477
478 static int found_btree_node_range_start_cmp(const void *_l, const void *_r)
479 {
480         const struct found_btree_node *l = _l;
481         const struct found_btree_node *r = _r;
482
483         return  cmp_int(l->btree_id,    r->btree_id) ?:
484                -cmp_int(l->level,       r->level) ?:
485                 bpos_cmp(l->max_key,    r->min_key);
486 }
487
488 #define for_each_found_btree_node_in_range(_f, _search, _idx)                           \
489         for (size_t _idx = eytzinger0_find_gt((_f)->nodes.data, (_f)->nodes.nr,         \
490                                         sizeof((_f)->nodes.data[0]),                    \
491                                         found_btree_node_range_start_cmp, &search);     \
492              _idx < (_f)->nodes.nr &&                                                   \
493              (_f)->nodes.data[_idx].btree_id == _search.btree_id &&                     \
494              (_f)->nodes.data[_idx].level == _search.level &&                           \
495              bpos_lt((_f)->nodes.data[_idx].min_key, _search.max_key);                  \
496              _idx = eytzinger0_next(_idx, (_f)->nodes.nr))
497
498 bool bch2_btree_node_is_stale(struct bch_fs *c, struct btree *b)
499 {
500         struct find_btree_nodes *f = &c->found_btree_nodes;
501
502         struct found_btree_node search = {
503                 .btree_id       = b->c.btree_id,
504                 .level          = b->c.level,
505                 .min_key        = b->data->min_key,
506                 .max_key        = b->key.k.p,
507         };
508
509         for_each_found_btree_node_in_range(f, search, idx)
510                 if (f->nodes.data[idx].seq > BTREE_NODE_SEQ(b->data))
511                         return true;
512         return false;
513 }
514
515 bool bch2_btree_has_scanned_nodes(struct bch_fs *c, enum btree_id btree)
516 {
517         struct found_btree_node search = {
518                 .btree_id       = btree,
519                 .level          = 0,
520                 .min_key        = POS_MIN,
521                 .max_key        = SPOS_MAX,
522         };
523
524         for_each_found_btree_node_in_range(&c->found_btree_nodes, search, idx)
525                 return true;
526         return false;
527 }
528
529 int bch2_get_scanned_nodes(struct bch_fs *c, enum btree_id btree,
530                            unsigned level, struct bpos node_min, struct bpos node_max)
531 {
532         if (btree_id_is_alloc(btree))
533                 return 0;
534
535         struct find_btree_nodes *f = &c->found_btree_nodes;
536
537         int ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes);
538         if (ret)
539                 return ret;
540
541         if (c->opts.verbose) {
542                 struct printbuf buf = PRINTBUF;
543
544                 prt_str(&buf, "recovery ");
545                 bch2_btree_id_level_to_text(&buf, btree, level);
546                 prt_str(&buf, " ");
547                 bch2_bpos_to_text(&buf, node_min);
548                 prt_str(&buf, " - ");
549                 bch2_bpos_to_text(&buf, node_max);
550
551                 bch_info(c, "%s(): %s", __func__, buf.buf);
552                 printbuf_exit(&buf);
553         }
554
555         struct found_btree_node search = {
556                 .btree_id       = btree,
557                 .level          = level,
558                 .min_key        = node_min,
559                 .max_key        = node_max,
560         };
561
562         for_each_found_btree_node_in_range(f, search, idx) {
563                 struct found_btree_node n = f->nodes.data[idx];
564
565                 n.range_updated |= bpos_lt(n.min_key, node_min);
566                 n.min_key = bpos_max(n.min_key, node_min);
567
568                 n.range_updated |= bpos_gt(n.max_key, node_max);
569                 n.max_key = bpos_min(n.max_key, node_max);
570
571                 struct { __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX); } tmp;
572
573                 found_btree_node_to_key(&tmp.k, &n);
574
575                 struct printbuf buf = PRINTBUF;
576                 bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&tmp.k));
577                 bch_verbose(c, "%s(): recovering %s", __func__, buf.buf);
578                 printbuf_exit(&buf);
579
580                 BUG_ON(bch2_bkey_validate(c, bkey_i_to_s_c(&tmp.k),
581                                           (struct bkey_validate_context) {
582                                                 .from   = BKEY_VALIDATE_btree_node,
583                                                 .level  = level + 1,
584                                                 .btree  = btree,
585                                           }));
586
587                 ret = bch2_journal_key_insert(c, btree, level + 1, &tmp.k);
588                 if (ret)
589                         return ret;
590         }
591
592         return 0;
593 }
594
595 void bch2_find_btree_nodes_exit(struct find_btree_nodes *f)
596 {
597         darray_exit(&f->nodes);
598 }
This page took 0.071397 seconds and 4 git commands to generate.