// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "bkey_buf.h"
#include "bset.h"
#include "btree_cache.h"
#include "btree_journal_iter.h"
#include "journal_io.h"

#include <linux/sort.h>

/*
 * For managing keys we read from the journal: until journal replay has
 * finished, normal btree lookups need to be able to find and return keys from
 * the journal where they overwrite what's in the btree, so we have a special
 * iterator and operations for the regular btree iter code to use:
 */

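/*
 * journal_keys is a gap buffer: the entries at array positions [0, gap) and
 * [gap + (size - nr), size) are live, with an unused gap in the middle so
 * that inserting at the gap position is O(1). Logical indices run 0..nr;
 * idx_to_pos() maps a logical index to its array position by skipping over
 * the gap:
 */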
static inline size_t idx_to_pos(struct journal_keys *keys, size_t idx)
{
        size_t gap_size = keys->size - keys->nr;

        if (idx >= keys->gap)
                idx += gap_size;
        return idx;
}

static inline struct journal_key *idx_to_key(struct journal_keys *keys, size_t idx)
{
        return keys->data + idx_to_pos(keys, idx);
}

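/*
 * Lower-bound binary search over the logical index space: returns the
 * smallest logical index whose key sorts >= (id, level, pos).
 */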
static size_t __bch2_journal_key_search(struct journal_keys *keys,
                                        enum btree_id id, unsigned level,
                                        struct bpos pos)
{
        size_t l = 0, r = keys->nr, m;

        while (l < r) {
                m = l + ((r - l) >> 1);
                if (__journal_key_cmp(id, level, pos, idx_to_key(keys, m)) > 0)
                        l = m + 1;
                else
                        r = m;
        }

        BUG_ON(l < keys->nr &&
               __journal_key_cmp(id, level, pos, idx_to_key(keys, l)) > 0);

        BUG_ON(l &&
               __journal_key_cmp(id, level, pos, idx_to_key(keys, l - 1)) <= 0);

        return l;
}

static size_t bch2_journal_key_search(struct journal_keys *keys,
                                      enum btree_id id, unsigned level,
                                      struct bpos pos)
{
        return idx_to_pos(keys, __bch2_journal_key_search(keys, id, level, pos));
}

/* Returns first non-overwritten key >= search key: */
struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *c, enum btree_id btree_id,
                                           unsigned level, struct bpos pos,
                                           struct bpos end_pos, size_t *idx)
{
        struct journal_keys *keys = &c->journal_keys;
        unsigned iters = 0;
        struct journal_key *k;

        BUG_ON(*idx > keys->nr);
search:
        if (!*idx)
                *idx = __bch2_journal_key_search(keys, btree_id, level, pos);

        while (*idx &&
               __journal_key_cmp(btree_id, level, end_pos, idx_to_key(keys, *idx - 1)) <= 0) {
                --(*idx);
                iters++;
                if (iters == 10) {
                        *idx = 0;
                        goto search;
                }
        }

        while ((k = *idx < keys->nr ? idx_to_key(keys, *idx) : NULL)) {
                if (__journal_key_cmp(btree_id, level, end_pos, k) < 0)
                        return NULL;

                if (k->overwritten) {
                        (*idx)++;
                        continue;
                }

                if (__journal_key_cmp(btree_id, level, pos, k) <= 0)
                        return k->k;

                (*idx)++;
                iters++;
                if (iters == 10) {
                        *idx = 0;
                        goto search;
                }
        }

        return NULL;
}

struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *c, enum btree_id btree_id,
                                           unsigned level, struct bpos pos)
{
        size_t idx = 0;

        return bch2_journal_keys_peek_upto(c, btree_id, level, pos, pos, &idx);
}

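/*
 * Assert that an iterator is at a valid array position: never inside the gap,
 * and not pointing at a key that sorts before the iterator's btree/level:
 */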
static void journal_iter_verify(struct journal_iter *iter)
{
        struct journal_keys *keys = iter->keys;
        size_t gap_size = keys->size - keys->nr;

        BUG_ON(iter->idx >= keys->gap &&
               iter->idx <  keys->gap + gap_size);

        if (iter->idx < keys->size) {
                struct journal_key *k = keys->data + iter->idx;

                int cmp = cmp_int(k->btree_id,  iter->btree_id) ?:
                          cmp_int(k->level,     iter->level);
                BUG_ON(cmp < 0);
        }
}

static void journal_iters_fix(struct bch_fs *c)
{
        struct journal_keys *keys = &c->journal_keys;
        /* The key we just inserted is immediately before the gap: */
        size_t gap_end = keys->gap + (keys->size - keys->nr);
        struct journal_key *new_key = &keys->data[keys->gap - 1];
        struct journal_iter *iter;

        /*
         * If an iterator points one after the key we just inserted, decrement
         * the iterator so it points at the key we just inserted - if the
         * decrement was unnecessary, bch2_btree_and_journal_iter_peek() will
         * handle that:
         */
        list_for_each_entry(iter, &c->journal_iters, list) {
                journal_iter_verify(iter);
                if (iter->idx           == gap_end &&
                    new_key->btree_id   == iter->btree_id &&
                    new_key->level      == iter->level)
                        iter->idx = keys->gap - 1;
                journal_iter_verify(iter);
        }
}

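/*
 * The gap is moving from @old_gap to @new_gap: remap live iterators, which
 * hold array positions, so they still point at the same logical keys:
 */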
static void journal_iters_move_gap(struct bch_fs *c, size_t old_gap, size_t new_gap)
{
        struct journal_keys *keys = &c->journal_keys;
        struct journal_iter *iter;
        size_t gap_size = keys->size - keys->nr;

        list_for_each_entry(iter, &c->journal_iters, list) {
                if (iter->idx > old_gap)
                        iter->idx -= gap_size;
                if (iter->idx >= new_gap)
                        iter->idx += gap_size;
        }
}

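/*
 * Insert @k into the journal keys, taking ownership: if a key already exists
 * at the same (btree, level, pos) it's replaced, otherwise the gap is moved
 * to the insert position, growing the array first if it's full:
 */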
int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id,
                                 unsigned level, struct bkey_i *k)
{
        struct journal_key n = {
                .btree_id       = id,
                .level          = level,
                .k              = k,
                .allocated      = true,
                /*
                 * Ensure these keys are done last by journal replay, to unblock
                 * journal reclaim:
                 */
                .journal_seq    = U32_MAX,
        };
        struct journal_keys *keys = &c->journal_keys;
        size_t idx = bch2_journal_key_search(keys, id, level, k->k.p);

        BUG_ON(test_bit(BCH_FS_rw, &c->flags));

        if (idx < keys->size &&
            journal_key_cmp(&n, &keys->data[idx]) == 0) {
                if (keys->data[idx].allocated)
                        kfree(keys->data[idx].k);
                keys->data[idx] = n;
                return 0;
        }

        if (idx > keys->gap)
                idx -= keys->size - keys->nr;

        size_t old_gap = keys->gap;

        if (keys->nr == keys->size) {
                journal_iters_move_gap(c, old_gap, keys->size);
                old_gap = keys->size;

                struct journal_keys new_keys = {
                        .nr                     = keys->nr,
                        .size                   = max_t(size_t, keys->size, 8) * 2,
                };

                new_keys.data = kvmalloc_array(new_keys.size, sizeof(new_keys.data[0]), GFP_KERNEL);
                if (!new_keys.data) {
                        bch_err(c, "%s: error allocating new key array (size %zu)",
                                __func__, new_keys.size);
                        return -BCH_ERR_ENOMEM_journal_key_insert;
                }

                /* Since @keys was full, there was no gap: */
                memcpy(new_keys.data, keys->data, sizeof(keys->data[0]) * keys->nr);
                kvfree(keys->data);
                keys->data      = new_keys.data;
                keys->nr        = new_keys.nr;
                keys->size      = new_keys.size;

                /* And now the gap is at the end: */
                keys->gap       = keys->nr;
        }

        journal_iters_move_gap(c, old_gap, idx);

        move_gap(keys, idx);

        keys->nr++;
        keys->data[keys->gap++] = n;

        journal_iters_fix(c);

        return 0;
}

/*
 * Can only be used from the recovery thread while we're still RO - can't be
 * used once we've got RW, as journal_keys is at that point used by multiple
 * threads:
 */
int bch2_journal_key_insert(struct bch_fs *c, enum btree_id id,
                            unsigned level, struct bkey_i *k)
{
        struct bkey_i *n;
        int ret;

        n = kmalloc(bkey_bytes(&k->k), GFP_KERNEL);
        if (!n)
                return -BCH_ERR_ENOMEM_journal_key_insert;

        bkey_copy(n, k);
        ret = bch2_journal_key_insert_take(c, id, level, n);
        if (ret)
                kfree(n);
        return ret;
}

int bch2_journal_key_delete(struct bch_fs *c, enum btree_id id,
                            unsigned level, struct bpos pos)
{
        struct bkey_i whiteout;

        bkey_init(&whiteout.k);
        whiteout.k.p = pos;

        return bch2_journal_key_insert(c, id, level, &whiteout);
}

bool bch2_key_deleted_in_journal(struct btree_trans *trans, enum btree_id btree,
                                 unsigned level, struct bpos pos)
{
        struct journal_keys *keys = &trans->c->journal_keys;
        size_t idx = bch2_journal_key_search(keys, btree, level, pos);

        if (!trans->journal_replay_not_finished)
                return false;

        return (idx < keys->size &&
                keys->data[idx].btree_id        == btree &&
                keys->data[idx].level           == level &&
                bpos_eq(keys->data[idx].k->k.p, pos) &&
                bkey_deleted(&keys->data[idx].k->k));
}

void bch2_journal_key_overwritten(struct bch_fs *c, enum btree_id btree,
                                  unsigned level, struct bpos pos)
{
        struct journal_keys *keys = &c->journal_keys;
        size_t idx = bch2_journal_key_search(keys, btree, level, pos);

        if (idx < keys->size &&
            keys->data[idx].btree_id    == btree &&
            keys->data[idx].level       == level &&
            bpos_eq(keys->data[idx].k->k.p, pos))
                keys->data[idx].overwritten = true;
}

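/* Step to the next array position, hopping over the gap if we hit it: */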
static void bch2_journal_iter_advance(struct journal_iter *iter)
{
        if (iter->idx < iter->keys->size) {
                iter->idx++;
                if (iter->idx == iter->keys->gap)
                        iter->idx += iter->keys->size - iter->keys->nr;
        }
}

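/*
 * Return the next non-overwritten key, or bkey_s_c_null once we've run off
 * the end of the keys for this iterator's btree/level:
 */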
static struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *iter)
{
        journal_iter_verify(iter);

        while (iter->idx < iter->keys->size) {
                struct journal_key *k = iter->keys->data + iter->idx;

                int cmp = cmp_int(k->btree_id,  iter->btree_id) ?:
                          cmp_int(k->level,     iter->level);
                if (cmp > 0)
                        break;
                BUG_ON(cmp);

                if (!k->overwritten)
                        return bkey_i_to_s_c(k->k);

                bch2_journal_iter_advance(iter);
        }

        return bkey_s_c_null;
}

static void bch2_journal_iter_exit(struct journal_iter *iter)
{
        list_del(&iter->list);
}

static void bch2_journal_iter_init(struct bch_fs *c,
                                   struct journal_iter *iter,
                                   enum btree_id id, unsigned level,
                                   struct bpos pos)
{
        iter->btree_id  = id;
        iter->level     = level;
        iter->keys      = &c->journal_keys;
        iter->idx       = bch2_journal_key_search(&c->journal_keys, id, level, pos);

        journal_iter_verify(iter);
}

static struct bkey_s_c bch2_journal_iter_peek_btree(struct btree_and_journal_iter *iter)
{
        return bch2_btree_node_iter_peek_unpack(&iter->node_iter,
                                                iter->b, &iter->unpacked);
}

static void bch2_journal_iter_advance_btree(struct btree_and_journal_iter *iter)
{
        bch2_btree_node_iter_advance(&iter->node_iter, iter->b);
}

void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *iter)
{
        if (bpos_eq(iter->pos, SPOS_MAX))
                iter->at_end = true;
        else
                iter->pos = bpos_successor(iter->pos);
}

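/*
 * Look ahead of @_iter and kick off reads for the next few child nodes; how
 * far ahead depends on the level and on whether the filesystem has finished
 * starting up:
 */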
static void btree_and_journal_iter_prefetch(struct btree_and_journal_iter *_iter)
{
        struct btree_and_journal_iter iter = *_iter;
        struct bch_fs *c = iter.trans->c;
        unsigned level = iter.journal.level;
        struct bkey_buf tmp;
        unsigned nr = test_bit(BCH_FS_started, &c->flags)
                ? (level > 1 ? 0 :  2)
                : (level > 1 ? 1 : 16);

        iter.prefetch = false;
        bch2_bkey_buf_init(&tmp);

        while (nr--) {
                bch2_btree_and_journal_iter_advance(&iter);
                struct bkey_s_c k = bch2_btree_and_journal_iter_peek(&iter);
                if (!k.k)
                        break;

                bch2_bkey_buf_reassemble(&tmp, c, k);
                bch2_btree_node_prefetch(iter.trans, NULL, tmp.k, iter.journal.btree_id, level - 1);
        }

        bch2_bkey_buf_exit(&tmp, c);
}

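/*
 * Return the key at the current position from whichever of the btree node and
 * the journal sorts first, preferring the journal on ties since journal keys
 * overwrite what's in the btree:
 */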
struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *iter)
{
        struct bkey_s_c btree_k, journal_k = bkey_s_c_null, ret;

        if (iter->prefetch && iter->journal.level)
                btree_and_journal_iter_prefetch(iter);
again:
        if (iter->at_end)
                return bkey_s_c_null;

        while ((btree_k = bch2_journal_iter_peek_btree(iter)).k &&
               bpos_lt(btree_k.k->p, iter->pos))
                bch2_journal_iter_advance_btree(iter);

        if (iter->trans->journal_replay_not_finished)
                while ((journal_k = bch2_journal_iter_peek(&iter->journal)).k &&
                       bpos_lt(journal_k.k->p, iter->pos))
                        bch2_journal_iter_advance(&iter->journal);

        ret = journal_k.k &&
                (!btree_k.k || bpos_le(journal_k.k->p, btree_k.k->p))
                ? journal_k
                : btree_k;

        if (ret.k && iter->b && bpos_gt(ret.k->p, iter->b->data->max_key))
                ret = bkey_s_c_null;

        if (ret.k) {
                iter->pos = ret.k->p;
                if (bkey_deleted(ret.k)) {
                        bch2_btree_and_journal_iter_advance(iter);
                        goto again;
                }
        } else {
                iter->pos = SPOS_MAX;
                iter->at_end = true;
        }

        return ret;
}

void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *iter)
{
        bch2_journal_iter_exit(&iter->journal);
}

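/*
 * Typical usage, as a sketch (see the callers in btree_gc.c for the real
 * thing):
 *
 *      struct btree_and_journal_iter iter;
 *      struct bkey_s_c k;
 *
 *      bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b);
 *      while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
 *              ... process k ...
 *              bch2_btree_and_journal_iter_advance(&iter);
 *      }
 *      bch2_btree_and_journal_iter_exit(&iter);
 */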
void __bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *trans,
                                                  struct btree_and_journal_iter *iter,
                                                  struct btree *b,
                                                  struct btree_node_iter node_iter,
                                                  struct bpos pos)
{
        memset(iter, 0, sizeof(*iter));

        iter->trans = trans;
        iter->b = b;
        iter->node_iter = node_iter;
        iter->pos = b->data->min_key;
        iter->at_end = false;
        INIT_LIST_HEAD(&iter->journal.list);

        if (trans->journal_replay_not_finished) {
                bch2_journal_iter_init(trans->c, &iter->journal, b->c.btree_id, b->c.level, pos);
                if (!test_bit(BCH_FS_may_go_rw, &trans->c->flags))
                        list_add(&iter->journal.list, &trans->c->journal_iters);
        }
}

/*
 * this version is used by btree_gc before the filesystem has gone RW and
 * multithreaded, so it uses the journal_iters list:
 */
void bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *trans,
                                                struct btree_and_journal_iter *iter,
                                                struct btree *b)
{
        struct btree_node_iter node_iter;

        bch2_btree_node_iter_init_from_start(&node_iter, b);
        __bch2_btree_and_journal_iter_init_node_iter(trans, iter, b, node_iter, b->data->min_key);
}

/* sort and dedup all keys in the journal: */

void bch2_journal_entries_free(struct bch_fs *c)
{
        struct journal_replay **i;
        struct genradix_iter iter;

        genradix_for_each(&c->journal_entries, iter, i)
                kvfree(*i);
        genradix_free(&c->journal_entries);
}

/*
 * When keys compare equal, oldest compares first:
 */
static int journal_sort_key_cmp(const void *_l, const void *_r)
{
        const struct journal_key *l = _l;
        const struct journal_key *r = _r;

        return  journal_key_cmp(l, r) ?:
                cmp_int(l->journal_seq, r->journal_seq) ?:
                cmp_int(l->journal_offset, r->journal_offset);
}

void bch2_journal_keys_put(struct bch_fs *c)
{
        struct journal_keys *keys = &c->journal_keys;

        BUG_ON(atomic_read(&keys->ref) <= 0);

        if (!atomic_dec_and_test(&keys->ref))
                return;

        move_gap(keys, keys->nr);

        darray_for_each(*keys, i)
                if (i->allocated)
                        kfree(i->k);

        kvfree(keys->data);
        keys->data = NULL;
        keys->nr = keys->gap = keys->size = 0;

        bch2_journal_entries_free(c);
}

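/*
 * Sort, then squash each run of keys that compare equal down to the newest
 * version - since the sort puts oldest first, that's the last key in the run.
 * Accounting keys are the exception; every version is kept:
 */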
static void __journal_keys_sort(struct journal_keys *keys)
{
        sort(keys->data, keys->nr, sizeof(keys->data[0]), journal_sort_key_cmp, NULL);

        cond_resched();

        struct journal_key *dst = keys->data;

        darray_for_each(*keys, src) {
                /*
                 * We don't accumulate accounting keys here because we have to
                 * compare each individual accounting key against the version in
                 * the btree during replay:
                 */
                if (src->k->k.type != KEY_TYPE_accounting &&
                    src + 1 < &darray_top(*keys) &&
                    !journal_key_cmp(src, src + 1))
                        continue;

                *dst++ = *src;
        }

        keys->nr = dst - keys->data;
}

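/*
 * Read every key from every journal entry into journal_keys, sorting and
 * compacting the buffer whenever it fills up; fail if compaction can no
 * longer free enough space:
 */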
int bch2_journal_keys_sort(struct bch_fs *c)
{
        struct genradix_iter iter;
        struct journal_replay *i, **_i;
        struct journal_keys *keys = &c->journal_keys;
        size_t nr_read = 0;

        genradix_for_each(&c->journal_entries, iter, _i) {
                i = *_i;

                if (journal_replay_ignore(i))
                        continue;

                cond_resched();

                for_each_jset_key(k, entry, &i->j) {
                        struct journal_key n = (struct journal_key) {
                                .btree_id       = entry->btree_id,
                                .level          = entry->level,
                                .k              = k,
                                .journal_seq    = le64_to_cpu(i->j.seq),
                                .journal_offset = k->_data - i->j._data,
                        };

                        if (darray_push(keys, n)) {
                                __journal_keys_sort(keys);

                                if (keys->nr * 8 > keys->size * 7) {
                                        bch_err(c, "Too many journal keys for slowpath; have %zu compacted, buf size %zu, processed %zu keys at seq %llu",
                                                keys->nr, keys->size, nr_read, le64_to_cpu(i->j.seq));
                                        return -BCH_ERR_ENOMEM_journal_keys_sort;
                                }

                                BUG_ON(darray_push(keys, n));
                        }

                        nr_read++;
                }
        }

        __journal_keys_sort(keys);
        keys->gap = keys->nr;

        bch_verbose(c, "Journal keys: %zu read, %zu after sorting and compacting", nr_read, keys->nr);
        return 0;
}

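/*
 * Drop all keys for @btree in levels [level_min, level_max] with positions in
 * [start, end], compacting the array in place:
 */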
void bch2_shoot_down_journal_keys(struct bch_fs *c, enum btree_id btree,
                                  unsigned level_min, unsigned level_max,
                                  struct bpos start, struct bpos end)
{
        struct journal_keys *keys = &c->journal_keys;
        size_t dst = 0;

        move_gap(keys, keys->nr);

        darray_for_each(*keys, i)
                if (!(i->btree_id == btree &&
                      i->level >= level_min &&
                      i->level <= level_max &&
                      bpos_ge(i->k->k.p, start) &&
                      bpos_le(i->k->k.p, end)))
                        keys->data[dst++] = *i;
        keys->nr = keys->gap = dst;
}

void bch2_journal_keys_dump(struct bch_fs *c)
{
        struct journal_keys *keys = &c->journal_keys;
        struct printbuf buf = PRINTBUF;

        pr_info("%zu keys:", keys->nr);

        move_gap(keys, keys->nr);

        darray_for_each(*keys, i) {
                printbuf_reset(&buf);
                bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(i->k));
                pr_err("%s l=%u %s", bch2_btree_id_str(i->btree_id), i->level, buf.buf);
        }
        printbuf_exit(&buf);
}