1 // SPDX-License-Identifier: GPL-2.0
2
3 #include "bcachefs.h"
4 #include "alloc_foreground.h"
5 #include "bkey_buf.h"
6 #include "bkey_methods.h"
7 #include "btree_cache.h"
8 #include "btree_gc.h"
9 #include "btree_journal_iter.h"
10 #include "btree_update.h"
11 #include "btree_update_interior.h"
12 #include "btree_io.h"
13 #include "btree_iter.h"
14 #include "btree_locking.h"
15 #include "buckets.h"
16 #include "clock.h"
17 #include "error.h"
18 #include "extents.h"
19 #include "io_write.h"
20 #include "journal.h"
21 #include "journal_reclaim.h"
22 #include "keylist.h"
23 #include "recovery_passes.h"
24 #include "replicas.h"
25 #include "sb-members.h"
26 #include "super-io.h"
27 #include "trace.h"
28
29 #include <linux/random.h>
30
31 static const char * const bch2_btree_update_modes[] = {
32 #define x(t) #t,
33         BTREE_UPDATE_MODES()
34 #undef x
35         NULL
36 };
37
38 static int bch2_btree_insert_node(struct btree_update *, struct btree_trans *,
39                                   btree_path_idx_t, struct btree *, struct keylist *);
40 static void bch2_btree_update_add_new_node(struct btree_update *, struct btree *);
41
42 /*
43  * Verify that child nodes correctly span parent node's range:
44  */
45 int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b)
46 {
47         struct bch_fs *c = trans->c;
48         struct bpos node_min = b->key.k.type == KEY_TYPE_btree_ptr_v2
49                 ? bkey_i_to_btree_ptr_v2(&b->key)->v.min_key
50                 : b->data->min_key;
51         struct btree_and_journal_iter iter;
52         struct bkey_s_c k;
53         struct printbuf buf = PRINTBUF;
54         struct bkey_buf prev;
55         int ret = 0;
56
57         BUG_ON(b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
58                !bpos_eq(bkey_i_to_btree_ptr_v2(&b->key)->v.min_key,
59                         b->data->min_key));
60
61         if (b == btree_node_root(c, b)) {
62                 if (!bpos_eq(b->data->min_key, POS_MIN)) {
63                         printbuf_reset(&buf);
64                         bch2_bpos_to_text(&buf, b->data->min_key);
65                         need_fsck_err(trans, btree_root_bad_min_key,
66                                       "btree root with incorrect min_key: %s", buf.buf);
67                         goto topology_repair;
68                 }
69
70                 if (!bpos_eq(b->data->max_key, SPOS_MAX)) {
71                         printbuf_reset(&buf);
72                         bch2_bpos_to_text(&buf, b->data->max_key);
73                         need_fsck_err(trans, btree_root_bad_max_key,
74                                       "btree root with incorrect max_key: %s", buf.buf);
75                         goto topology_repair;
76                 }
77         }
78
79         if (!b->c.level)
80                 return 0;
81
82         bch2_bkey_buf_init(&prev);
83         bkey_init(&prev.k->k);
84         bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b);
85
86         while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
87                 if (k.k->type != KEY_TYPE_btree_ptr_v2)
88                         goto out;
89
90                 struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k);
91
92                 struct bpos expected_min = bkey_deleted(&prev.k->k)
93                         ? node_min
94                         : bpos_successor(prev.k->k.p);
95
96                 if (!bpos_eq(expected_min, bp.v->min_key)) {
97                         bch2_topology_error(c);
98
99                         printbuf_reset(&buf);
100                         prt_str(&buf, "end of prev node doesn't match start of next node\n");
101                         prt_printf(&buf, "  in btree %s level %u node ",
102                                    bch2_btree_id_str(b->c.btree_id), b->c.level);
103                         bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
104                         prt_str(&buf, "\n  prev ");
105                         bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(prev.k));
106                         prt_str(&buf, "\n  next ");
107                         bch2_bkey_val_to_text(&buf, c, k);
108
109                         need_fsck_err(trans, btree_node_topology_bad_min_key, "%s", buf.buf);
110                         goto topology_repair;
111                 }
112
113                 bch2_bkey_buf_reassemble(&prev, c, k);
114                 bch2_btree_and_journal_iter_advance(&iter);
115         }
116
117         if (bkey_deleted(&prev.k->k)) {
118                 bch2_topology_error(c);
119
120                 printbuf_reset(&buf);
121                 prt_str(&buf, "empty interior node\n");
122                 prt_printf(&buf, "  in btree %s level %u node ",
123                            bch2_btree_id_str(b->c.btree_id), b->c.level);
124                 bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
125
126                 need_fsck_err(trans, btree_node_topology_empty_interior_node, "%s", buf.buf);
127                 goto topology_repair;
128         } else if (!bpos_eq(prev.k->k.p, b->key.k.p)) {
129                 bch2_topology_error(c);
130
131                 printbuf_reset(&buf);
132                 prt_str(&buf, "last child node doesn't end at end of parent node\n");
133                 prt_printf(&buf, "  in btree %s level %u node ",
134                            bch2_btree_id_str(b->c.btree_id), b->c.level);
135                 bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
136                 prt_str(&buf, "\n  last key ");
137                 bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(prev.k));
138
139                 need_fsck_err(trans, btree_node_topology_bad_max_key, "%s", buf.buf);
140                 goto topology_repair;
141         }
142 out:
143 fsck_err:
144         bch2_btree_and_journal_iter_exit(&iter);
145         bch2_bkey_buf_exit(&prev, c);
146         printbuf_exit(&buf);
147         return ret;
148 topology_repair:
149         if ((c->opts.recovery_passes & BIT_ULL(BCH_RECOVERY_PASS_check_topology)) &&
150             c->curr_recovery_pass > BCH_RECOVERY_PASS_check_topology) {
151                 bch2_inconsistent_error(c);
152                 ret = -BCH_ERR_btree_need_topology_repair;
153         } else {
154                 ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology);
155         }
156         goto out;
157 }
158
159 /* Calculate ideal packed bkey format for new btree nodes: */
160
161 static void __bch2_btree_calc_format(struct bkey_format_state *s, struct btree *b)
162 {
163         struct bkey_packed *k;
164         struct bkey uk;
165
166         for_each_bset(b, t)
167                 bset_tree_for_each_key(b, t, k)
168                         if (!bkey_deleted(k)) {
169                                 uk = bkey_unpack_key(b, k);
170                                 bch2_bkey_format_add_key(s, &uk);
171                         }
172 }
173
174 static struct bkey_format bch2_btree_calc_format(struct btree *b)
175 {
176         struct bkey_format_state s;
177
178         bch2_bkey_format_init(&s);
179         bch2_bkey_format_add_pos(&s, b->data->min_key);
180         bch2_bkey_format_add_pos(&s, b->data->max_key);
181         __bch2_btree_calc_format(&s, b);
182
183         return bch2_bkey_format_done(&s);
184 }
185
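/*
 * Number of u64s the live keys in @nr would take up if repacked with @new_f
 * (assumes every key can be packed with the new format):
 */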
186 static size_t btree_node_u64s_with_format(struct btree_nr_keys nr,
187                                           struct bkey_format *old_f,
188                                           struct bkey_format *new_f)
189 {
190         /* stupid integer promotion rules */
191         ssize_t delta =
192             (((int) new_f->key_u64s - old_f->key_u64s) *
193              (int) nr.packed_keys) +
194             (((int) new_f->key_u64s - BKEY_U64s) *
195              (int) nr.unpacked_keys);
196
197         BUG_ON(delta + nr.live_u64s < 0);
198
199         return nr.live_u64s + delta;
200 }
201
202 /**
203  * bch2_btree_node_format_fits - check if we could rewrite node with a new format
204  *
205  * @c:          filesystem handle
206  * @b:          btree node to rewrite
207  * @nr:         number of keys for new node (i.e. b->nr)
208  * @new_f:      bkey format to translate keys to
209  *
210  * Returns: true if all re-packed keys will be able to fit in a new node.
211  *
212  * Assumes all keys will successfully pack with the new format.
213  */
214 static bool bch2_btree_node_format_fits(struct bch_fs *c, struct btree *b,
215                                  struct btree_nr_keys nr,
216                                  struct bkey_format *new_f)
217 {
218         size_t u64s = btree_node_u64s_with_format(nr, &b->format, new_f);
219
220         return __vstruct_bytes(struct btree_node, u64s) < btree_buf_bytes(b);
221 }
222
223 /* Btree node freeing/allocation: */
224
225 static void __btree_node_free(struct btree_trans *trans, struct btree *b)
226 {
227         struct bch_fs *c = trans->c;
228
229         trace_and_count(c, btree_node_free, trans, b);
230
231         BUG_ON(btree_node_write_blocked(b));
232         BUG_ON(btree_node_dirty(b));
233         BUG_ON(btree_node_need_write(b));
234         BUG_ON(b == btree_node_root(c, b));
235         BUG_ON(b->ob.nr);
236         BUG_ON(!list_empty(&b->write_blocked));
237         BUG_ON(b->will_make_reachable);
238
239         clear_btree_node_noevict(b);
240 }
241
242 static void bch2_btree_node_free_inmem(struct btree_trans *trans,
243                                        struct btree_path *path,
244                                        struct btree *b)
245 {
246         struct bch_fs *c = trans->c;
247         unsigned i, level = b->c.level;
248
249         bch2_btree_node_lock_write_nofail(trans, path, &b->c);
250
251         __btree_node_free(trans, b);
252
253         mutex_lock(&c->btree_cache.lock);
254         bch2_btree_node_hash_remove(&c->btree_cache, b);
255         mutex_unlock(&c->btree_cache.lock);
256
257         six_unlock_write(&b->c.lock);
258         mark_btree_node_locked_noreset(path, level, BTREE_NODE_INTENT_LOCKED);
259
260         trans_for_each_path(trans, path, i)
261                 if (path->l[level].b == b) {
262                         btree_node_unlock(trans, path, level);
263                         path->l[level].b = ERR_PTR(-BCH_ERR_no_btree_node_init);
264                 }
265 }
266
267 static void bch2_btree_node_free_never_used(struct btree_update *as,
268                                             struct btree_trans *trans,
269                                             struct btree *b)
270 {
271         struct bch_fs *c = as->c;
272         struct prealloc_nodes *p = &as->prealloc_nodes[b->c.lock.readers != NULL];
273         struct btree_path *path;
274         unsigned i, level = b->c.level;
275
276         BUG_ON(!list_empty(&b->write_blocked));
277         BUG_ON(b->will_make_reachable != (1UL|(unsigned long) as));
278
279         b->will_make_reachable = 0;
280         closure_put(&as->cl);
281
282         clear_btree_node_will_make_reachable(b);
283         clear_btree_node_accessed(b);
284         clear_btree_node_dirty_acct(c, b);
285         clear_btree_node_need_write(b);
286
287         mutex_lock(&c->btree_cache.lock);
288         __bch2_btree_node_hash_remove(&c->btree_cache, b);
289         mutex_unlock(&c->btree_cache.lock);
290
291         BUG_ON(p->nr >= ARRAY_SIZE(p->b));
292         p->b[p->nr++] = b;
293
294         six_unlock_intent(&b->c.lock);
295
296         trans_for_each_path(trans, path, i)
297                 if (path->l[level].b == b) {
298                         btree_node_unlock(trans, path, level);
299                         path->l[level].b = ERR_PTR(-BCH_ERR_no_btree_node_init);
300                 }
301 }
302
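/*
 * Allocate an in-memory node plus space on disk for a new btree node: take an
 * entry from the btree reserve cache if it has spares, otherwise allocate
 * sectors from the btree write point. On error the in-memory node goes back to
 * the freelist.
 */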
303 static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans,
304                                              struct disk_reservation *res,
305                                              struct closure *cl,
306                                              bool interior_node,
307                                              unsigned flags)
308 {
309         struct bch_fs *c = trans->c;
310         struct write_point *wp;
311         struct btree *b;
312         BKEY_PADDED_ONSTACK(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp;
313         struct open_buckets obs = { .nr = 0 };
314         struct bch_devs_list devs_have = (struct bch_devs_list) { 0 };
315         enum bch_watermark watermark = flags & BCH_WATERMARK_MASK;
316         unsigned nr_reserve = watermark < BCH_WATERMARK_reclaim
317                 ? BTREE_NODE_RESERVE
318                 : 0;
319         int ret;
320
321         b = bch2_btree_node_mem_alloc(trans, interior_node);
322         if (IS_ERR(b))
323                 return b;
324
325         BUG_ON(b->ob.nr);
326
327         mutex_lock(&c->btree_reserve_cache_lock);
328         if (c->btree_reserve_cache_nr > nr_reserve) {
329                 struct btree_alloc *a =
330                         &c->btree_reserve_cache[--c->btree_reserve_cache_nr];
331
332                 obs = a->ob;
333                 bkey_copy(&tmp.k, &a->k);
334                 mutex_unlock(&c->btree_reserve_cache_lock);
335                 goto out;
336         }
337         mutex_unlock(&c->btree_reserve_cache_lock);
338 retry:
339         ret = bch2_alloc_sectors_start_trans(trans,
340                                       c->opts.metadata_target ?:
341                                       c->opts.foreground_target,
342                                       0,
343                                       writepoint_ptr(&c->btree_write_point),
344                                       &devs_have,
345                                       res->nr_replicas,
346                                       min(res->nr_replicas,
347                                           c->opts.metadata_replicas_required),
348                                       watermark, 0, cl, &wp);
349         if (unlikely(ret))
350                 goto err;
351
352         if (wp->sectors_free < btree_sectors(c)) {
353                 struct open_bucket *ob;
354                 unsigned i;
355
356                 open_bucket_for_each(c, &wp->ptrs, ob, i)
357                         if (ob->sectors_free < btree_sectors(c))
358                                 ob->sectors_free = 0;
359
360                 bch2_alloc_sectors_done(c, wp);
361                 goto retry;
362         }
363
364         bkey_btree_ptr_v2_init(&tmp.k);
365         bch2_alloc_sectors_append_ptrs(c, wp, &tmp.k, btree_sectors(c), false);
366
367         bch2_open_bucket_get(c, wp, &obs);
368         bch2_alloc_sectors_done(c, wp);
369 out:
370         bkey_copy(&b->key, &tmp.k);
371         b->ob = obs;
372         six_unlock_write(&b->c.lock);
373         six_unlock_intent(&b->c.lock);
374
375         return b;
376 err:
377         bch2_btree_node_to_freelist(c, b);
378         return ERR_PTR(ret);
379 }
380
381 static struct btree *bch2_btree_node_alloc(struct btree_update *as,
382                                            struct btree_trans *trans,
383                                            unsigned level)
384 {
385         struct bch_fs *c = as->c;
386         struct btree *b;
387         struct prealloc_nodes *p = &as->prealloc_nodes[!!level];
388         int ret;
389
390         BUG_ON(level >= BTREE_MAX_DEPTH);
391         BUG_ON(!p->nr);
392
393         b = p->b[--p->nr];
394
395         btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent);
396         btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write);
397
398         set_btree_node_accessed(b);
399         set_btree_node_dirty_acct(c, b);
400         set_btree_node_need_write(b);
401
402         bch2_bset_init_first(b, &b->data->keys);
403         b->c.level      = level;
404         b->c.btree_id   = as->btree_id;
405         b->version_ondisk = c->sb.version;
406
407         memset(&b->nr, 0, sizeof(b->nr));
408         b->data->magic = cpu_to_le64(bset_magic(c));
409         memset(&b->data->_ptr, 0, sizeof(b->data->_ptr));
410         b->data->flags = 0;
411         SET_BTREE_NODE_ID(b->data, as->btree_id);
412         SET_BTREE_NODE_LEVEL(b->data, level);
413
414         if (b->key.k.type == KEY_TYPE_btree_ptr_v2) {
415                 struct bkey_i_btree_ptr_v2 *bp = bkey_i_to_btree_ptr_v2(&b->key);
416
417                 bp->v.mem_ptr           = 0;
418                 bp->v.seq               = b->data->keys.seq;
419                 bp->v.sectors_written   = 0;
420         }
421
422         SET_BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data, true);
423
424         bch2_btree_build_aux_trees(b);
425
426         ret = bch2_btree_node_hash_insert(&c->btree_cache, b, level, as->btree_id);
427         BUG_ON(ret);
428
429         trace_and_count(c, btree_node_alloc, trans, b);
430         bch2_increment_clock(c, btree_sectors(c), WRITE);
431         return b;
432 }
433
434 static void btree_set_min(struct btree *b, struct bpos pos)
435 {
436         if (b->key.k.type == KEY_TYPE_btree_ptr_v2)
437                 bkey_i_to_btree_ptr_v2(&b->key)->v.min_key = pos;
438         b->data->min_key = pos;
439 }
440
441 static void btree_set_max(struct btree *b, struct bpos pos)
442 {
443         b->key.k.p = pos;
444         b->data->max_key = pos;
445 }
446
447 static struct btree *bch2_btree_node_alloc_replacement(struct btree_update *as,
448                                                        struct btree_trans *trans,
449                                                        struct btree *b)
450 {
451         struct btree *n = bch2_btree_node_alloc(as, trans, b->c.level);
452         struct bkey_format format = bch2_btree_calc_format(b);
453
454         /*
455          * The keys might expand with the new format - if they wouldn't fit in
456          * the btree node anymore, use the old format for now:
457          */
458         if (!bch2_btree_node_format_fits(as->c, b, b->nr, &format))
459                 format = b->format;
460
461         SET_BTREE_NODE_SEQ(n->data, BTREE_NODE_SEQ(b->data) + 1);
462
463         btree_set_min(n, b->data->min_key);
464         btree_set_max(n, b->data->max_key);
465
466         n->data->format         = format;
467         btree_node_set_format(n, format);
468
469         bch2_btree_sort_into(as->c, n, b);
470
471         btree_node_reset_sib_u64s(n);
472         return n;
473 }
474
475 static struct btree *__btree_root_alloc(struct btree_update *as,
476                                 struct btree_trans *trans, unsigned level)
477 {
478         struct btree *b = bch2_btree_node_alloc(as, trans, level);
479
480         btree_set_min(b, POS_MIN);
481         btree_set_max(b, SPOS_MAX);
482         b->data->format = bch2_btree_calc_format(b);
483
484         btree_node_set_format(b, b->data->format);
485         bch2_btree_build_aux_trees(b);
486
487         return b;
488 }
489
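/*
 * Return unused preallocated nodes: stash their keys and open buckets in the
 * btree reserve cache when there's room, then free the in-memory nodes.
 */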
490 static void bch2_btree_reserve_put(struct btree_update *as, struct btree_trans *trans)
491 {
492         struct bch_fs *c = as->c;
493         struct prealloc_nodes *p;
494
495         for (p = as->prealloc_nodes;
496              p < as->prealloc_nodes + ARRAY_SIZE(as->prealloc_nodes);
497              p++) {
498                 while (p->nr) {
499                         struct btree *b = p->b[--p->nr];
500
501                         mutex_lock(&c->btree_reserve_cache_lock);
502
503                         if (c->btree_reserve_cache_nr <
504                             ARRAY_SIZE(c->btree_reserve_cache)) {
505                                 struct btree_alloc *a =
506                                         &c->btree_reserve_cache[c->btree_reserve_cache_nr++];
507
508                                 a->ob = b->ob;
509                                 b->ob.nr = 0;
510                                 bkey_copy(&a->k, &b->key);
511                         } else {
512                                 bch2_open_buckets_put(c, &b->ob);
513                         }
514
515                         mutex_unlock(&c->btree_reserve_cache_lock);
516
517                         btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent);
518                         btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write);
519                         __btree_node_free(trans, b);
520                         bch2_btree_node_to_freelist(c, b);
521                 }
522         }
523 }
524
525 static int bch2_btree_reserve_get(struct btree_trans *trans,
526                                   struct btree_update *as,
527                                   unsigned nr_nodes[2],
528                                   unsigned flags,
529                                   struct closure *cl)
530 {
531         struct btree *b;
532         unsigned interior;
533         int ret = 0;
534
535         BUG_ON(nr_nodes[0] + nr_nodes[1] > BTREE_RESERVE_MAX);
536
537         /*
538          * Protects reaping from the btree node cache and using the btree node
539          * open bucket reserve:
540          */
541         ret = bch2_btree_cache_cannibalize_lock(trans, cl);
542         if (ret)
543                 return ret;
544
545         for (interior = 0; interior < 2; interior++) {
546                 struct prealloc_nodes *p = as->prealloc_nodes + interior;
547
548                 while (p->nr < nr_nodes[interior]) {
549                         b = __bch2_btree_node_alloc(trans, &as->disk_res, cl,
550                                                     interior, flags);
551                         if (IS_ERR(b)) {
552                                 ret = PTR_ERR(b);
553                                 goto err;
554                         }
555
556                         p->b[p->nr++] = b;
557                 }
558         }
559 err:
560         bch2_btree_cache_cannibalize_unlock(trans);
561         return ret;
562 }
563
564 /* Asynchronous interior node update machinery */
565
566 static void bch2_btree_update_free(struct btree_update *as, struct btree_trans *trans)
567 {
568         struct bch_fs *c = as->c;
569
570         if (as->took_gc_lock)
571                 up_read(&c->gc_lock);
572         as->took_gc_lock = false;
573
574         bch2_journal_pin_drop(&c->journal, &as->journal);
575         bch2_journal_pin_flush(&c->journal, &as->journal);
576         bch2_disk_reservation_put(c, &as->disk_res);
577         bch2_btree_reserve_put(as, trans);
578
579         bch2_time_stats_update(&c->times[BCH_TIME_btree_interior_update_total],
580                                as->start_time);
581
582         mutex_lock(&c->btree_interior_update_lock);
583         list_del(&as->unwritten_list);
584         list_del(&as->list);
585
586         closure_debug_destroy(&as->cl);
587         mempool_free(as, &c->btree_interior_update_pool);
588
589         /*
590          * Have to do the wakeup with btree_interior_update_lock still held,
591          * since being on btree_interior_update_list is our ref on @c:
592          */
593         closure_wake_up(&c->btree_interior_update_wait);
594
595         mutex_unlock(&c->btree_interior_update_lock);
596 }
597
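/*
 * Append a copy of @b's key to @keys; the level the key lives at (the parent's
 * level) is stashed in the v2 pointer's mem_ptr field, where
 * btree_update_nodes_written_trans() finds it when running triggers.
 */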
598 static void btree_update_add_key(struct btree_update *as,
599                                  struct keylist *keys, struct btree *b)
600 {
601         struct bkey_i *k = &b->key;
602
603         BUG_ON(bch2_keylist_u64s(keys) + k->k.u64s >
604                ARRAY_SIZE(as->_old_keys));
605
606         bkey_copy(keys->top, k);
607         bkey_i_to_btree_ptr_v2(keys->top)->v.mem_ptr = b->c.level + 1;
608
609         bch2_keylist_push(keys);
610 }
611
612 static bool btree_update_new_nodes_marked_sb(struct btree_update *as)
613 {
614         for_each_keylist_key(&as->new_keys, k)
615                 if (!bch2_dev_btree_bitmap_marked(as->c, bkey_i_to_s_c(k)))
616                         return false;
617         return true;
618 }
619
620 static void btree_update_new_nodes_mark_sb(struct btree_update *as)
621 {
622         struct bch_fs *c = as->c;
623
624         mutex_lock(&c->sb_lock);
625         for_each_keylist_key(&as->new_keys, k)
626                 bch2_dev_btree_bitmap_mark(c, bkey_i_to_s_c(k));
627
628         bch2_write_super(c);
629         mutex_unlock(&c->sb_lock);
630 }
631
632 /*
633  * The transactional part of an interior btree node update, where we journal the
634  * update we did to the interior node and update alloc info:
635  */
636 static int btree_update_nodes_written_trans(struct btree_trans *trans,
637                                             struct btree_update *as)
638 {
639         struct jset_entry *e = bch2_trans_jset_entry_alloc(trans, as->journal_u64s);
640         int ret = PTR_ERR_OR_ZERO(e);
641         if (ret)
642                 return ret;
643
644         memcpy(e, as->journal_entries, as->journal_u64s * sizeof(u64));
645
646         trans->journal_pin = &as->journal;
647
648         for_each_keylist_key(&as->old_keys, k) {
649                 unsigned level = bkey_i_to_btree_ptr_v2(k)->v.mem_ptr;
650
651                 ret = bch2_key_trigger_old(trans, as->btree_id, level, bkey_i_to_s_c(k),
652                                            BTREE_TRIGGER_transactional);
653                 if (ret)
654                         return ret;
655         }
656
657         for_each_keylist_key(&as->new_keys, k) {
658                 unsigned level = bkey_i_to_btree_ptr_v2(k)->v.mem_ptr;
659
660                 ret = bch2_key_trigger_new(trans, as->btree_id, level, bkey_i_to_s(k),
661                                            BTREE_TRIGGER_transactional);
662                 if (ret)
663                         return ret;
664         }
665
666         return 0;
667 }
668
669 static void btree_update_nodes_written(struct btree_update *as)
670 {
671         struct bch_fs *c = as->c;
672         struct btree *b;
673         struct btree_trans *trans = bch2_trans_get(c);
674         u64 journal_seq = 0;
675         unsigned i;
676         int ret;
677
678         /*
679          * If we're already in an error state, it might be because a btree node
680          * was never written, and we might be trying to free that same btree
681          * node here, but it won't have been marked as allocated and we'll see
682          * spurious disk usage inconsistencies in the transactional part below
683          * if we don't skip it:
684          */
685         ret = bch2_journal_error(&c->journal);
686         if (ret)
687                 goto err;
688
689         if (!btree_update_new_nodes_marked_sb(as))
690                 btree_update_new_nodes_mark_sb(as);
691
692         /*
693          * Wait for any in flight writes to finish before we free the old nodes
694          * on disk:
695          */
696         for (i = 0; i < as->nr_old_nodes; i++) {
697                 __le64 seq;
698
699                 b = as->old_nodes[i];
700
701                 btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read);
702                 seq = b->data ? b->data->keys.seq : 0;
703                 six_unlock_read(&b->c.lock);
704
705                 if (seq == as->old_nodes_seq[i])
706                         wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight_inner,
707                                        TASK_UNINTERRUPTIBLE);
708         }
709
710         /*
711          * We did an update to a parent node where the pointers we added pointed
712          * to child nodes that weren't written yet: now, the child nodes have
713          * been written so we can write out the update to the interior node.
714          */
715
716         /*
717          * We can't call into journal reclaim here: we'd block on the journal
718          * reclaim lock, but we may need to release the open buckets we have
719          * pinned in order for other btree updates to make forward progress, and
720          * journal reclaim does btree updates when flushing bkey_cached entries,
721          * which may require allocations as well.
722          */
723         ret = commit_do(trans, &as->disk_res, &journal_seq,
724                         BCH_WATERMARK_interior_updates|
725                         BCH_TRANS_COMMIT_no_enospc|
726                         BCH_TRANS_COMMIT_no_check_rw|
727                         BCH_TRANS_COMMIT_journal_reclaim,
728                         btree_update_nodes_written_trans(trans, as));
729         bch2_trans_unlock(trans);
730
731         bch2_fs_fatal_err_on(ret && !bch2_journal_error(&c->journal), c,
732                              "%s", bch2_err_str(ret));
733 err:
734         /*
735          * Ensure transaction is unlocked before using btree_node_lock_nopath()
736          * (the use of which is always suspect, we need to work on removing this
737          * in the future)
738          *
739          * The transaction should already be unlocked, but bch2_path_get_unlocked_mut() ->
740          * bch2_path_get() calls bch2_path_upgrade() before we call path_make_mut(), so we
741          * may rarely end up with a locked path besides the one we have here:
742          */
743         bch2_trans_unlock(trans);
744         bch2_trans_begin(trans);
745
746         /*
747          * We have to be careful because another thread might be getting ready
748          * to free as->b and calling btree_update_reparent() on us - we'll
749          * recheck under btree_update_lock below:
750          */
751         b = READ_ONCE(as->b);
752         if (b) {
753                 /*
754                  * @b is the node we did the final insert into:
755                  *
756                  * On failure to get a journal reservation, we still have to
757                  * unblock the write and allow most of the write path to happen
758                  * so that shutdown works, but the i->journal_seq mechanism
759                  * won't work to prevent the btree write from being visible (we
760                  * didn't get a journal sequence number) - instead
761                  * __bch2_btree_node_write() doesn't do the actual write if
762                  * we're in journal error state:
763                  */
764
765                 btree_path_idx_t path_idx = bch2_path_get_unlocked_mut(trans,
766                                                 as->btree_id, b->c.level, b->key.k.p);
767                 struct btree_path *path = trans->paths + path_idx;
768                 btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent);
769                 mark_btree_node_locked(trans, path, b->c.level, BTREE_NODE_INTENT_LOCKED);
770                 path->l[b->c.level].lock_seq = six_lock_seq(&b->c.lock);
771                 path->l[b->c.level].b = b;
772
773                 bch2_btree_node_lock_write_nofail(trans, path, &b->c);
774
775                 mutex_lock(&c->btree_interior_update_lock);
776
777                 list_del(&as->write_blocked_list);
778                 if (list_empty(&b->write_blocked))
779                         clear_btree_node_write_blocked(b);
780
781                 /*
782                  * Node might have been freed, recheck under
783                  * btree_interior_update_lock:
784                  */
785                 if (as->b == b) {
786                         BUG_ON(!b->c.level);
787                         BUG_ON(!btree_node_dirty(b));
788
789                         if (!ret) {
790                                 struct bset *last = btree_bset_last(b);
791
792                                 last->journal_seq = cpu_to_le64(
793                                                              max(journal_seq,
794                                                                  le64_to_cpu(last->journal_seq)));
795
796                                 bch2_btree_add_journal_pin(c, b, journal_seq);
797                         } else {
798                                 /*
799                                  * If we didn't get a journal sequence number we
800                                  * can't write this btree node, because recovery
801                                  * won't know to ignore this write:
802                                  */
803                                 set_btree_node_never_write(b);
804                         }
805                 }
806
807                 mutex_unlock(&c->btree_interior_update_lock);
808
809                 mark_btree_node_locked_noreset(path, b->c.level, BTREE_NODE_INTENT_LOCKED);
810                 six_unlock_write(&b->c.lock);
811
812                 btree_node_write_if_need(c, b, SIX_LOCK_intent);
813                 btree_node_unlock(trans, path, b->c.level);
814                 bch2_path_put(trans, path_idx, true);
815         }
816
817         bch2_journal_pin_drop(&c->journal, &as->journal);
818
819         mutex_lock(&c->btree_interior_update_lock);
820         for (i = 0; i < as->nr_new_nodes; i++) {
821                 b = as->new_nodes[i];
822
823                 BUG_ON(b->will_make_reachable != (unsigned long) as);
824                 b->will_make_reachable = 0;
825                 clear_btree_node_will_make_reachable(b);
826         }
827         mutex_unlock(&c->btree_interior_update_lock);
828
829         for (i = 0; i < as->nr_new_nodes; i++) {
830                 b = as->new_nodes[i];
831
832                 btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read);
833                 btree_node_write_if_need(c, b, SIX_LOCK_read);
834                 six_unlock_read(&b->c.lock);
835         }
836
837         for (i = 0; i < as->nr_open_buckets; i++)
838                 bch2_open_bucket_put(c, c->open_buckets + as->open_buckets[i]);
839
840         bch2_btree_update_free(as, trans);
841         bch2_trans_put(trans);
842 }
843
844 static void btree_interior_update_work(struct work_struct *work)
845 {
846         struct bch_fs *c =
847                 container_of(work, struct bch_fs, btree_interior_update_work);
848         struct btree_update *as;
849
850         while (1) {
851                 mutex_lock(&c->btree_interior_update_lock);
852                 as = list_first_entry_or_null(&c->btree_interior_updates_unwritten,
853                                               struct btree_update, unwritten_list);
854                 if (as && !as->nodes_written)
855                         as = NULL;
856                 mutex_unlock(&c->btree_interior_update_lock);
857
858                 if (!as)
859                         break;
860
861                 btree_update_nodes_written(as);
862         }
863 }
864
865 static CLOSURE_CALLBACK(btree_update_set_nodes_written)
866 {
867         closure_type(as, struct btree_update, cl);
868         struct bch_fs *c = as->c;
869
870         mutex_lock(&c->btree_interior_update_lock);
871         as->nodes_written = true;
872         mutex_unlock(&c->btree_interior_update_lock);
873
874         queue_work(c->btree_interior_update_worker, &c->btree_interior_update_work);
875 }
876
877 /*
878  * We're updating @b with pointers to nodes that haven't finished writing yet:
879  * block @b from being written until @as completes
880  */
881 static void btree_update_updated_node(struct btree_update *as, struct btree *b)
882 {
883         struct bch_fs *c = as->c;
884
885         BUG_ON(as->mode != BTREE_UPDATE_none);
886         BUG_ON(as->update_level_end < b->c.level);
887         BUG_ON(!btree_node_dirty(b));
888         BUG_ON(!b->c.level);
889
890         mutex_lock(&c->btree_interior_update_lock);
891         list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten);
892
893         as->mode        = BTREE_UPDATE_node;
894         as->b           = b;
895         as->update_level_end = b->c.level;
896
897         set_btree_node_write_blocked(b);
898         list_add(&as->write_blocked_list, &b->write_blocked);
899
900         mutex_unlock(&c->btree_interior_update_lock);
901 }
902
903 static int bch2_update_reparent_journal_pin_flush(struct journal *j,
904                                 struct journal_entry_pin *_pin, u64 seq)
905 {
906         return 0;
907 }
908
909 static void btree_update_reparent(struct btree_update *as,
910                                   struct btree_update *child)
911 {
912         struct bch_fs *c = as->c;
913
914         lockdep_assert_held(&c->btree_interior_update_lock);
915
916         child->b = NULL;
917         child->mode = BTREE_UPDATE_update;
918
919         bch2_journal_pin_copy(&c->journal, &as->journal, &child->journal,
920                               bch2_update_reparent_journal_pin_flush);
921 }
922
923 static void btree_update_updated_root(struct btree_update *as, struct btree *b)
924 {
925         struct bkey_i *insert = &b->key;
926         struct bch_fs *c = as->c;
927
928         BUG_ON(as->mode != BTREE_UPDATE_none);
929
930         BUG_ON(as->journal_u64s + jset_u64s(insert->k.u64s) >
931                ARRAY_SIZE(as->journal_entries));
932
933         as->journal_u64s +=
934                 journal_entry_set((void *) &as->journal_entries[as->journal_u64s],
935                                   BCH_JSET_ENTRY_btree_root,
936                                   b->c.btree_id, b->c.level,
937                                   insert, insert->k.u64s);
938
939         mutex_lock(&c->btree_interior_update_lock);
940         list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten);
941
942         as->mode        = BTREE_UPDATE_root;
943         mutex_unlock(&c->btree_interior_update_lock);
944 }
945
946 /*
947  * bch2_btree_update_add_new_node:
948  *
949  * This causes @as to wait on @b to be written, before it gets to
950  * bch2_btree_update_nodes_written
951  *
952  * Additionally, it sets b->will_make_reachable to prevent any additional writes
953  * to @b from happening besides the first until @b is reachable on disk
954  *
955  * And it adds @b to the list of @as's new nodes, so that we can update sector
956  * counts in bch2_btree_update_nodes_written:
957  */
958 static void bch2_btree_update_add_new_node(struct btree_update *as, struct btree *b)
959 {
960         struct bch_fs *c = as->c;
961
962         closure_get(&as->cl);
963
964         mutex_lock(&c->btree_interior_update_lock);
965         BUG_ON(as->nr_new_nodes >= ARRAY_SIZE(as->new_nodes));
966         BUG_ON(b->will_make_reachable);
967
968         as->new_nodes[as->nr_new_nodes++] = b;
969         b->will_make_reachable = 1UL|(unsigned long) as;
970         set_btree_node_will_make_reachable(b);
971
972         mutex_unlock(&c->btree_interior_update_lock);
973
974         btree_update_add_key(as, &as->new_keys, b);
975
976         if (b->key.k.type == KEY_TYPE_btree_ptr_v2) {
977                 unsigned bytes = vstruct_end(&b->data->keys) - (void *) b->data;
978                 unsigned sectors = round_up(bytes, block_bytes(c)) >> 9;
979
980                 bkey_i_to_btree_ptr_v2(&b->key)->v.sectors_written =
981                         cpu_to_le16(sectors);
982         }
983 }
984
985 /*
986  * If @b was a new node, drop it from its btree_update's list of new nodes:
987  */
988 static void btree_update_drop_new_node(struct bch_fs *c, struct btree *b)
989 {
990         struct btree_update *as;
991         unsigned long v;
992         unsigned i;
993
994         mutex_lock(&c->btree_interior_update_lock);
995         /*
996          * When b->will_make_reachable != 0, it owns a ref on as->cl that's
997          * dropped when it gets written by bch2_btree_complete_write - the
998          * xchg() is for synchronization with bch2_btree_complete_write:
999          */
1000         v = xchg(&b->will_make_reachable, 0);
1001         clear_btree_node_will_make_reachable(b);
1002         as = (struct btree_update *) (v & ~1UL);
1003
1004         if (!as) {
1005                 mutex_unlock(&c->btree_interior_update_lock);
1006                 return;
1007         }
1008
1009         for (i = 0; i < as->nr_new_nodes; i++)
1010                 if (as->new_nodes[i] == b)
1011                         goto found;
1012
1013         BUG();
1014 found:
1015         array_remove_item(as->new_nodes, as->nr_new_nodes, i);
1016         mutex_unlock(&c->btree_interior_update_lock);
1017
1018         if (v & 1)
1019                 closure_put(&as->cl);
1020 }
1021
1022 static void bch2_btree_update_get_open_buckets(struct btree_update *as, struct btree *b)
1023 {
1024         while (b->ob.nr)
1025                 as->open_buckets[as->nr_open_buckets++] =
1026                         b->ob.v[--b->ob.nr];
1027 }
1028
1029 static int bch2_btree_update_will_free_node_journal_pin_flush(struct journal *j,
1030                                 struct journal_entry_pin *_pin, u64 seq)
1031 {
1032         return 0;
1033 }
1034
1035 /*
1036  * @b is being split/rewritten: it may have pointers to not-yet-written btree
1037  * nodes and thus outstanding btree_updates - redirect @b's
1038  * btree_updates to point to this btree_update:
1039  */
1040 static void bch2_btree_interior_update_will_free_node(struct btree_update *as,
1041                                                       struct btree *b)
1042 {
1043         struct bch_fs *c = as->c;
1044         struct btree_update *p, *n;
1045         struct btree_write *w;
1046
1047         set_btree_node_dying(b);
1048
1049         if (btree_node_fake(b))
1050                 return;
1051
1052         mutex_lock(&c->btree_interior_update_lock);
1053
1054         /*
1055          * Does this node have any btree_update operations preventing
1056          * it from being written?
1057          *
1058          * If so, redirect them to point to this btree_update: we can
1059          * write out our new nodes, but we won't make them visible until those
1060          * operations complete
1061          */
1062         list_for_each_entry_safe(p, n, &b->write_blocked, write_blocked_list) {
1063                 list_del_init(&p->write_blocked_list);
1064                 btree_update_reparent(as, p);
1065
1066                 /*
1067                  * for flush_held_btree_writes() waiting on updates to flush or
1068                  * nodes to be writeable:
1069                  */
1070                 closure_wake_up(&c->btree_interior_update_wait);
1071         }
1072
1073         clear_btree_node_dirty_acct(c, b);
1074         clear_btree_node_need_write(b);
1075         clear_btree_node_write_blocked(b);
1076
1077         /*
1078          * Does this node have unwritten data that has a pin on the journal?
1079          *
1080          * If so, transfer that pin to the btree_update operation -
1081          * note that if we're freeing multiple nodes, we only need to keep the
1082          * oldest pin of any of the nodes we're freeing. We'll release the pin
1083          * when the new nodes are persistent and reachable on disk:
1084          */
1085         w = btree_current_write(b);
1086         bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal,
1087                               bch2_btree_update_will_free_node_journal_pin_flush);
1088         bch2_journal_pin_drop(&c->journal, &w->journal);
1089
1090         w = btree_prev_write(b);
1091         bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal,
1092                               bch2_btree_update_will_free_node_journal_pin_flush);
1093         bch2_journal_pin_drop(&c->journal, &w->journal);
1094
1095         mutex_unlock(&c->btree_interior_update_lock);
1096
1097         /*
1098          * Is this a node that isn't reachable on disk yet?
1099          *
1100          * Nodes that aren't reachable yet have writes blocked until they're
1101          * reachable - now that we've cancelled any pending writes and moved
1102          * things waiting on that write to wait on this update, we can drop this
1103          * node from the list of nodes that the other update is making
1104          * reachable, prior to freeing it:
1105          */
1106         btree_update_drop_new_node(c, b);
1107
1108         btree_update_add_key(as, &as->old_keys, b);
1109
1110         as->old_nodes[as->nr_old_nodes] = b;
1111         as->old_nodes_seq[as->nr_old_nodes] = b->data->keys.seq;
1112         as->nr_old_nodes++;
1113 }
1114
1115 static void bch2_btree_update_done(struct btree_update *as, struct btree_trans *trans)
1116 {
1117         struct bch_fs *c = as->c;
1118         u64 start_time = as->start_time;
1119
1120         BUG_ON(as->mode == BTREE_UPDATE_none);
1121
1122         if (as->took_gc_lock)
1123                 up_read(&as->c->gc_lock);
1124         as->took_gc_lock = false;
1125
1126         bch2_btree_reserve_put(as, trans);
1127
1128         continue_at(&as->cl, btree_update_set_nodes_written,
1129                     as->c->btree_interior_update_worker);
1130
1131         bch2_time_stats_update(&c->times[BCH_TIME_btree_interior_update_foreground],
1132                                start_time);
1133 }
1134
1135 static struct btree_update *
1136 bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
1137                         unsigned level_start, bool split, unsigned flags)
1138 {
1139         struct bch_fs *c = trans->c;
1140         struct btree_update *as;
1141         u64 start_time = local_clock();
1142         int disk_res_flags = (flags & BCH_TRANS_COMMIT_no_enospc)
1143                 ? BCH_DISK_RESERVATION_NOFAIL : 0;
1144         unsigned nr_nodes[2] = { 0, 0 };
1145         unsigned level_end = level_start;
1146         enum bch_watermark watermark = flags & BCH_WATERMARK_MASK;
1147         int ret = 0;
1148         u32 restart_count = trans->restart_count;
1149
1150         BUG_ON(!path->should_be_locked);
1151
1152         if (watermark == BCH_WATERMARK_copygc)
1153                 watermark = BCH_WATERMARK_btree_copygc;
1154         if (watermark < BCH_WATERMARK_btree)
1155                 watermark = BCH_WATERMARK_btree;
1156
1157         flags &= ~BCH_WATERMARK_MASK;
1158         flags |= watermark;
1159
1160         if (watermark < BCH_WATERMARK_reclaim &&
1161             test_bit(JOURNAL_space_low, &c->journal.flags)) {
1162                 if (flags & BCH_TRANS_COMMIT_journal_reclaim)
1163                         return ERR_PTR(-BCH_ERR_journal_reclaim_would_deadlock);
1164
1165                 ret = drop_locks_do(trans,
1166                         ({ wait_event(c->journal.wait, !test_bit(JOURNAL_space_low, &c->journal.flags)); 0; }));
1167                 if (ret)
1168                         return ERR_PTR(ret);
1169         }
1170
1171         while (1) {
1172                 nr_nodes[!!level_end] += 1 + split;
1173                 level_end++;
1174
1175                 ret = bch2_btree_path_upgrade(trans, path, level_end + 1);
1176                 if (ret)
1177                         return ERR_PTR(ret);
1178
1179                 if (!btree_path_node(path, level_end)) {
1180                         /* Allocating new root? */
1181                         nr_nodes[1] += split;
1182                         level_end = BTREE_MAX_DEPTH;
1183                         break;
1184                 }
1185
1186                 /*
1187                  * Always check for space for two keys, even if we won't have to
1188                  * split at prior level - it might have been a merge instead:
1189                  */
1190                 if (bch2_btree_node_insert_fits(path->l[level_end].b,
1191                                                 BKEY_BTREE_PTR_U64s_MAX * 2))
1192                         break;
1193
1194                 split = path->l[level_end].b->nr.live_u64s > BTREE_SPLIT_THRESHOLD(c);
1195         }
1196
1197         if (!down_read_trylock(&c->gc_lock)) {
1198                 ret = drop_locks_do(trans, (down_read(&c->gc_lock), 0));
1199                 if (ret) {
1200                         up_read(&c->gc_lock);
1201                         return ERR_PTR(ret);
1202                 }
1203         }
1204
1205         as = mempool_alloc(&c->btree_interior_update_pool, GFP_NOFS);
1206         memset(as, 0, sizeof(*as));
1207         closure_init(&as->cl, NULL);
1208         as->c                   = c;
1209         as->start_time          = start_time;
1210         as->ip_started          = _RET_IP_;
1211         as->mode                = BTREE_UPDATE_none;
1212         as->flags               = flags;
1213         as->took_gc_lock        = true;
1214         as->btree_id            = path->btree_id;
1215         as->update_level_start  = level_start;
1216         as->update_level_end    = level_end;
1217         INIT_LIST_HEAD(&as->list);
1218         INIT_LIST_HEAD(&as->unwritten_list);
1219         INIT_LIST_HEAD(&as->write_blocked_list);
1220         bch2_keylist_init(&as->old_keys, as->_old_keys);
1221         bch2_keylist_init(&as->new_keys, as->_new_keys);
1222         bch2_keylist_init(&as->parent_keys, as->inline_keys);
1223
1224         mutex_lock(&c->btree_interior_update_lock);
1225         list_add_tail(&as->list, &c->btree_interior_update_list);
1226         mutex_unlock(&c->btree_interior_update_lock);
1227
1228         /*
1229          * We don't want to allocate if we're in an error state, that can cause
1230          * deadlock on emergency shutdown due to open buckets getting stuck in
1231          * the btree_reserve_cache after allocator shutdown has cleared it out.
1232          * This check needs to come after adding us to the btree_interior_update
1233          * list but before calling bch2_btree_reserve_get, to synchronize with
1234          * __bch2_fs_read_only().
1235          */
1236         ret = bch2_journal_error(&c->journal);
1237         if (ret)
1238                 goto err;
1239
1240         ret = bch2_disk_reservation_get(c, &as->disk_res,
1241                         (nr_nodes[0] + nr_nodes[1]) * btree_sectors(c),
1242                         c->opts.metadata_replicas,
1243                         disk_res_flags);
1244         if (ret)
1245                 goto err;
1246
1247         ret = bch2_btree_reserve_get(trans, as, nr_nodes, flags, NULL);
1248         if (bch2_err_matches(ret, ENOSPC) ||
1249             bch2_err_matches(ret, ENOMEM)) {
1250                 struct closure cl;
1251
1252                 /*
1253                  * XXX: this should probably be a separate BTREE_INSERT_NONBLOCK
1254                  * flag
1255                  */
1256                 if (bch2_err_matches(ret, ENOSPC) &&
1257                     (flags & BCH_TRANS_COMMIT_journal_reclaim) &&
1258                     watermark < BCH_WATERMARK_reclaim) {
1259                         ret = -BCH_ERR_journal_reclaim_would_deadlock;
1260                         goto err;
1261                 }
1262
1263                 closure_init_stack(&cl);
1264
1265                 do {
1266                         ret = bch2_btree_reserve_get(trans, as, nr_nodes, flags, &cl);
1267
1268                         bch2_trans_unlock(trans);
1269                         bch2_wait_on_allocator(c, &cl);
1270                 } while (bch2_err_matches(ret, BCH_ERR_operation_blocked));
1271         }
1272
1273         if (ret) {
1274                 trace_and_count(c, btree_reserve_get_fail, trans->fn,
1275                                 _RET_IP_, nr_nodes[0] + nr_nodes[1], ret);
1276                 goto err;
1277         }
1278
1279         ret = bch2_trans_relock(trans);
1280         if (ret)
1281                 goto err;
1282
1283         bch2_trans_verify_not_restarted(trans, restart_count);
1284         return as;
1285 err:
1286         bch2_btree_update_free(as, trans);
1287         if (!bch2_err_matches(ret, ENOSPC) &&
1288             !bch2_err_matches(ret, EROFS) &&
1289             ret != -BCH_ERR_journal_reclaim_would_deadlock)
1290                 bch_err_fn_ratelimited(c, ret);
1291         return ERR_PTR(ret);
1292 }
1293
1294 /* Btree root updates: */
1295
1296 static void bch2_btree_set_root_inmem(struct bch_fs *c, struct btree *b)
1297 {
1298         /* Root nodes cannot be reaped */
1299         mutex_lock(&c->btree_cache.lock);
1300         list_del_init(&b->list);
1301         mutex_unlock(&c->btree_cache.lock);
1302
1303         mutex_lock(&c->btree_root_lock);
1304         bch2_btree_id_root(c, b->c.btree_id)->b = b;
1305         mutex_unlock(&c->btree_root_lock);
1306
1307         bch2_recalc_btree_reserve(c);
1308 }
1309
1310 static int bch2_btree_set_root(struct btree_update *as,
1311                                struct btree_trans *trans,
1312                                struct btree_path *path,
1313                                struct btree *b,
1314                                bool nofail)
1315 {
1316         struct bch_fs *c = as->c;
1317
1318         trace_and_count(c, btree_node_set_root, trans, b);
1319
1320         struct btree *old = btree_node_root(c, b);
1321
1322         /*
1323          * Ensure no one is using the old root while we switch to the
1324          * new root:
1325          */
1326         if (nofail) {
1327                 bch2_btree_node_lock_write_nofail(trans, path, &old->c);
1328         } else {
1329                 int ret = bch2_btree_node_lock_write(trans, path, &old->c);
1330                 if (ret)
1331                         return ret;
1332         }
1333
1334         bch2_btree_set_root_inmem(c, b);
1335
1336         btree_update_updated_root(as, b);
1337
1338         /*
1339          * Unlock old root after new root is visible:
1340          *
1341          * The new root isn't persistent, but that's ok: we still have
1342          * an intent lock on the new root, and any updates that would
1343          * depend on the new root would have to update the new root.
1344          */
1345         bch2_btree_node_unlock_write(trans, path, old);
1346         return 0;
1347 }
1348
1349 /* Interior node updates: */
1350
1351 static void bch2_insert_fixup_btree_ptr(struct btree_update *as,
1352                                         struct btree_trans *trans,
1353                                         struct btree_path *path,
1354                                         struct btree *b,
1355                                         struct btree_node_iter *node_iter,
1356                                         struct bkey_i *insert)
1357 {
1358         struct bch_fs *c = as->c;
1359         struct bkey_packed *k;
1360         struct printbuf buf = PRINTBUF;
1361         unsigned long old, new;
1362
1363         BUG_ON(insert->k.type == KEY_TYPE_btree_ptr_v2 &&
1364                !btree_ptr_sectors_written(bkey_i_to_s_c(insert)));
1365
1366         if (unlikely(!test_bit(JOURNAL_replay_done, &c->journal.flags)))
1367                 bch2_journal_key_overwritten(c, b->c.btree_id, b->c.level, insert->k.p);
1368
1369         if (bch2_bkey_validate(c, bkey_i_to_s_c(insert),
1370                               btree_node_type(b), BCH_VALIDATE_write) ?:
1371             bch2_bkey_in_btree_node(c, b, bkey_i_to_s_c(insert), BCH_VALIDATE_write)) {
1372                 bch2_fs_inconsistent(c, "%s: inserting invalid bkey", __func__);
1373                 dump_stack();
1374         }
1375
1376         BUG_ON(as->journal_u64s + jset_u64s(insert->k.u64s) >
1377                ARRAY_SIZE(as->journal_entries));
1378
1379         as->journal_u64s +=
1380                 journal_entry_set((void *) &as->journal_entries[as->journal_u64s],
1381                                   BCH_JSET_ENTRY_btree_keys,
1382                                   b->c.btree_id, b->c.level,
1383                                   insert, insert->k.u64s);
1384
1385         while ((k = bch2_btree_node_iter_peek_all(node_iter, b)) &&
1386                bkey_iter_pos_cmp(b, k, &insert->k.p) < 0)
1387                 bch2_btree_node_iter_advance(node_iter, b);
1388
1389         bch2_btree_bset_insert_key(trans, path, b, node_iter, insert);
1390         set_btree_node_dirty_acct(c, b);
1391
1392         old = READ_ONCE(b->flags);
1393         do {
1394                 new = old;
1395
1396                 new &= ~BTREE_WRITE_TYPE_MASK;
1397                 new |= BTREE_WRITE_interior;
1398                 new |= 1 << BTREE_NODE_need_write;
1399         } while (!try_cmpxchg(&b->flags, &old, new));
1400
1401         printbuf_exit(&buf);
1402 }
1403
1404 static void
1405 bch2_btree_insert_keys_interior(struct btree_update *as,
1406                                 struct btree_trans *trans,
1407                                 struct btree_path *path,
1408                                 struct btree *b,
1409                                 struct btree_node_iter node_iter,
1410                                 struct keylist *keys)
1411 {
1412         struct bkey_i *insert = bch2_keylist_front(keys);
1413         struct bkey_packed *k;
1414
1415         BUG_ON(btree_node_type(b) != BKEY_TYPE_btree);
1416
1417         while ((k = bch2_btree_node_iter_prev_all(&node_iter, b)) &&
1418                (bkey_cmp_left_packed(b, k, &insert->k.p) >= 0))
1419                 ;
1420
1421         while (!bch2_keylist_empty(keys)) {
1422                 insert = bch2_keylist_front(keys);
1423
1424                 if (bpos_gt(insert->k.p, b->key.k.p))
1425                         break;
1426
1427                 bch2_insert_fixup_btree_ptr(as, trans, path, b, &node_iter, insert);
1428                 bch2_keylist_pop_front(keys);
1429         }
1430 }
1431
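/* Does @insert_keys contain a whiteout (deleted key) at @pos? */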
1432 static bool key_deleted_in_insert(struct keylist *insert_keys, struct bpos pos)
1433 {
1434         if (insert_keys)
1435                 for_each_keylist_key(insert_keys, k)
1436                         if (bkey_deleted(&k->k) && bpos_eq(k->k.p, pos))
1437                                 return true;
1438         return false;
1439 }
1440
1441 /*
1442  * Distribute the keys of the node being split between the two new nodes:
1443  * n[0] gets the lower part of the key range, n[1] the upper part.
1444  */
1445 static void __btree_split_node(struct btree_update *as,
1446                                struct btree_trans *trans,
1447                                struct btree *b,
1448                                struct btree *n[2],
1449                                struct keylist *insert_keys)
1450 {
1451         struct bkey_packed *k;
1452         struct bpos n1_pos = POS_MIN;
1453         struct btree_node_iter iter;
1454         struct bset *bsets[2];
1455         struct bkey_format_state format[2];
1456         struct bkey_packed *out[2];
1457         struct bkey uk;
1458         unsigned u64s, n1_u64s = (b->nr.live_u64s * 3) / 5;
1459         struct { unsigned nr_keys, val_u64s; } nr_keys[2];
1460         int i;
1461
1462         memset(&nr_keys, 0, sizeof(nr_keys));
1463
1464         for (i = 0; i < 2; i++) {
1465                 BUG_ON(n[i]->nsets != 1);
1466
1467                 bsets[i] = btree_bset_first(n[i]);
1468                 out[i] = bsets[i]->start;
1469
1470                 SET_BTREE_NODE_SEQ(n[i]->data, BTREE_NODE_SEQ(b->data) + 1);
1471                 bch2_bkey_format_init(&format[i]);
1472         }
1473
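        /*
         * First pass: pick the pivot (roughly 60% of the live u64s go to the
         * lower node; for interior nodes the boundary is nudged past keys that
         * are being deleted, per the journal or @insert_keys) and accumulate a
         * packed key format for each new node.
         */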
1474         u64s = 0;
1475         for_each_btree_node_key(b, k, &iter) {
1476                 if (bkey_deleted(k))
1477                         continue;
1478
1479                 uk = bkey_unpack_key(b, k);
1480
1481                 if (b->c.level &&
1482                     u64s < n1_u64s &&
1483                     u64s + k->u64s >= n1_u64s &&
1484                     (bch2_key_deleted_in_journal(trans, b->c.btree_id, b->c.level, uk.p) ||
1485                      key_deleted_in_insert(insert_keys, uk.p)))
1486                         n1_u64s += k->u64s;
1487
1488                 i = u64s >= n1_u64s;
1489                 u64s += k->u64s;
1490                 if (!i)
1491                         n1_pos = uk.p;
1492                 bch2_bkey_format_add_key(&format[i], &uk);
1493
1494                 nr_keys[i].nr_keys++;
1495                 nr_keys[i].val_u64s += bkeyp_val_u64s(&b->format, k);
1496         }
1497
1498         btree_set_min(n[0], b->data->min_key);
1499         btree_set_max(n[0], n1_pos);
1500         btree_set_min(n[1], bpos_successor(n1_pos));
1501         btree_set_max(n[1], b->data->max_key);
1502
1503         for (i = 0; i < 2; i++) {
1504                 bch2_bkey_format_add_pos(&format[i], n[i]->data->min_key);
1505                 bch2_bkey_format_add_pos(&format[i], n[i]->data->max_key);
1506
1507                 n[i]->data->format = bch2_bkey_format_done(&format[i]);
1508
1509                 unsigned u64s = nr_keys[i].nr_keys * n[i]->data->format.key_u64s +
1510                         nr_keys[i].val_u64s;
1511                 if (__vstruct_bytes(struct btree_node, u64s) > btree_buf_bytes(b))
1512                         n[i]->data->format = b->format;
1513
1514                 btree_node_set_format(n[i], n[i]->data->format);
1515         }
1516
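        /*
         * Second pass: copy keys into the new nodes, repacking them into each
         * node's new format when possible, otherwise storing them unpacked.
         */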
1517         u64s = 0;
1518         for_each_btree_node_key(b, k, &iter) {
1519                 if (bkey_deleted(k))
1520                         continue;
1521
1522                 i = u64s >= n1_u64s;
1523                 u64s += k->u64s;
1524
1525                 if (bch2_bkey_transform(&n[i]->format, out[i], bkey_packed(k)
1526                                         ? &b->format: &bch2_bkey_format_current, k))
1527                         out[i]->format = KEY_FORMAT_LOCAL_BTREE;
1528                 else
1529                         bch2_bkey_unpack(b, (void *) out[i], k);
1530
1531                 out[i]->needs_whiteout = false;
1532
1533                 btree_keys_account_key_add(&n[i]->nr, 0, out[i]);
1534                 out[i] = bkey_p_next(out[i]);
1535         }
1536
1537         for (i = 0; i < 2; i++) {
1538                 bsets[i]->u64s = cpu_to_le16((u64 *) out[i] - bsets[i]->_data);
1539
1540                 BUG_ON(!bsets[i]->u64s);
1541
1542                 set_btree_bset_end(n[i], n[i]->set);
1543
1544                 btree_node_reset_sib_u64s(n[i]);
1545
1546                 bch2_verify_btree_nr_keys(n[i]);
1547
1548                 BUG_ON(bch2_btree_node_check_topology(trans, n[i]));
1549         }
1550 }
1551
1552 /*
1553  * For updates to interior nodes, we have to do the insert before we split:
1554  * the keys being inserted must go in atomically. If we split first, the keys
1555  * might end up having to go in different nodes, and the insert would no
1556  * longer be atomic.
1557  *
1558  * Worse, if the insert comes from btree node coalescing and we do the insert
1559  * after the split (and pick the pivot), the pivot we pick might fall between
1560  * nodes that were coalesced, and thus in the middle of a child node post
1561  * coalescing:
1562  */
1563 static void btree_split_insert_keys(struct btree_update *as,
1564                                     struct btree_trans *trans,
1565                                     btree_path_idx_t path_idx,
1566                                     struct btree *b,
1567                                     struct keylist *keys)
1568 {
1569         struct btree_path *path = trans->paths + path_idx;
1570
1571         if (!bch2_keylist_empty(keys) &&
1572             bpos_le(bch2_keylist_front(keys)->k.p, b->data->max_key)) {
1573                 struct btree_node_iter node_iter;
1574
1575                 bch2_btree_node_iter_init(&node_iter, b, &bch2_keylist_front(keys)->k.p);
1576
1577                 bch2_btree_insert_keys_interior(as, trans, path, b, node_iter, keys);
1578
1579                 BUG_ON(bch2_btree_node_check_topology(trans, b));
1580         }
1581 }
1582
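/*
 * Split (or compact) node @b: if it's over the split threshold, allocate two
 * new nodes and distribute its keys between them - allocating a new root as
 * well if @b was the root - otherwise allocate a single replacement node.
 * Pending @keys are inserted into the new node(s), the parent (or root) is
 * updated, the new nodes are written out, and the old node is freed in memory.
 */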
1583 static int btree_split(struct btree_update *as, struct btree_trans *trans,
1584                        btree_path_idx_t path, struct btree *b,
1585                        struct keylist *keys)
1586 {
1587         struct bch_fs *c = as->c;
1588         struct btree *parent = btree_node_parent(trans->paths + path, b);
1589         struct btree *n1, *n2 = NULL, *n3 = NULL;
1590         btree_path_idx_t path1 = 0, path2 = 0;
1591         u64 start_time = local_clock();
1592         int ret = 0;
1593
1594         bch2_verify_btree_nr_keys(b);
1595         BUG_ON(!parent && (b != btree_node_root(c, b)));
1596         BUG_ON(parent && !btree_node_intent_locked(trans->paths + path, b->c.level + 1));
1597
1598         ret = bch2_btree_node_check_topology(trans, b);
1599         if (ret)
1600                 return ret;
1601
1602         bch2_btree_interior_update_will_free_node(as, b);
1603
1604         if (b->nr.live_u64s > BTREE_SPLIT_THRESHOLD(c)) {
1605                 struct btree *n[2];
1606
1607                 trace_and_count(c, btree_node_split, trans, b);
1608
1609                 n[0] = n1 = bch2_btree_node_alloc(as, trans, b->c.level);
1610                 n[1] = n2 = bch2_btree_node_alloc(as, trans, b->c.level);
1611
1612                 __btree_split_node(as, trans, b, n, keys);
1613
1614                 if (keys) {
1615                         btree_split_insert_keys(as, trans, path, n1, keys);
1616                         btree_split_insert_keys(as, trans, path, n2, keys);
1617                         BUG_ON(!bch2_keylist_empty(keys));
1618                 }
1619
1620                 bch2_btree_build_aux_trees(n2);
1621                 bch2_btree_build_aux_trees(n1);
1622
1623                 bch2_btree_update_add_new_node(as, n1);
1624                 bch2_btree_update_add_new_node(as, n2);
1625                 six_unlock_write(&n2->c.lock);
1626                 six_unlock_write(&n1->c.lock);
1627
1628                 path1 = bch2_path_get_unlocked_mut(trans, as->btree_id, n1->c.level, n1->key.k.p);
1629                 six_lock_increment(&n1->c.lock, SIX_LOCK_intent);
1630                 mark_btree_node_locked(trans, trans->paths + path1, n1->c.level, BTREE_NODE_INTENT_LOCKED);
1631                 bch2_btree_path_level_init(trans, trans->paths + path1, n1);
1632
1633                 path2 = bch2_path_get_unlocked_mut(trans, as->btree_id, n2->c.level, n2->key.k.p);
1634                 six_lock_increment(&n2->c.lock, SIX_LOCK_intent);
1635                 mark_btree_node_locked(trans, trans->paths + path2, n2->c.level, BTREE_NODE_INTENT_LOCKED);
1636                 bch2_btree_path_level_init(trans, trans->paths + path2, n2);
1637
1638                 /*
1639                  * Note that on recursive splits, parent_keys == keys, so we
1640                  * can't start adding new keys to parent_keys before emptying it
1641                  * out (which we did with btree_split_insert_keys() above)
1642                  */
1643                 bch2_keylist_add(&as->parent_keys, &n1->key);
1644                 bch2_keylist_add(&as->parent_keys, &n2->key);
1645
1646                 if (!parent) {
1647                         /* Depth increases, make a new root */
1648                         n3 = __btree_root_alloc(as, trans, b->c.level + 1);
1649
1650                         bch2_btree_update_add_new_node(as, n3);
1651                         six_unlock_write(&n3->c.lock);
1652
1653                         trans->paths[path2].locks_want++;
1654                         BUG_ON(btree_node_locked(trans->paths + path2, n3->c.level));
1655                         six_lock_increment(&n3->c.lock, SIX_LOCK_intent);
1656                         mark_btree_node_locked(trans, trans->paths + path2, n3->c.level, BTREE_NODE_INTENT_LOCKED);
1657                         bch2_btree_path_level_init(trans, trans->paths + path2, n3);
1658
1659                         n3->sib_u64s[0] = U16_MAX;
1660                         n3->sib_u64s[1] = U16_MAX;
1661
1662                         btree_split_insert_keys(as, trans, path, n3, &as->parent_keys);
1663                 }
1664         } else {
1665                 trace_and_count(c, btree_node_compact, trans, b);
1666
1667                 n1 = bch2_btree_node_alloc_replacement(as, trans, b);
1668
1669                 if (keys) {
1670                         btree_split_insert_keys(as, trans, path, n1, keys);
1671                         BUG_ON(!bch2_keylist_empty(keys));
1672                 }
1673
1674                 bch2_btree_build_aux_trees(n1);
1675                 bch2_btree_update_add_new_node(as, n1);
1676                 six_unlock_write(&n1->c.lock);
1677
1678                 path1 = bch2_path_get_unlocked_mut(trans, as->btree_id, n1->c.level, n1->key.k.p);
1679                 six_lock_increment(&n1->c.lock, SIX_LOCK_intent);
1680                 mark_btree_node_locked(trans, trans->paths + path1, n1->c.level, BTREE_NODE_INTENT_LOCKED);
1681                 bch2_btree_path_level_init(trans, trans->paths + path1, n1);
1682
1683                 if (parent)
1684                         bch2_keylist_add(&as->parent_keys, &n1->key);
1685         }
1686
1687         /* New nodes are fully initialized, now make them visible: */
1688
1689         if (parent) {
1690                 /* Split a non root node */
1691                 ret = bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys);
1692         } else if (n3) {
1693                 ret = bch2_btree_set_root(as, trans, trans->paths + path, n3, false);
1694         } else {
1695                 /* Root filled up but didn't need to be split */
1696                 ret = bch2_btree_set_root(as, trans, trans->paths + path, n1, false);
1697         }
1698
1699         if (ret)
1700                 goto err;
1701
1702         if (n3) {
1703                 bch2_btree_update_get_open_buckets(as, n3);
1704                 bch2_btree_node_write(c, n3, SIX_LOCK_intent, 0);
1705         }
1706         if (n2) {
1707                 bch2_btree_update_get_open_buckets(as, n2);
1708                 bch2_btree_node_write(c, n2, SIX_LOCK_intent, 0);
1709         }
1710         bch2_btree_update_get_open_buckets(as, n1);
1711         bch2_btree_node_write(c, n1, SIX_LOCK_intent, 0);
1712
1713         /*
1714          * The old node must be freed (in memory) _before_ unlocking the new
1715          * nodes - else another thread could re-acquire a read lock on the old
1716          * node after another thread has locked and updated the new node, thus
1717          * seeing stale data:
1718          */
1719         bch2_btree_node_free_inmem(trans, trans->paths + path, b);
1720
1721         if (n3)
1722                 bch2_trans_node_add(trans, trans->paths + path, n3);
1723         if (n2)
1724                 bch2_trans_node_add(trans, trans->paths + path2, n2);
1725         bch2_trans_node_add(trans, trans->paths + path1, n1);
1726
1727         if (n3)
1728                 six_unlock_intent(&n3->c.lock);
1729         if (n2)
1730                 six_unlock_intent(&n2->c.lock);
1731         six_unlock_intent(&n1->c.lock);
1732 out:
1733         if (path2) {
1734                 __bch2_btree_path_unlock(trans, trans->paths + path2);
1735                 bch2_path_put(trans, path2, true);
1736         }
1737         if (path1) {
1738                 __bch2_btree_path_unlock(trans, trans->paths + path1);
1739                 bch2_path_put(trans, path1, true);
1740         }
1741
1742         bch2_trans_verify_locks(trans);
1743
1744         bch2_time_stats_update(&c->times[n2
1745                                ? BCH_TIME_btree_node_split
1746                                : BCH_TIME_btree_node_compact],
1747                                start_time);
1748         return ret;
1749 err:
1750         if (n3)
1751                 bch2_btree_node_free_never_used(as, trans, n3);
1752         if (n2)
1753                 bch2_btree_node_free_never_used(as, trans, n2);
1754         bch2_btree_node_free_never_used(as, trans, n1);
1755         goto out;
1756 }
1757
1758 /**
1759  * bch2_btree_insert_node - insert bkeys into a given btree node
1760  *
1761  * @as:                 btree_update object
1762  * @trans:              btree_trans object
1763  * @path_idx:           path that points to current node
1764  * @b:                  node to insert keys into
1765  * @keys:               list of keys to insert
1766  *
1767  * Returns: 0 on success, typically transaction restart error on failure
1768  *
1769  * Inserts as many keys as it can into a given btree node, splitting it if full.
1770  * If a split occurred, this function will return early. This can only happen
1771  * for leaf nodes -- inserts into interior nodes have to be atomic.
1772  */
1773 static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *trans,
1774                                   btree_path_idx_t path_idx, struct btree *b,
1775                                   struct keylist *keys)
1776 {
1777         struct bch_fs *c = as->c;
1778         struct btree_path *path = trans->paths + path_idx, *linked;
1779         unsigned i;
1780         int old_u64s = le16_to_cpu(btree_bset_last(b)->u64s);
1781         int old_live_u64s = b->nr.live_u64s;
1782         int live_u64s_added, u64s_added;
1783         int ret;
1784
1785         lockdep_assert_held(&c->gc_lock);
1786         BUG_ON(!btree_node_intent_locked(path, b->c.level));
1787         BUG_ON(!b->c.level);
1788         BUG_ON(!as || as->b);
1789         bch2_verify_keylist_sorted(keys);
1790
1791         ret = bch2_btree_node_lock_write(trans, path, &b->c);
1792         if (ret)
1793                 return ret;
1794
1795         bch2_btree_node_prep_for_write(trans, path, b);
1796
1797         if (!bch2_btree_node_insert_fits(b, bch2_keylist_u64s(keys))) {
1798                 bch2_btree_node_unlock_write(trans, path, b);
1799                 goto split;
1800         }
1801
1802         ret = bch2_btree_node_check_topology(trans, b);
1803         if (ret) {
1804                 bch2_btree_node_unlock_write(trans, path, b);
1805                 return ret;
1806         }
1807
1808         bch2_btree_insert_keys_interior(as, trans, path, b,
1809                                         path->l[b->c.level].iter, keys);
1810
1811         trans_for_each_path_with_node(trans, b, linked, i)
1812                 bch2_btree_node_iter_peek(&linked->l[b->c.level].iter, b);
1813
1814         bch2_trans_verify_paths(trans);
1815
1816         live_u64s_added = (int) b->nr.live_u64s - old_live_u64s;
1817         u64s_added = (int) le16_to_cpu(btree_bset_last(b)->u64s) - old_u64s;
1818
1819         if (b->sib_u64s[0] != U16_MAX && live_u64s_added < 0)
1820                 b->sib_u64s[0] = max(0, (int) b->sib_u64s[0] + live_u64s_added);
1821         if (b->sib_u64s[1] != U16_MAX && live_u64s_added < 0)
1822                 b->sib_u64s[1] = max(0, (int) b->sib_u64s[1] + live_u64s_added);
1823
1824         if (u64s_added > live_u64s_added &&
1825             bch2_maybe_compact_whiteouts(c, b))
1826                 bch2_trans_node_reinit_iter(trans, b);
1827
1828         btree_update_updated_node(as, b);
1829         bch2_btree_node_unlock_write(trans, path, b);
1830
1831         BUG_ON(bch2_btree_node_check_topology(trans, b));
1832         return 0;
1833 split:
1834         /*
1835          * We could attempt to avoid the transaction restart by calling
1836          * bch2_btree_path_upgrade() and allocating more nodes:
1837          */
1838         if (b->c.level >= as->update_level_end) {
1839                 trace_and_count(c, trans_restart_split_race, trans, _THIS_IP_, b);
1840                 return btree_trans_restart(trans, BCH_ERR_transaction_restart_split_race);
1841         }
1842
1843         return btree_split(as, trans, path_idx, b, keys);
1844 }
1845
1846 int bch2_btree_split_leaf(struct btree_trans *trans,
1847                           btree_path_idx_t path,
1848                           unsigned flags)
1849 {
1850         /* btree_split & merge may both cause paths array to be reallocated */
1851         struct btree *b = path_l(trans->paths + path)->b;
1852         struct btree_update *as;
1853         unsigned l;
1854         int ret = 0;
1855
1856         as = bch2_btree_update_start(trans, trans->paths + path,
1857                                      trans->paths[path].level,
1858                                      true, flags);
1859         if (IS_ERR(as))
1860                 return PTR_ERR(as);
1861
1862         ret = btree_split(as, trans, path, b, NULL);
1863         if (ret) {
1864                 bch2_btree_update_free(as, trans);
1865                 return ret;
1866         }
1867
1868         bch2_btree_update_done(as, trans);
1869
1870         for (l = trans->paths[path].level + 1;
1871              btree_node_intent_locked(&trans->paths[path], l) && !ret;
1872              l++)
1873                 ret = bch2_foreground_maybe_merge(trans, path, l, flags);
1874
1875         return ret;
1876 }
1877
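/*
 * Grow the btree by one level: allocate a new root containing only the
 * current root's key, install it as the root and write it out, then put the
 * old root back on the btree cache's live list.
 */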
1878 static void __btree_increase_depth(struct btree_update *as, struct btree_trans *trans,
1879                                    btree_path_idx_t path_idx)
1880 {
1881         struct bch_fs *c = as->c;
1882         struct btree_path *path = trans->paths + path_idx;
1883         struct btree *n, *b = bch2_btree_id_root(c, path->btree_id)->b;
1884
1885         BUG_ON(!btree_node_locked(path, b->c.level));
1886
1887         n = __btree_root_alloc(as, trans, b->c.level + 1);
1888
1889         bch2_btree_update_add_new_node(as, n);
1890         six_unlock_write(&n->c.lock);
1891
1892         path->locks_want++;
1893         BUG_ON(btree_node_locked(path, n->c.level));
1894         six_lock_increment(&n->c.lock, SIX_LOCK_intent);
1895         mark_btree_node_locked(trans, path, n->c.level, BTREE_NODE_INTENT_LOCKED);
1896         bch2_btree_path_level_init(trans, path, n);
1897
1898         n->sib_u64s[0] = U16_MAX;
1899         n->sib_u64s[1] = U16_MAX;
1900
1901         bch2_keylist_add(&as->parent_keys, &b->key);
1902         btree_split_insert_keys(as, trans, path_idx, n, &as->parent_keys);
1903
1904         int ret = bch2_btree_set_root(as, trans, path, n, true);
1905         BUG_ON(ret);
1906
1907         bch2_btree_update_get_open_buckets(as, n);
1908         bch2_btree_node_write(c, n, SIX_LOCK_intent, 0);
1909         bch2_trans_node_add(trans, path, n);
1910         six_unlock_intent(&n->c.lock);
1911
1912         mutex_lock(&c->btree_cache.lock);
1913         list_add_tail(&b->list, &c->btree_cache.live[btree_node_pinned(b)].list);
1914         mutex_unlock(&c->btree_cache.lock);
1915
1916         bch2_trans_verify_locks(trans);
1917 }
1918
1919 int bch2_btree_increase_depth(struct btree_trans *trans, btree_path_idx_t path, unsigned flags)
1920 {
1921         struct bch_fs *c = trans->c;
1922         struct btree *b = bch2_btree_id_root(c, trans->paths[path].btree_id)->b;
1923
1924         if (btree_node_fake(b))
1925                 return bch2_btree_split_leaf(trans, path, flags);
1926
1927         struct btree_update *as =
1928                 bch2_btree_update_start(trans, trans->paths + path, b->c.level, true, flags);
1929         if (IS_ERR(as))
1930                 return PTR_ERR(as);
1931
1932         __btree_increase_depth(as, trans, path);
1933         bch2_btree_update_done(as, trans);
1934         return 0;
1935 }
1936
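/*
 * Try to merge the node at @path, @level with its sibling on side @sib: look
 * up the sibling, check that the two nodes share a parent and that the
 * estimated size of a merged node is under the merge threshold, then allocate
 * a single replacement node, sort both nodes into it, and update the parent
 * with a whiteout for the key that disappears plus the new node's key.
 */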
1937 int __bch2_foreground_maybe_merge(struct btree_trans *trans,
1938                                   btree_path_idx_t path,
1939                                   unsigned level,
1940                                   unsigned flags,
1941                                   enum btree_node_sibling sib)
1942 {
1943         struct bch_fs *c = trans->c;
1944         struct btree_update *as;
1945         struct bkey_format_state new_s;
1946         struct bkey_format new_f;
1947         struct bkey_i delete;
1948         struct btree *b, *m, *n, *prev, *next, *parent;
1949         struct bpos sib_pos;
1950         size_t sib_u64s;
1951         enum btree_id btree = trans->paths[path].btree_id;
1952         btree_path_idx_t sib_path = 0, new_path = 0;
1953         u64 start_time = local_clock();
1954         int ret = 0;
1955
1956         bch2_trans_verify_not_in_restart(trans);
1957         bch2_trans_verify_not_unlocked(trans);
1958         BUG_ON(!trans->paths[path].should_be_locked);
1959         BUG_ON(!btree_node_locked(&trans->paths[path], level));
1960
1961         /*
1962          * Work around a deadlock caused by the btree write buffer not doing
1963          * merges and leaving tons of merges for us to do - we really don't need
1964          * to be doing merges at all from the interior update path, and if the
1965          * interior update path is generating too many new interior updates we
1966          * deadlock:
1967          */
1968         if ((flags & BCH_WATERMARK_MASK) == BCH_WATERMARK_interior_updates)
1969                 return 0;
1970
1971         if ((flags & BCH_WATERMARK_MASK) <= BCH_WATERMARK_reclaim) {
1972                 flags &= ~BCH_WATERMARK_MASK;
1973                 flags |= BCH_WATERMARK_btree;
1974                 flags |= BCH_TRANS_COMMIT_journal_reclaim;
1975         }
1976
1977         b = trans->paths[path].l[level].b;
1978
1979         if ((sib == btree_prev_sib && bpos_eq(b->data->min_key, POS_MIN)) ||
1980             (sib == btree_next_sib && bpos_eq(b->data->max_key, SPOS_MAX))) {
1981                 b->sib_u64s[sib] = U16_MAX;
1982                 return 0;
1983         }
1984
1985         sib_pos = sib == btree_prev_sib
1986                 ? bpos_predecessor(b->data->min_key)
1987                 : bpos_successor(b->data->max_key);
1988
1989         sib_path = bch2_path_get(trans, btree, sib_pos,
1990                                  U8_MAX, level, BTREE_ITER_intent, _THIS_IP_);
1991         ret = bch2_btree_path_traverse(trans, sib_path, false);
1992         if (ret)
1993                 goto err;
1994
1995         btree_path_set_should_be_locked(trans, trans->paths + sib_path);
1996
1997         m = trans->paths[sib_path].l[level].b;
1998
1999         if (btree_node_parent(trans->paths + path, b) !=
2000             btree_node_parent(trans->paths + sib_path, m)) {
2001                 b->sib_u64s[sib] = U16_MAX;
2002                 goto out;
2003         }
2004
2005         if (sib == btree_prev_sib) {
2006                 prev = m;
2007                 next = b;
2008         } else {
2009                 prev = b;
2010                 next = m;
2011         }
2012
2013         if (!bpos_eq(bpos_successor(prev->data->max_key), next->data->min_key)) {
2014                 struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;
2015
2016                 bch2_bpos_to_text(&buf1, prev->data->max_key);
2017                 bch2_bpos_to_text(&buf2, next->data->min_key);
2018                 bch_err(c,
2019                         "%s(): btree topology error:\n"
2020                         "  prev ends at   %s\n"
2021                         "  next starts at %s",
2022                         __func__, buf1.buf, buf2.buf);
2023                 printbuf_exit(&buf1);
2024                 printbuf_exit(&buf2);
2025                 ret = bch2_topology_error(c);
2026                 goto err;
2027         }
2028
2029         bch2_bkey_format_init(&new_s);
2030         bch2_bkey_format_add_pos(&new_s, prev->data->min_key);
2031         __bch2_btree_calc_format(&new_s, prev);
2032         __bch2_btree_calc_format(&new_s, next);
2033         bch2_bkey_format_add_pos(&new_s, next->data->max_key);
2034         new_f = bch2_bkey_format_done(&new_s);
2035
2036         sib_u64s = btree_node_u64s_with_format(b->nr, &b->format, &new_f) +
2037                 btree_node_u64s_with_format(m->nr, &m->format, &new_f);
2038
2039         if (sib_u64s > BTREE_FOREGROUND_MERGE_HYSTERESIS(c)) {
2040                 sib_u64s -= BTREE_FOREGROUND_MERGE_HYSTERESIS(c);
2041                 sib_u64s /= 2;
2042                 sib_u64s += BTREE_FOREGROUND_MERGE_HYSTERESIS(c);
2043         }
2044
2045         sib_u64s = min(sib_u64s, btree_max_u64s(c));
2046         sib_u64s = min(sib_u64s, (size_t) U16_MAX - 1);
2047         b->sib_u64s[sib] = sib_u64s;
2048
2049         if (b->sib_u64s[sib] > c->btree_foreground_merge_threshold)
2050                 goto out;
2051
2052         parent = btree_node_parent(trans->paths + path, b);
2053         as = bch2_btree_update_start(trans, trans->paths + path, level, false,
2054                                      BCH_TRANS_COMMIT_no_enospc|flags);
2055         ret = PTR_ERR_OR_ZERO(as);
2056         if (ret)
2057                 goto err;
2058
2059         trace_and_count(c, btree_node_merge, trans, b);
2060
2061         bch2_btree_interior_update_will_free_node(as, b);
2062         bch2_btree_interior_update_will_free_node(as, m);
2063
2064         n = bch2_btree_node_alloc(as, trans, b->c.level);
2065
2066         SET_BTREE_NODE_SEQ(n->data,
2067                            max(BTREE_NODE_SEQ(b->data),
2068                                BTREE_NODE_SEQ(m->data)) + 1);
2069
2070         btree_set_min(n, prev->data->min_key);
2071         btree_set_max(n, next->data->max_key);
2072
2073         n->data->format  = new_f;
2074         btree_node_set_format(n, new_f);
2075
2076         bch2_btree_sort_into(c, n, prev);
2077         bch2_btree_sort_into(c, n, next);
2078
2079         bch2_btree_build_aux_trees(n);
2080         bch2_btree_update_add_new_node(as, n);
2081         six_unlock_write(&n->c.lock);
2082
2083         new_path = bch2_path_get_unlocked_mut(trans, btree, n->c.level, n->key.k.p);
2084         six_lock_increment(&n->c.lock, SIX_LOCK_intent);
2085         mark_btree_node_locked(trans, trans->paths + new_path, n->c.level, BTREE_NODE_INTENT_LOCKED);
2086         bch2_btree_path_level_init(trans, trans->paths + new_path, n);
2087
2088         bkey_init(&delete.k);
2089         delete.k.p = prev->key.k.p;
2090         bch2_keylist_add(&as->parent_keys, &delete);
2091         bch2_keylist_add(&as->parent_keys, &n->key);
2092
2093         bch2_trans_verify_paths(trans);
2094
2095         ret = bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys);
2096         if (ret)
2097                 goto err_free_update;
2098
2099         bch2_trans_verify_paths(trans);
2100
2101         bch2_btree_update_get_open_buckets(as, n);
2102         bch2_btree_node_write(c, n, SIX_LOCK_intent, 0);
2103
2104         bch2_btree_node_free_inmem(trans, trans->paths + path, b);
2105         bch2_btree_node_free_inmem(trans, trans->paths + sib_path, m);
2106
2107         bch2_trans_node_add(trans, trans->paths + path, n);
2108
2109         bch2_trans_verify_paths(trans);
2110
2111         six_unlock_intent(&n->c.lock);
2112
2113         bch2_btree_update_done(as, trans);
2114
2115         bch2_time_stats_update(&c->times[BCH_TIME_btree_node_merge], start_time);
2116 out:
2117 err:
2118         if (new_path)
2119                 bch2_path_put(trans, new_path, true);
2120         bch2_path_put(trans, sib_path, true);
2121         bch2_trans_verify_locks(trans);
2122         if (ret == -BCH_ERR_journal_reclaim_would_deadlock)
2123                 ret = 0;
2124         if (!ret)
2125                 ret = bch2_trans_relock(trans);
2126         return ret;
2127 err_free_update:
2128         bch2_btree_node_free_never_used(as, trans, n);
2129         bch2_btree_update_free(as, trans);
2130         goto out;
2131 }
2132
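/*
 * Rewrite node @b: allocate a replacement node with the same contents, add
 * its key to the parent (or install it as the new root), write it out, and
 * free the old node in memory.
 */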
2133 int bch2_btree_node_rewrite(struct btree_trans *trans,
2134                             struct btree_iter *iter,
2135                             struct btree *b,
2136                             unsigned flags)
2137 {
2138         struct bch_fs *c = trans->c;
2139         struct btree *n, *parent;
2140         struct btree_update *as;
2141         btree_path_idx_t new_path = 0;
2142         int ret;
2143
2144         flags |= BCH_TRANS_COMMIT_no_enospc;
2145
2146         struct btree_path *path = btree_iter_path(trans, iter);
2147         parent = btree_node_parent(path, b);
2148         as = bch2_btree_update_start(trans, path, b->c.level, false, flags);
2149         ret = PTR_ERR_OR_ZERO(as);
2150         if (ret)
2151                 goto out;
2152
2153         bch2_btree_interior_update_will_free_node(as, b);
2154
2155         n = bch2_btree_node_alloc_replacement(as, trans, b);
2156
2157         bch2_btree_build_aux_trees(n);
2158         bch2_btree_update_add_new_node(as, n);
2159         six_unlock_write(&n->c.lock);
2160
2161         new_path = bch2_path_get_unlocked_mut(trans, iter->btree_id, n->c.level, n->key.k.p);
2162         six_lock_increment(&n->c.lock, SIX_LOCK_intent);
2163         mark_btree_node_locked(trans, trans->paths + new_path, n->c.level, BTREE_NODE_INTENT_LOCKED);
2164         bch2_btree_path_level_init(trans, trans->paths + new_path, n);
2165
2166         trace_and_count(c, btree_node_rewrite, trans, b);
2167
2168         if (parent) {
2169                 bch2_keylist_add(&as->parent_keys, &n->key);
2170                 ret = bch2_btree_insert_node(as, trans, iter->path, parent, &as->parent_keys);
2171         } else {
2172                 ret = bch2_btree_set_root(as, trans, btree_iter_path(trans, iter), n, false);
2173         }
2174
2175         if (ret)
2176                 goto err;
2177
2178         bch2_btree_update_get_open_buckets(as, n);
2179         bch2_btree_node_write(c, n, SIX_LOCK_intent, 0);
2180
2181         bch2_btree_node_free_inmem(trans, btree_iter_path(trans, iter), b);
2182
2183         bch2_trans_node_add(trans, trans->paths + iter->path, n);
2184         six_unlock_intent(&n->c.lock);
2185
2186         bch2_btree_update_done(as, trans);
2187 out:
2188         if (new_path)
2189                 bch2_path_put(trans, new_path, true);
2190         bch2_trans_downgrade(trans);
2191         return ret;
2192 err:
2193         bch2_btree_node_free_never_used(as, trans, n);
2194         bch2_btree_update_free(as, trans);
2195         goto out;
2196 }
2197
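/*
 * Deferred node rewrites: the node is identified by btree id, level, position
 * and sequence number so it can be re-looked-up (and verified) later from a
 * workqueue; rewrites requested before the filesystem may go read-write are
 * parked on c->pending_node_rewrites.
 */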
2198 struct async_btree_rewrite {
2199         struct bch_fs           *c;
2200         struct work_struct      work;
2201         struct list_head        list;
2202         enum btree_id           btree_id;
2203         unsigned                level;
2204         struct bpos             pos;
2205         __le64                  seq;
2206 };
2207
2208 static int async_btree_node_rewrite_trans(struct btree_trans *trans,
2209                                           struct async_btree_rewrite *a)
2210 {
2211         struct bch_fs *c = trans->c;
2212         struct btree_iter iter;
2213         struct btree *b;
2214         int ret;
2215
2216         bch2_trans_node_iter_init(trans, &iter, a->btree_id, a->pos,
2217                                   BTREE_MAX_DEPTH, a->level, 0);
2218         b = bch2_btree_iter_peek_node(&iter);
2219         ret = PTR_ERR_OR_ZERO(b);
2220         if (ret)
2221                 goto out;
2222
2223         if (!b || b->data->keys.seq != a->seq) {
2224                 struct printbuf buf = PRINTBUF;
2225
2226                 if (b)
2227                         bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
2228                 else
2229                         prt_str(&buf, "(null)");
2230                 bch_info(c, "%s: node to rewrite not found; searching for seq %llu, got\n%s",
2231                          __func__, a->seq, buf.buf);
2232                 printbuf_exit(&buf);
2233                 goto out;
2234         }
2235
2236         ret = bch2_btree_node_rewrite(trans, &iter, b, 0);
2237 out:
2238         bch2_trans_iter_exit(trans, &iter);
2239
2240         return ret;
2241 }
2242
2243 static void async_btree_node_rewrite_work(struct work_struct *work)
2244 {
2245         struct async_btree_rewrite *a =
2246                 container_of(work, struct async_btree_rewrite, work);
2247         struct bch_fs *c = a->c;
2248
2249         int ret = bch2_trans_do(c, async_btree_node_rewrite_trans(trans, a));
2250         bch_err_fn_ratelimited(c, ret);
2251         bch2_write_ref_put(c, BCH_WRITE_REF_node_rewrite);
2252         kfree(a);
2253 }
2254
2255 void bch2_btree_node_rewrite_async(struct bch_fs *c, struct btree *b)
2256 {
2257         struct async_btree_rewrite *a;
2258         int ret;
2259
2260         a = kmalloc(sizeof(*a), GFP_NOFS);
2261         if (!a) {
2262                 bch_err(c, "%s: error allocating memory", __func__);
2263                 return;
2264         }
2265
2266         a->c            = c;
2267         a->btree_id     = b->c.btree_id;
2268         a->level        = b->c.level;
2269         a->pos          = b->key.k.p;
2270         a->seq          = b->data->keys.seq;
2271         INIT_WORK(&a->work, async_btree_node_rewrite_work);
2272
2273         if (unlikely(!test_bit(BCH_FS_may_go_rw, &c->flags))) {
2274                 mutex_lock(&c->pending_node_rewrites_lock);
2275                 list_add(&a->list, &c->pending_node_rewrites);
2276                 mutex_unlock(&c->pending_node_rewrites_lock);
2277                 return;
2278         }
2279
2280         if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_node_rewrite)) {
2281                 if (test_bit(BCH_FS_started, &c->flags)) {
2282                         bch_err(c, "%s: error getting c->writes ref", __func__);
2283                         kfree(a);
2284                         return;
2285                 }
2286
2287                 ret = bch2_fs_read_write_early(c);
2288                 bch_err_msg(c, ret, "going read-write");
2289                 if (ret) {
2290                         kfree(a);
2291                         return;
2292                 }
2293
2294                 bch2_write_ref_get(c, BCH_WRITE_REF_node_rewrite);
2295         }
2296
2297         queue_work(c->btree_node_rewrite_worker, &a->work);
2298 }
2299
2300 void bch2_do_pending_node_rewrites(struct bch_fs *c)
2301 {
2302         struct async_btree_rewrite *a, *n;
2303
2304         mutex_lock(&c->pending_node_rewrites_lock);
2305         list_for_each_entry_safe(a, n, &c->pending_node_rewrites, list) {
2306                 list_del(&a->list);
2307
2308                 bch2_write_ref_get(c, BCH_WRITE_REF_node_rewrite);
2309                 queue_work(c->btree_node_rewrite_worker, &a->work);
2310         }
2311         mutex_unlock(&c->pending_node_rewrites_lock);
2312 }
2313
2314 void bch2_free_pending_node_rewrites(struct bch_fs *c)
2315 {
2316         struct async_btree_rewrite *a, *n;
2317
2318         mutex_lock(&c->pending_node_rewrites_lock);
2319         list_for_each_entry_safe(a, n, &c->pending_node_rewrites, list) {
2320                 list_del(&a->list);
2321
2322                 kfree(a);
2323         }
2324         mutex_unlock(&c->pending_node_rewrites_lock);
2325 }
2326
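/*
 * Update the key for node @b: run transactional triggers on the old and new
 * keys (unless @skip_triggers), then either update the parent via a normal
 * transactional update or, if @b is the root, emit a btree_root journal
 * entry. After the transaction commits, @b's in-memory key - and its hash
 * table entry, via @new_hash - is updated under the node's write lock.
 */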
2327 static int __bch2_btree_node_update_key(struct btree_trans *trans,
2328                                         struct btree_iter *iter,
2329                                         struct btree *b, struct btree *new_hash,
2330                                         struct bkey_i *new_key,
2331                                         unsigned commit_flags,
2332                                         bool skip_triggers)
2333 {
2334         struct bch_fs *c = trans->c;
2335         struct btree_iter iter2 = { NULL };
2336         struct btree *parent;
2337         int ret;
2338
2339         if (!skip_triggers) {
2340                 ret   = bch2_key_trigger_old(trans, b->c.btree_id, b->c.level + 1,
2341                                              bkey_i_to_s_c(&b->key),
2342                                              BTREE_TRIGGER_transactional) ?:
2343                         bch2_key_trigger_new(trans, b->c.btree_id, b->c.level + 1,
2344                                              bkey_i_to_s(new_key),
2345                                              BTREE_TRIGGER_transactional);
2346                 if (ret)
2347                         return ret;
2348         }
2349
2350         if (new_hash) {
2351                 bkey_copy(&new_hash->key, new_key);
2352                 ret = bch2_btree_node_hash_insert(&c->btree_cache,
2353                                 new_hash, b->c.level, b->c.btree_id);
2354                 BUG_ON(ret);
2355         }
2356
2357         parent = btree_node_parent(btree_iter_path(trans, iter), b);
2358         if (parent) {
2359                 bch2_trans_copy_iter(&iter2, iter);
2360
2361                 iter2.path = bch2_btree_path_make_mut(trans, iter2.path,
2362                                 iter2.flags & BTREE_ITER_intent,
2363                                 _THIS_IP_);
2364
2365                 struct btree_path *path2 = btree_iter_path(trans, &iter2);
2366                 BUG_ON(path2->level != b->c.level);
2367                 BUG_ON(!bpos_eq(path2->pos, new_key->k.p));
2368
2369                 btree_path_set_level_up(trans, path2);
2370
2371                 trans->paths_sorted = false;
2372
2373                 ret   = bch2_btree_iter_traverse(&iter2) ?:
2374                         bch2_trans_update(trans, &iter2, new_key, BTREE_TRIGGER_norun);
2375                 if (ret)
2376                         goto err;
2377         } else {
2378                 BUG_ON(btree_node_root(c, b) != b);
2379
2380                 struct jset_entry *e = bch2_trans_jset_entry_alloc(trans,
2381                                        jset_u64s(new_key->k.u64s));
2382                 ret = PTR_ERR_OR_ZERO(e);
2383                 if (ret)
2384                         return ret;
2385
2386                 journal_entry_set(e,
2387                                   BCH_JSET_ENTRY_btree_root,
2388                                   b->c.btree_id, b->c.level,
2389                                   new_key, new_key->k.u64s);
2390         }
2391
2392         ret = bch2_trans_commit(trans, NULL, NULL, commit_flags);
2393         if (ret)
2394                 goto err;
2395
2396         bch2_btree_node_lock_write_nofail(trans, btree_iter_path(trans, iter), &b->c);
2397
2398         if (new_hash) {
2399                 mutex_lock(&c->btree_cache.lock);
2400                 bch2_btree_node_hash_remove(&c->btree_cache, new_hash);
2401
2402                 __bch2_btree_node_hash_remove(&c->btree_cache, b);
2403
2404                 bkey_copy(&b->key, new_key);
2405                 ret = __bch2_btree_node_hash_insert(&c->btree_cache, b);
2406                 BUG_ON(ret);
2407                 mutex_unlock(&c->btree_cache.lock);
2408         } else {
2409                 bkey_copy(&b->key, new_key);
2410         }
2411
2412         bch2_btree_node_unlock_write(trans, btree_iter_path(trans, iter), b);
2413 out:
2414         bch2_trans_iter_exit(trans, &iter2);
2415         return ret;
2416 err:
2417         if (new_hash) {
2418                 mutex_lock(&c->btree_cache.lock);
2419                 bch2_btree_node_hash_remove(&c->btree_cache, b);
2420                 mutex_unlock(&c->btree_cache.lock);
2421         }
2422         goto out;
2423 }
2424
2425 int bch2_btree_node_update_key(struct btree_trans *trans, struct btree_iter *iter,
2426                                struct btree *b, struct bkey_i *new_key,
2427                                unsigned commit_flags, bool skip_triggers)
2428 {
2429         struct bch_fs *c = trans->c;
2430         struct btree *new_hash = NULL;
2431         struct btree_path *path = btree_iter_path(trans, iter);
2432         struct closure cl;
2433         int ret = 0;
2434
2435         ret = bch2_btree_path_upgrade(trans, path, b->c.level + 1);
2436         if (ret)
2437                 return ret;
2438
2439         closure_init_stack(&cl);
2440
2441         /*
2442          * check btree_ptr_hash_val() after @b is locked by
2443          * btree_iter_traverse():
2444          */
2445         if (btree_ptr_hash_val(new_key) != b->hash_val) {
2446                 ret = bch2_btree_cache_cannibalize_lock(trans, &cl);
2447                 if (ret) {
2448                         ret = drop_locks_do(trans, (closure_sync(&cl), 0));
2449                         if (ret)
2450                                 return ret;
2451                 }
2452
2453                 new_hash = bch2_btree_node_mem_alloc(trans, false);
2454                 ret = PTR_ERR_OR_ZERO(new_hash);
2455                 if (ret)
2456                         goto err;
2457         }
2458
2459         path->intent_ref++;
2460         ret = __bch2_btree_node_update_key(trans, iter, b, new_hash, new_key,
2461                                            commit_flags, skip_triggers);
2462         --path->intent_ref;
2463
2464         if (new_hash)
2465                 bch2_btree_node_to_freelist(c, new_hash);
2466 err:
2467         closure_sync(&cl);
2468         bch2_btree_cache_cannibalize_unlock(trans);
2469         return ret;
2470 }
2471
2472 int bch2_btree_node_update_key_get_iter(struct btree_trans *trans,
2473                                         struct btree *b, struct bkey_i *new_key,
2474                                         unsigned commit_flags, bool skip_triggers)
2475 {
2476         struct btree_iter iter;
2477         int ret;
2478
2479         bch2_trans_node_iter_init(trans, &iter, b->c.btree_id, b->key.k.p,
2480                                   BTREE_MAX_DEPTH, b->c.level,
2481                                   BTREE_ITER_intent);
2482         ret = bch2_btree_iter_traverse(&iter);
2483         if (ret)
2484                 goto out;
2485
2486         /* has node been freed? */
2487         if (btree_iter_path(trans, &iter)->l[b->c.level].b != b) {
2488                 /* node has been freed: */
2489                 BUG_ON(!btree_node_dying(b));
2490                 goto out;
2491         }
2492
2493         BUG_ON(!btree_node_hashed(b));
2494
2495         bch2_bkey_drop_ptrs(bkey_i_to_s(new_key), ptr,
2496                             !bch2_bkey_has_device(bkey_i_to_s(&b->key), ptr->dev));
2497
2498         ret = bch2_btree_node_update_key(trans, &iter, b, new_key,
2499                                          commit_flags, skip_triggers);
2500 out:
2501         bch2_trans_iter_exit(trans, &iter);
2502         return ret;
2503 }
2504
2505 /* Init code: */
2506
2507 /*
2508  * Only for filesystem bringup, when first reading the btree roots or allocating
2509  * btree roots when initializing a new filesystem:
2510  */
2511 void bch2_btree_set_root_for_read(struct bch_fs *c, struct btree *b)
2512 {
2513         BUG_ON(btree_node_root(c, b));
2514
2515         bch2_btree_set_root_inmem(c, b);
2516 }
2517
2518 int bch2_btree_root_alloc_fake_trans(struct btree_trans *trans, enum btree_id id, unsigned level)
2519 {
2520         struct bch_fs *c = trans->c;
2521         struct closure cl;
2522         struct btree *b;
2523         int ret;
2524
2525         closure_init_stack(&cl);
2526
2527         do {
2528                 ret = bch2_btree_cache_cannibalize_lock(trans, &cl);
2529                 closure_sync(&cl);
2530         } while (ret);
2531
2532         b = bch2_btree_node_mem_alloc(trans, false);
2533         bch2_btree_cache_cannibalize_unlock(trans);
2534
2535         ret = PTR_ERR_OR_ZERO(b);
2536         if (ret)
2537                 return ret;
2538
2539         set_btree_node_fake(b);
2540         set_btree_node_need_rewrite(b);
2541         b->c.level      = level;
2542         b->c.btree_id   = id;
2543
2544         bkey_btree_ptr_init(&b->key);
2545         b->key.k.p = SPOS_MAX;
2546         *((u64 *) bkey_i_to_btree_ptr(&b->key)->v.start) = U64_MAX - id;
2547
2548         bch2_bset_init_first(b, &b->data->keys);
2549         bch2_btree_build_aux_trees(b);
2550
2551         b->data->flags = 0;
2552         btree_set_min(b, POS_MIN);
2553         btree_set_max(b, SPOS_MAX);
2554         b->data->format = bch2_btree_calc_format(b);
2555         btree_node_set_format(b, b->data->format);
2556
2557         ret = bch2_btree_node_hash_insert(&c->btree_cache, b,
2558                                           b->c.level, b->c.btree_id);
2559         BUG_ON(ret);
2560
2561         bch2_btree_set_root_inmem(c, b);
2562
2563         six_unlock_write(&b->c.lock);
2564         six_unlock_intent(&b->c.lock);
2565         return 0;
2566 }
2567
2568 void bch2_btree_root_alloc_fake(struct bch_fs *c, enum btree_id id, unsigned level)
2569 {
2570         bch2_trans_run(c, lockrestart_do(trans, bch2_btree_root_alloc_fake_trans(trans, id, level)));
2571 }
2572
2573 static void bch2_btree_update_to_text(struct printbuf *out, struct btree_update *as)
2574 {
2575         prt_printf(out, "%ps: ", (void *) as->ip_started);
2576         bch2_trans_commit_flags_to_text(out, as->flags);
2577
2578         prt_printf(out, " btree=%s l=%u-%u mode=%s nodes_written=%u cl.remaining=%u journal_seq=%llu\n",
2579                    bch2_btree_id_str(as->btree_id),
2580                    as->update_level_start,
2581                    as->update_level_end,
2582                    bch2_btree_update_modes[as->mode],
2583                    as->nodes_written,
2584                    closure_nr_remaining(&as->cl),
2585                    as->journal.seq);
2586 }
2587
2588 void bch2_btree_updates_to_text(struct printbuf *out, struct bch_fs *c)
2589 {
2590         struct btree_update *as;
2591
2592         mutex_lock(&c->btree_interior_update_lock);
2593         list_for_each_entry(as, &c->btree_interior_update_list, list)
2594                 bch2_btree_update_to_text(out, as);
2595         mutex_unlock(&c->btree_interior_update_lock);
2596 }
2597
2598 static bool bch2_btree_interior_updates_pending(struct bch_fs *c)
2599 {
2600         bool ret;
2601
2602         mutex_lock(&c->btree_interior_update_lock);
2603         ret = !list_empty(&c->btree_interior_update_list);
2604         mutex_unlock(&c->btree_interior_update_lock);
2605
2606         return ret;
2607 }
2608
2609 bool bch2_btree_interior_updates_flush(struct bch_fs *c)
2610 {
2611         bool ret = bch2_btree_interior_updates_pending(c);
2612
2613         if (ret)
2614                 closure_wait_event(&c->btree_interior_update_wait,
2615                                    !bch2_btree_interior_updates_pending(c));
2616         return ret;
2617 }
2618
2619 void bch2_journal_entry_to_btree_root(struct bch_fs *c, struct jset_entry *entry)
2620 {
2621         struct btree_root *r = bch2_btree_id_root(c, entry->btree_id);
2622
2623         mutex_lock(&c->btree_root_lock);
2624
2625         r->level = entry->level;
2626         r->alive = true;
2627         bkey_copy(&r->key, (struct bkey_i *) entry->start);
2628
2629         mutex_unlock(&c->btree_root_lock);
2630 }
2631
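/*
 * Append a btree_root journal entry for each live btree root whose id is not
 * set in @skip; returns the new end of the journal entry buffer.
 */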
2632 struct jset_entry *
2633 bch2_btree_roots_to_journal_entries(struct bch_fs *c,
2634                                     struct jset_entry *end,
2635                                     unsigned long skip)
2636 {
2637         unsigned i;
2638
2639         mutex_lock(&c->btree_root_lock);
2640
2641         for (i = 0; i < btree_id_nr_alive(c); i++) {
2642                 struct btree_root *r = bch2_btree_id_root(c, i);
2643
2644                 if (r->alive && !test_bit(i, &skip)) {
2645                         journal_entry_set(end, BCH_JSET_ENTRY_btree_root,
2646                                           i, r->level, &r->key, r->key.k.u64s);
2647                         end = vstruct_next(end);
2648                 }
2649         }
2650
2651         mutex_unlock(&c->btree_root_lock);
2652
2653         return end;
2654 }
2655
2656 static void bch2_btree_alloc_to_text(struct printbuf *out,
2657                                      struct bch_fs *c,
2658                                      struct btree_alloc *a)
2659 {
2660         printbuf_indent_add(out, 2);
2661         bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&a->k));
2662         prt_newline(out);
2663
2664         struct open_bucket *ob;
2665         unsigned i;
2666         open_bucket_for_each(c, &a->ob, ob, i)
2667                 bch2_open_bucket_to_text(out, c, ob);
2668
2669         printbuf_indent_sub(out, 2);
2670 }
2671
2672 void bch2_btree_reserve_cache_to_text(struct printbuf *out, struct bch_fs *c)
2673 {
2674         for (unsigned i = 0; i < c->btree_reserve_cache_nr; i++)
2675                 bch2_btree_alloc_to_text(out, c, &c->btree_reserve_cache[i]);
2676 }
2677
2678 void bch2_fs_btree_interior_update_exit(struct bch_fs *c)
2679 {
2680         if (c->btree_node_rewrite_worker)
2681                 destroy_workqueue(c->btree_node_rewrite_worker);
2682         if (c->btree_interior_update_worker)
2683                 destroy_workqueue(c->btree_interior_update_worker);
2684         mempool_exit(&c->btree_interior_update_pool);
2685 }
2686
2687 void bch2_fs_btree_interior_update_init_early(struct bch_fs *c)
2688 {
2689         mutex_init(&c->btree_reserve_cache_lock);
2690         INIT_LIST_HEAD(&c->btree_interior_update_list);
2691         INIT_LIST_HEAD(&c->btree_interior_updates_unwritten);
2692         mutex_init(&c->btree_interior_update_lock);
2693         INIT_WORK(&c->btree_interior_update_work, btree_interior_update_work);
2694
2695         INIT_LIST_HEAD(&c->pending_node_rewrites);
2696         mutex_init(&c->pending_node_rewrites_lock);
2697 }
2698
2699 int bch2_fs_btree_interior_update_init(struct bch_fs *c)
2700 {
2701         c->btree_interior_update_worker =
2702                 alloc_workqueue("btree_update", WQ_UNBOUND|WQ_MEM_RECLAIM, 8);
2703         if (!c->btree_interior_update_worker)
2704                 return -BCH_ERR_ENOMEM_btree_interior_update_worker_init;
2705
2706         c->btree_node_rewrite_worker =
2707                 alloc_ordered_workqueue("btree_node_rewrite", WQ_UNBOUND);
2708         if (!c->btree_node_rewrite_worker)
2709                 return -BCH_ERR_ENOMEM_btree_interior_update_worker_init;
2710
2711         if (mempool_init_kmalloc_pool(&c->btree_interior_update_pool, 1,
2712                                       sizeof(struct btree_update)))
2713                 return -BCH_ERR_ENOMEM_btree_interior_update_pool_init;
2714
2715         return 0;
2716 }