1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Code for manipulating bucket marks for garbage collection.
4  *
5  * Copyright 2014 Datera, Inc.
6  */
7
8 #include "bcachefs.h"
9 #include "alloc_background.h"
10 #include "backpointers.h"
11 #include "bset.h"
12 #include "btree_gc.h"
13 #include "btree_update.h"
14 #include "buckets.h"
15 #include "buckets_waiting_for_journal.h"
16 #include "ec.h"
17 #include "error.h"
18 #include "inode.h"
19 #include "movinggc.h"
20 #include "recovery.h"
21 #include "reflink.h"
22 #include "replicas.h"
23 #include "subvolume.h"
24 #include "trace.h"
25
26 #include <linux/preempt.h>
27
28 static inline void fs_usage_data_type_to_base(struct bch_fs_usage_base *fs_usage,
29                                               enum bch_data_type data_type,
30                                               s64 sectors)
31 {
32         switch (data_type) {
33         case BCH_DATA_btree:
34                 fs_usage->btree         += sectors;
35                 break;
36         case BCH_DATA_user:
37         case BCH_DATA_parity:
38                 fs_usage->data          += sectors;
39                 break;
40         case BCH_DATA_cached:
41                 fs_usage->cached        += sectors;
42                 break;
43         default:
44                 break;
45         }
46 }
47
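/*
 * Rebuild the derived fields of c->usage_base: fold the percpu accumulators
 * into the base counters, sum persistent reservations and per-replicas-entry
 * counters into the summary fields, and count each device's superblock and
 * journal buckets as "hidden".  Runs with mark_lock held for write.
 */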
48 void bch2_fs_usage_initialize(struct bch_fs *c)
49 {
50         percpu_down_write(&c->mark_lock);
51         struct bch_fs_usage *usage = c->usage_base;
52
53         for (unsigned i = 0; i < ARRAY_SIZE(c->usage); i++)
54                 bch2_fs_usage_acc_to_base(c, i);
55
56         for (unsigned i = 0; i < BCH_REPLICAS_MAX; i++)
57                 usage->b.reserved += usage->persistent_reserved[i];
58
59         for (unsigned i = 0; i < c->replicas.nr; i++) {
60                 struct bch_replicas_entry_v1 *e =
61                         cpu_replicas_entry(&c->replicas, i);
62
63                 fs_usage_data_type_to_base(&usage->b, e->data_type, usage->replicas[i]);
64         }
65
66         for_each_member_device(c, ca) {
67                 struct bch_dev_usage dev = bch2_dev_usage_read(ca);
68
69                 usage->b.hidden += (dev.d[BCH_DATA_sb].buckets +
70                                     dev.d[BCH_DATA_journal].buckets) *
71                         ca->mi.bucket_size;
72         }
73
74         percpu_up_write(&c->mark_lock);
75 }
76
77 static inline struct bch_dev_usage *dev_usage_ptr(struct bch_dev *ca,
78                                                   unsigned journal_seq,
79                                                   bool gc)
80 {
81         BUG_ON(!gc && !journal_seq);
82
83         return this_cpu_ptr(gc
84                             ? ca->usage_gc
85                             : ca->usage[journal_seq & JOURNAL_BUF_MASK]);
86 }
87
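/*
 * Lockless read of device usage: snapshot ca->usage_base plus every percpu
 * accumulator under the usage_lock seqcount, retrying if a writer
 * (bch2_fs_usage_acc_to_base()) raced with us.
 */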
88 void bch2_dev_usage_read_fast(struct bch_dev *ca, struct bch_dev_usage *usage)
89 {
90         struct bch_fs *c = ca->fs;
91         unsigned seq, i, u64s = dev_usage_u64s();
92
93         do {
94                 seq = read_seqcount_begin(&c->usage_lock);
95                 memcpy(usage, ca->usage_base, u64s * sizeof(u64));
96                 for (i = 0; i < ARRAY_SIZE(ca->usage); i++)
97                         acc_u64s_percpu((u64 *) usage, (u64 __percpu *) ca->usage[i], u64s);
98         } while (read_seqcount_retry(&c->usage_lock, seq));
99 }
100
101 u64 bch2_fs_usage_read_one(struct bch_fs *c, u64 *v)
102 {
103         ssize_t offset = v - (u64 *) c->usage_base;
104         unsigned i, seq;
105         u64 ret;
106
107         BUG_ON(offset < 0 || offset >= fs_usage_u64s(c));
108         percpu_rwsem_assert_held(&c->mark_lock);
109
110         do {
111                 seq = read_seqcount_begin(&c->usage_lock);
112                 ret = *v;
113
114                 for (i = 0; i < ARRAY_SIZE(c->usage); i++)
115                         ret += percpu_u64_get((u64 __percpu *) c->usage[i] + offset);
116         } while (read_seqcount_retry(&c->usage_lock, seq));
117
118         return ret;
119 }
120
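/*
 * Allocate and return a consistent snapshot of filesystem usage, sized for
 * the current replicas table; if the table grew between the unlocked
 * allocation and taking mark_lock, reallocate and retry.  Returns NULL on
 * allocation failure.  Note that mark_lock is still held for read when we
 * return, so a caller looks roughly like this (sketch, not a real caller):
 *
 *	struct bch_fs_usage_online *u = bch2_fs_usage_read(c);
 *	if (u) {
 *		bch2_fs_usage_to_text(&out, c, u);
 *		percpu_up_read(&c->mark_lock);
 *		kfree(u);
 *	}
 */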
121 struct bch_fs_usage_online *bch2_fs_usage_read(struct bch_fs *c)
122 {
123         struct bch_fs_usage_online *ret;
124         unsigned nr_replicas = READ_ONCE(c->replicas.nr);
125         unsigned seq, i;
126 retry:
127         ret = kmalloc(__fs_usage_online_u64s(nr_replicas) * sizeof(u64), GFP_KERNEL);
128         if (unlikely(!ret))
129                 return NULL;
130
131         percpu_down_read(&c->mark_lock);
132
133         if (nr_replicas != c->replicas.nr) {
134                 nr_replicas = c->replicas.nr;
135                 percpu_up_read(&c->mark_lock);
136                 kfree(ret);
137                 goto retry;
138         }
139
140         ret->online_reserved = percpu_u64_get(c->online_reserved);
141
142         do {
143                 seq = read_seqcount_begin(&c->usage_lock);
144                 unsafe_memcpy(&ret->u, c->usage_base,
145                               __fs_usage_u64s(nr_replicas) * sizeof(u64),
146                               "embedded variable length struct");
147                 for (i = 0; i < ARRAY_SIZE(c->usage); i++)
148                         acc_u64s_percpu((u64 *) &ret->u, (u64 __percpu *) c->usage[i],
149                                         __fs_usage_u64s(nr_replicas));
150         } while (read_seqcount_retry(&c->usage_lock, seq));
151
152         return ret;
153 }
154
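/*
 * Fold one journal buffer's percpu usage accumulator (idx) into usage_base
 * for the filesystem and for every member device, then zero it; done under
 * the usage_lock write seqcount so that lockless readers retry.
 */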
155 void bch2_fs_usage_acc_to_base(struct bch_fs *c, unsigned idx)
156 {
157         unsigned u64s = fs_usage_u64s(c);
158
159         BUG_ON(idx >= ARRAY_SIZE(c->usage));
160
161         preempt_disable();
162         write_seqcount_begin(&c->usage_lock);
163
164         acc_u64s_percpu((u64 *) c->usage_base,
165                         (u64 __percpu *) c->usage[idx], u64s);
166         percpu_memset(c->usage[idx], 0, u64s * sizeof(u64));
167
168         rcu_read_lock();
169         for_each_member_device_rcu(c, ca, NULL) {
170                 u64s = dev_usage_u64s();
171
172                 acc_u64s_percpu((u64 *) ca->usage_base,
173                                 (u64 __percpu *) ca->usage[idx], u64s);
174                 percpu_memset(ca->usage[idx], 0, u64s * sizeof(u64));
175         }
176         rcu_read_unlock();
177
178         write_seqcount_end(&c->usage_lock);
179         preempt_enable();
180 }
181
182 void bch2_fs_usage_to_text(struct printbuf *out,
183                            struct bch_fs *c,
184                            struct bch_fs_usage_online *fs_usage)
185 {
186         unsigned i;
187
188         prt_printf(out, "capacity:\t\t\t%llu\n", c->capacity);
189
190         prt_printf(out, "hidden:\t\t\t\t%llu\n",
191                fs_usage->u.b.hidden);
192         prt_printf(out, "data:\t\t\t\t%llu\n",
193                fs_usage->u.b.data);
194         prt_printf(out, "cached:\t\t\t\t%llu\n",
195                fs_usage->u.b.cached);
196         prt_printf(out, "reserved:\t\t\t%llu\n",
197                fs_usage->u.b.reserved);
198         prt_printf(out, "nr_inodes:\t\t\t%llu\n",
199                fs_usage->u.b.nr_inodes);
200         prt_printf(out, "online reserved:\t\t%llu\n",
201                fs_usage->online_reserved);
202
203         for (i = 0;
204              i < ARRAY_SIZE(fs_usage->u.persistent_reserved);
205              i++) {
206                 prt_printf(out, "%u replicas:\n", i + 1);
207                 prt_printf(out, "\treserved:\t\t%llu\n",
208                        fs_usage->u.persistent_reserved[i]);
209         }
210
211         for (i = 0; i < c->replicas.nr; i++) {
212                 struct bch_replicas_entry_v1 *e =
213                         cpu_replicas_entry(&c->replicas, i);
214
215                 prt_printf(out, "\t");
216                 bch2_replicas_entry_to_text(out, e);
217                 prt_printf(out, ":\t%llu\n", fs_usage->u.replicas[i]);
218         }
219 }
220
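/*
 * reserve_factor() pads a reservation by roughly r >> RESERVE_FACTOR
 * (rounded up to that granularity); bch2_fs_sectors_used() below applies it
 * to the reserved counts and clamps the total to the filesystem capacity.
 */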
221 static u64 reserve_factor(u64 r)
222 {
223         return r + (round_up(r, (1 << RESERVE_FACTOR)) >> RESERVE_FACTOR);
224 }
225
226 u64 bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage_online *fs_usage)
227 {
228         return min(fs_usage->u.b.hidden +
229                    fs_usage->u.b.btree +
230                    fs_usage->u.b.data +
231                    reserve_factor(fs_usage->u.b.reserved +
232                                   fs_usage->online_reserved),
233                    c->capacity);
234 }
235
236 static struct bch_fs_usage_short
237 __bch2_fs_usage_read_short(struct bch_fs *c)
238 {
239         struct bch_fs_usage_short ret;
240         u64 data, reserved;
241
242         ret.capacity = c->capacity -
243                 bch2_fs_usage_read_one(c, &c->usage_base->b.hidden);
244
245         data            = bch2_fs_usage_read_one(c, &c->usage_base->b.data) +
246                 bch2_fs_usage_read_one(c, &c->usage_base->b.btree);
247         reserved        = bch2_fs_usage_read_one(c, &c->usage_base->b.reserved) +
248                 percpu_u64_get(c->online_reserved);
249
250         ret.used        = min(ret.capacity, data + reserve_factor(reserved));
251         ret.free        = ret.capacity - ret.used;
252
253         ret.nr_inodes   = bch2_fs_usage_read_one(c, &c->usage_base->b.nr_inodes);
254
255         return ret;
256 }
257
258 struct bch_fs_usage_short
259 bch2_fs_usage_read_short(struct bch_fs *c)
260 {
261         struct bch_fs_usage_short ret;
262
263         percpu_down_read(&c->mark_lock);
264         ret = __bch2_fs_usage_read_short(c);
265         percpu_up_read(&c->mark_lock);
266
267         return ret;
268 }
269
270 void bch2_dev_usage_init(struct bch_dev *ca)
271 {
272         ca->usage_base->d[BCH_DATA_free].buckets = ca->mi.nbuckets - ca->mi.first_bucket;
273 }
274
275 void bch2_dev_usage_to_text(struct printbuf *out, struct bch_dev_usage *usage)
276 {
277         prt_printf(out, "\tbuckets\rsectors\rfragmented\r\n");
278
279         for (unsigned i = 0; i < BCH_DATA_NR; i++) {
280                 bch2_prt_data_type(out, i);
281                 prt_printf(out, "\t%llu\r%llu\r%llu\r\n",
282                         usage->d[i].buckets,
283                         usage->d[i].sectors,
284                         usage->d[i].fragmented);
285         }
286 }
287
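/*
 * Apply the difference between an old and new alloc key to the device's
 * usage counters (buckets, dirty/cached sectors, fragmented sectors) and,
 * for buckets whose data type is hidden (superblock/journal), to the
 * filesystem-wide "hidden" count.
 */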
288 void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
289                            const struct bch_alloc_v4 *old,
290                            const struct bch_alloc_v4 *new,
291                            u64 journal_seq, bool gc)
292 {
293         struct bch_fs_usage *fs_usage;
294         struct bch_dev_usage *u;
295
296         preempt_disable();
297         fs_usage = fs_usage_ptr(c, journal_seq, gc);
298
299         if (data_type_is_hidden(old->data_type))
300                 fs_usage->b.hidden -= ca->mi.bucket_size;
301         if (data_type_is_hidden(new->data_type))
302                 fs_usage->b.hidden += ca->mi.bucket_size;
303
304         u = dev_usage_ptr(ca, journal_seq, gc);
305
306         u->d[old->data_type].buckets--;
307         u->d[new->data_type].buckets++;
308
309         u->d[old->data_type].sectors -= bch2_bucket_sectors_dirty(*old);
310         u->d[new->data_type].sectors += bch2_bucket_sectors_dirty(*new);
311
312         u->d[BCH_DATA_cached].sectors += new->cached_sectors;
313         u->d[BCH_DATA_cached].sectors -= old->cached_sectors;
314
315         u->d[old->data_type].fragmented -= bch2_bucket_sectors_fragmented(ca, *old);
316         u->d[new->data_type].fragmented += bch2_bucket_sectors_fragmented(ca, *new);
317
318         preempt_enable();
319 }
320
321 static inline int __update_replicas(struct bch_fs *c,
322                                     struct bch_fs_usage *fs_usage,
323                                     struct bch_replicas_entry_v1 *r,
324                                     s64 sectors)
325 {
326         int idx = bch2_replicas_entry_idx(c, r);
327
328         if (idx < 0)
329                 return -1;
330
331         fs_usage_data_type_to_base(&fs_usage->b, r->data_type, sectors);
332         fs_usage->replicas[idx]         += sectors;
333         return 0;
334 }
335
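/*
 * Account sectors against a replicas entry in the in-memory usage counters.
 * If the entry is missing from the replicas table this is a fsck error: we
 * drop mark_lock, add it with bch2_mark_replicas(), and retry the lookup.
 */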
336 int bch2_update_replicas(struct bch_fs *c, struct bkey_s_c k,
337                          struct bch_replicas_entry_v1 *r, s64 sectors,
338                          unsigned journal_seq, bool gc)
339 {
340         struct bch_fs_usage *fs_usage;
341         int idx, ret = 0;
342         struct printbuf buf = PRINTBUF;
343
344         percpu_down_read(&c->mark_lock);
345
346         idx = bch2_replicas_entry_idx(c, r);
347         if (idx < 0 &&
348             fsck_err(c, ptr_to_missing_replicas_entry,
349                      "no replicas entry\n  while marking %s",
350                      (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
351                 percpu_up_read(&c->mark_lock);
352                 ret = bch2_mark_replicas(c, r);
353                 percpu_down_read(&c->mark_lock);
354
355                 if (ret)
356                         goto err;
357                 idx = bch2_replicas_entry_idx(c, r);
358         }
359         if (idx < 0) {
360                 ret = -1;
361                 goto err;
362         }
363
364         preempt_disable();
365         fs_usage = fs_usage_ptr(c, journal_seq, gc);
366         fs_usage_data_type_to_base(&fs_usage->b, r->data_type, sectors);
367         fs_usage->replicas[idx]         += sectors;
368         preempt_enable();
369 err:
370 fsck_err:
371         percpu_up_read(&c->mark_lock);
372         printbuf_exit(&buf);
373         return ret;
374 }
375
376 static inline int update_cached_sectors(struct bch_fs *c,
377                         struct bkey_s_c k,
378                         unsigned dev, s64 sectors,
379                         unsigned journal_seq, bool gc)
380 {
381         struct bch_replicas_padded r;
382
383         bch2_replicas_entry_cached(&r.e, dev);
384
385         return bch2_update_replicas(c, k, &r.e, sectors, journal_seq, gc);
386 }
387
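/*
 * Ensure trans->fs_usage_deltas has room for 'more' bytes: grow it
 * geometrically with krealloc(); if that fails and the required size still
 * fits, fall back to a fixed REPLICAS_DELTA_LIST_MAX sized buffer from the
 * mempool, copying the old contents across.
 */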
388 static int __replicas_deltas_realloc(struct btree_trans *trans, unsigned more,
389                                      gfp_t gfp)
390 {
391         struct replicas_delta_list *d = trans->fs_usage_deltas;
392         unsigned new_size = d ? (d->size + more) * 2 : 128;
393         unsigned alloc_size = sizeof(*d) + new_size;
394
395         WARN_ON_ONCE(alloc_size > REPLICAS_DELTA_LIST_MAX);
396
397         if (!d || d->used + more > d->size) {
398                 d = krealloc(d, alloc_size, gfp|__GFP_ZERO);
399
400                 if (unlikely(!d)) {
401                         if (alloc_size > REPLICAS_DELTA_LIST_MAX)
402                                 return -ENOMEM;
403
404                         d = mempool_alloc(&trans->c->replicas_delta_pool, gfp);
405                         if (!d)
406                                 return -ENOMEM;
407
408                         memset(d, 0, REPLICAS_DELTA_LIST_MAX);
409
410                         if (trans->fs_usage_deltas)
411                                 memcpy(d, trans->fs_usage_deltas,
412                                        trans->fs_usage_deltas->size + sizeof(*d));
413
414                         new_size = REPLICAS_DELTA_LIST_MAX - sizeof(*d);
415                         kfree(trans->fs_usage_deltas);
416                 }
417
418                 d->size = new_size;
419                 trans->fs_usage_deltas = d;
420         }
421
422         return 0;
423 }
424
425 int bch2_replicas_deltas_realloc(struct btree_trans *trans, unsigned more)
426 {
427         return allocate_dropping_locks_errcode(trans,
428                                 __replicas_deltas_realloc(trans, more, _gfp));
429 }
430
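/*
 * Queue a (replicas entry, sector delta) pair in the transaction's delta
 * list rather than touching the counters directly; the list is applied by
 * bch2_trans_fs_usage_apply().
 */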
431 int bch2_update_replicas_list(struct btree_trans *trans,
432                          struct bch_replicas_entry_v1 *r,
433                          s64 sectors)
434 {
435         struct replicas_delta_list *d;
436         struct replicas_delta *n;
437         unsigned b;
438         int ret;
439
440         if (!sectors)
441                 return 0;
442
443         b = replicas_entry_bytes(r) + 8;
444         ret = bch2_replicas_deltas_realloc(trans, b);
445         if (ret)
446                 return ret;
447
448         d = trans->fs_usage_deltas;
449         n = (void *) d->d + d->used;
450         n->delta = sectors;
451         unsafe_memcpy((void *) n + offsetof(struct replicas_delta, r),
452                       r, replicas_entry_bytes(r),
453                       "flexible array member embedded in struct with padding");
454         bch2_replicas_entry_sort(&n->r);
455         d->used += b;
456         return 0;
457 }
458
459 int bch2_update_cached_sectors_list(struct btree_trans *trans, unsigned dev, s64 sectors)
460 {
461         struct bch_replicas_padded r;
462
463         bch2_replicas_entry_cached(&r.e, dev);
464
465         return bch2_update_replicas_list(trans, &r.e, sectors);
466 }
467
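/*
 * Fsck helper: cross-check one decoded pointer against its in-memory GC
 * bucket - gen validity, gens in the future or too stale, stale dirty
 * pointers, bucket/pointer data type mismatches and stripe membership.
 * Repairs the bucket state where that is safe, otherwise sets *do_update
 * so the caller rewrites the key.
 */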
468 static int bch2_check_fix_ptr(struct btree_trans *trans,
469                               struct bkey_s_c k,
470                               struct extent_ptr_decoded p,
471                               const union bch_extent_entry *entry,
472                               bool *do_update)
473 {
474         struct bch_fs *c = trans->c;
475         struct printbuf buf = PRINTBUF;
476         int ret = 0;
477
478         struct bch_dev *ca = bch2_dev_tryget(c, p.ptr.dev);
479         if (!ca) {
480                 if (fsck_err(c, ptr_to_invalid_device,
481                              "pointer to missing device %u\n"
482                              "while marking %s",
483                              p.ptr.dev,
484                              (printbuf_reset(&buf),
485                               bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
486                         *do_update = true;
487                 return 0;
488         }
489
490         struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr);
491         if (!g) {
492                 if (fsck_err(c, ptr_to_invalid_device,
493                              "pointer to invalid bucket on device %u\n"
494                              "while marking %s",
495                              p.ptr.dev,
496                              (printbuf_reset(&buf),
497                               bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
498                         *do_update = true;
499                 goto out;
500         }
501
502         enum bch_data_type data_type = bch2_bkey_ptr_data_type(k, p, entry);
503
504         if (fsck_err_on(!g->gen_valid,
505                         c, ptr_to_missing_alloc_key,
506                         "bucket %u:%zu data type %s ptr gen %u missing in alloc btree\n"
507                         "while marking %s",
508                         p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
509                         bch2_data_type_str(ptr_data_type(k.k, &p.ptr)),
510                         p.ptr.gen,
511                         (printbuf_reset(&buf),
512                          bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
513                 if (!p.ptr.cached) {
514                         g->gen_valid            = true;
515                         g->gen                  = p.ptr.gen;
516                 } else {
517                         *do_update = true;
518                 }
519         }
520
521         if (fsck_err_on(gen_cmp(p.ptr.gen, g->gen) > 0,
522                         c, ptr_gen_newer_than_bucket_gen,
523                         "bucket %u:%zu data type %s ptr gen in the future: %u > %u\n"
524                         "while marking %s",
525                         p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
526                         bch2_data_type_str(ptr_data_type(k.k, &p.ptr)),
527                         p.ptr.gen, g->gen,
528                         (printbuf_reset(&buf),
529                          bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
530                 if (!p.ptr.cached &&
531                     (g->data_type != BCH_DATA_btree ||
532                      data_type == BCH_DATA_btree)) {
533                         g->gen_valid            = true;
534                         g->gen                  = p.ptr.gen;
535                         g->data_type            = 0;
536                         g->dirty_sectors        = 0;
537                         g->cached_sectors       = 0;
538                 } else {
539                         *do_update = true;
540                 }
541         }
542
543         if (fsck_err_on(gen_cmp(g->gen, p.ptr.gen) > BUCKET_GC_GEN_MAX,
544                         c, ptr_gen_newer_than_bucket_gen,
545                         "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n"
546                         "while marking %s",
547                         p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen,
548                         bch2_data_type_str(ptr_data_type(k.k, &p.ptr)),
549                         p.ptr.gen,
550                         (printbuf_reset(&buf),
551                          bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
552                 *do_update = true;
553
554         if (fsck_err_on(!p.ptr.cached && gen_cmp(p.ptr.gen, g->gen) < 0,
555                         c, stale_dirty_ptr,
556                         "bucket %u:%zu data type %s stale dirty ptr: %u < %u\n"
557                         "while marking %s",
558                         p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
559                         bch2_data_type_str(ptr_data_type(k.k, &p.ptr)),
560                         p.ptr.gen, g->gen,
561                         (printbuf_reset(&buf),
562                          bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
563                 *do_update = true;
564
565         if (data_type != BCH_DATA_btree && p.ptr.gen != g->gen)
566                 goto out;
567
568         if (fsck_err_on(bucket_data_type_mismatch(g->data_type, data_type),
569                         c, ptr_bucket_data_type_mismatch,
570                         "bucket %u:%zu gen %u different types of data in same bucket: %s, %s\n"
571                         "while marking %s",
572                         p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen,
573                         bch2_data_type_str(g->data_type),
574                         bch2_data_type_str(data_type),
575                         (printbuf_reset(&buf),
576                          bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
577                 if (data_type == BCH_DATA_btree) {
578                         g->gen_valid            = true;
579                         g->gen                  = p.ptr.gen;
580                         g->data_type            = data_type;
581                         g->dirty_sectors        = 0;
582                         g->cached_sectors       = 0;
583                 } else {
584                         *do_update = true;
585                 }
586         }
587
588         if (p.has_ec) {
589                 struct gc_stripe *m = genradix_ptr(&c->gc_stripes, p.ec.idx);
590
591                 if (fsck_err_on(!m || !m->alive,
592                                 c, ptr_to_missing_stripe,
593                                 "pointer to nonexistent stripe %llu\n"
594                                 "while marking %s",
595                                 (u64) p.ec.idx,
596                                 (printbuf_reset(&buf),
597                                  bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
598                         *do_update = true;
599
600                 if (fsck_err_on(m && m->alive && !bch2_ptr_matches_stripe_m(m, p),
601                                 c, ptr_to_incorrect_stripe,
602                                 "pointer does not match stripe %llu\n"
603                                 "while marking %s",
604                                 (u64) p.ec.idx,
605                                 (printbuf_reset(&buf),
606                                  bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
607                         *do_update = true;
608         }
609 out:
610 fsck_err:
611         bch2_dev_put(ca);
612         printbuf_exit(&buf);
613         return ret;
614 }
615
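/*
 * Run bch2_check_fix_ptr() on every pointer in the key; if an update is
 * needed, build a mutable copy, drop pointers to missing devices, drop bad
 * extent pointers (btree node pointers only have their gens refreshed),
 * drop dangling stripe pointers, and issue a btree update with triggers
 * disabled (BTREE_TRIGGER_norun).
 */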
616 int bch2_check_fix_ptrs(struct btree_trans *trans,
617                         enum btree_id btree, unsigned level, struct bkey_s_c k,
618                         enum btree_iter_update_trigger_flags flags)
619 {
620         struct bch_fs *c = trans->c;
621         struct bkey_ptrs_c ptrs_c = bch2_bkey_ptrs_c(k);
622         const union bch_extent_entry *entry_c;
623         struct extent_ptr_decoded p = { 0 };
624         bool do_update = false;
625         struct printbuf buf = PRINTBUF;
626         int ret = 0;
627
628         percpu_down_read(&c->mark_lock);
629
630         bkey_for_each_ptr_decode(k.k, ptrs_c, p, entry_c) {
631                 ret = bch2_check_fix_ptr(trans, k, p, entry_c, &do_update);
632                 if (ret)
633                         goto err;
634         }
635
636         if (do_update) {
637                 if (flags & BTREE_TRIGGER_is_root) {
638                         bch_err(c, "cannot update btree roots yet");
639                         ret = -EINVAL;
640                         goto err;
641                 }
642
643                 struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k);
644                 ret = PTR_ERR_OR_ZERO(new);
645                 if (ret)
646                         goto err;
647
648                 rcu_read_lock();
649                 bch2_bkey_drop_ptrs(bkey_i_to_s(new), ptr, !bch2_dev_rcu(c, ptr->dev));
650                 rcu_read_unlock();
651
652                 if (level) {
653                         /*
654                          * We don't want to drop btree node pointers - if the
655                          * btree node isn't there anymore, the read path will
656                          * sort it out:
657                          */
658                         struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(new));
659                         rcu_read_lock();
660                         bkey_for_each_ptr(ptrs, ptr) {
661                                 struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
662                                 struct bucket *g = PTR_GC_BUCKET(ca, ptr);
663
664                                 ptr->gen = g->gen;
665                         }
666                         rcu_read_unlock();
667                 } else {
668                         struct bkey_ptrs ptrs;
669                         union bch_extent_entry *entry;
670
671                         rcu_read_lock();
672 restart_drop_ptrs:
673                         ptrs = bch2_bkey_ptrs(bkey_i_to_s(new));
674                         bkey_for_each_ptr_decode(bkey_i_to_s(new).k, ptrs, p, entry) {
675                                 struct bch_dev *ca = bch2_dev_rcu(c, p.ptr.dev);
676                                 struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr);
677                                 enum bch_data_type data_type = bch2_bkey_ptr_data_type(bkey_i_to_s_c(new), p, entry);
678
679                                 if ((p.ptr.cached &&
680                                      (!g->gen_valid || gen_cmp(p.ptr.gen, g->gen) > 0)) ||
681                                     (!p.ptr.cached &&
682                                      gen_cmp(p.ptr.gen, g->gen) < 0) ||
683                                     gen_cmp(g->gen, p.ptr.gen) > BUCKET_GC_GEN_MAX ||
684                                     (g->data_type &&
685                                      g->data_type != data_type)) {
686                                         bch2_bkey_drop_ptr(bkey_i_to_s(new), &entry->ptr);
687                                         goto restart_drop_ptrs;
688                                 }
689                         }
690                         rcu_read_unlock();
691 again:
692                         ptrs = bch2_bkey_ptrs(bkey_i_to_s(new));
693                         bkey_extent_entry_for_each(ptrs, entry) {
694                                 if (extent_entry_type(entry) == BCH_EXTENT_ENTRY_stripe_ptr) {
695                                         struct gc_stripe *m = genradix_ptr(&c->gc_stripes,
696                                                                         entry->stripe_ptr.idx);
697                                         union bch_extent_entry *next_ptr;
698
699                                         bkey_extent_entry_for_each_from(ptrs, next_ptr, entry)
700                                                 if (extent_entry_type(next_ptr) == BCH_EXTENT_ENTRY_ptr)
701                                                         goto found;
702                                         next_ptr = NULL;
703 found:
704                                         if (!next_ptr) {
705                                                 bch_err(c, "aieee, found stripe ptr with no data ptr");
706                                                 continue;
707                                         }
708
709                                         if (!m || !m->alive ||
710                                             !__bch2_ptr_matches_stripe(&m->ptrs[entry->stripe_ptr.block],
711                                                                        &next_ptr->ptr,
712                                                                        m->sectors)) {
713                                                 bch2_bkey_extent_entry_drop(new, entry);
714                                                 goto again;
715                                         }
716                                 }
717                         }
718                 }
719
720                 if (0) {
721                         printbuf_reset(&buf);
722                         bch2_bkey_val_to_text(&buf, c, k);
723                         bch_info(c, "updated %s", buf.buf);
724
725                         printbuf_reset(&buf);
726                         bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(new));
727                         bch_info(c, "new key %s", buf.buf);
728                 }
729
730                 percpu_up_read(&c->mark_lock);
731                 struct btree_iter iter;
732                 bch2_trans_node_iter_init(trans, &iter, btree, new->k.p, 0, level,
733                                           BTREE_ITER_intent|BTREE_ITER_all_snapshots);
734                 ret =   bch2_btree_iter_traverse(&iter) ?:
735                         bch2_trans_update(trans, &iter, new,
736                                           BTREE_UPDATE_internal_snapshot_node|
737                                           BTREE_TRIGGER_norun);
738                 bch2_trans_iter_exit(trans, &iter);
739                 percpu_down_read(&c->mark_lock);
740
741                 if (ret)
742                         goto err;
743
744                 if (level)
745                         bch2_btree_node_update_key_early(trans, btree, level - 1, k, new);
746         }
747 err:
748         percpu_up_read(&c->mark_lock);
749         printbuf_exit(&buf);
750         return ret;
751 }
752
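/*
 * Validate one pointer against the bucket it points into (gen newer than
 * bucket gen, too stale, stale dirty pointer, data type mismatch, sector
 * count overflow) and apply the sector delta to *bucket_sectors.  Returns 1
 * for a stale cached pointer, -EIO (after dumping the transaction's
 * updates) when an inconsistency is found while inserting.
 */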
753 int bch2_bucket_ref_update(struct btree_trans *trans, struct bch_dev *ca,
754                            struct bkey_s_c k,
755                            const struct bch_extent_ptr *ptr,
756                            s64 sectors, enum bch_data_type ptr_data_type,
757                            u8 b_gen, u8 bucket_data_type,
758                            u32 *bucket_sectors)
759 {
760         struct bch_fs *c = trans->c;
761         size_t bucket_nr = PTR_BUCKET_NR(ca, ptr);
762         struct printbuf buf = PRINTBUF;
763         bool inserting = sectors > 0;
764         int ret = 0;
765
766         BUG_ON(!sectors);
767
768         if (gen_after(ptr->gen, b_gen)) {
769                 bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
770                               BCH_FSCK_ERR_ptr_gen_newer_than_bucket_gen,
771                         "bucket %u:%zu gen %u data type %s: ptr gen %u newer than bucket gen\n"
772                         "while marking %s",
773                         ptr->dev, bucket_nr, b_gen,
774                         bch2_data_type_str(bucket_data_type ?: ptr_data_type),
775                         ptr->gen,
776                         (bch2_bkey_val_to_text(&buf, c, k), buf.buf));
777                 if (inserting)
778                         goto err;
779                 goto out;
780         }
781
782         if (gen_cmp(b_gen, ptr->gen) > BUCKET_GC_GEN_MAX) {
783                 bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
784                               BCH_FSCK_ERR_ptr_too_stale,
785                         "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n"
786                         "while marking %s",
787                         ptr->dev, bucket_nr, b_gen,
788                         bch2_data_type_str(bucket_data_type ?: ptr_data_type),
789                         ptr->gen,
790                         (printbuf_reset(&buf),
791                          bch2_bkey_val_to_text(&buf, c, k), buf.buf));
792                 if (inserting)
793                         goto err;
794                 goto out;
795         }
796
797         if (b_gen != ptr->gen && ptr->cached) {
798                 ret = 1;
799                 goto out;
800         }
801
802         if (b_gen != ptr->gen) {
803                 bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
804                               BCH_FSCK_ERR_stale_dirty_ptr,
805                         "bucket %u:%zu gen %u (mem gen %u) data type %s: stale dirty ptr (gen %u)\n"
806                         "while marking %s",
807                         ptr->dev, bucket_nr, b_gen,
808                         bucket_gen_get(ca, bucket_nr),
809                         bch2_data_type_str(bucket_data_type ?: ptr_data_type),
810                         ptr->gen,
811                         (printbuf_reset(&buf),
812                          bch2_bkey_val_to_text(&buf, c, k), buf.buf));
813                 if (inserting)
814                         goto err;
815                 goto out;
816         }
817
818         if (bucket_data_type_mismatch(bucket_data_type, ptr_data_type)) {
819                 bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
820                               BCH_FSCK_ERR_ptr_bucket_data_type_mismatch,
821                         "bucket %u:%zu gen %u different types of data in same bucket: %s, %s\n"
822                         "while marking %s",
823                         ptr->dev, bucket_nr, b_gen,
824                         bch2_data_type_str(bucket_data_type),
825                         bch2_data_type_str(ptr_data_type),
826                         (printbuf_reset(&buf),
827                          bch2_bkey_val_to_text(&buf, c, k), buf.buf));
828                 if (inserting)
829                         goto err;
830                 goto out;
831         }
832
833         if ((u64) *bucket_sectors + sectors > U32_MAX) {
834                 bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
835                               BCH_FSCK_ERR_bucket_sector_count_overflow,
836                         "bucket %u:%zu gen %u data type %s sector count overflow: %u + %lli > U32_MAX\n"
837                         "while marking %s",
838                         ptr->dev, bucket_nr, b_gen,
839                         bch2_data_type_str(bucket_data_type ?: ptr_data_type),
840                         *bucket_sectors, sectors,
841                         (printbuf_reset(&buf),
842                          bch2_bkey_val_to_text(&buf, c, k), buf.buf));
843                 if (inserting)
844                         goto err;
845                 sectors = -*bucket_sectors;
846         }
847
848         *bucket_sectors += sectors;
849 out:
850         printbuf_exit(&buf);
851         return ret;
852 err:
853         bch2_dump_trans_updates(trans);
854         ret = -EIO;
855         goto out;
856 }
857
858 void bch2_trans_fs_usage_revert(struct btree_trans *trans,
859                                 struct replicas_delta_list *deltas)
860 {
861         struct bch_fs *c = trans->c;
862         struct bch_fs_usage *dst;
863         struct replicas_delta *d, *top = (void *) deltas->d + deltas->used;
864         s64 added = 0;
865         unsigned i;
866
867         percpu_down_read(&c->mark_lock);
868         preempt_disable();
869         dst = fs_usage_ptr(c, trans->journal_res.seq, false);
870
871         /* revert changes: */
872         for (d = deltas->d; d != top; d = replicas_delta_next(d)) {
873                 switch (d->r.data_type) {
874                 case BCH_DATA_btree:
875                 case BCH_DATA_user:
876                 case BCH_DATA_parity:
877                         added += d->delta;
878                 }
879                 BUG_ON(__update_replicas(c, dst, &d->r, -d->delta));
880         }
881
882         dst->b.nr_inodes -= deltas->nr_inodes;
883
884         for (i = 0; i < BCH_REPLICAS_MAX; i++) {
885                 added                           -= deltas->persistent_reserved[i];
886                 dst->b.reserved                 -= deltas->persistent_reserved[i];
887                 dst->persistent_reserved[i]     -= deltas->persistent_reserved[i];
888         }
889
890         if (added > 0) {
891                 trans->disk_res->sectors += added;
892                 this_cpu_add(*c->online_reserved, added);
893         }
894
895         preempt_enable();
896         percpu_up_read(&c->mark_lock);
897 }
898
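/*
 * Apply the transaction's accumulated bch_fs_usage_base delta to the current
 * journal buffer's usage and charge any growth against the disk reservation;
 * if usage grew by more than was reserved, take the excess out of
 * c->sectors_available and warn (once) about the inconsistency.
 */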
899 void bch2_trans_account_disk_usage_change(struct btree_trans *trans)
900 {
901         struct bch_fs *c = trans->c;
902         u64 disk_res_sectors = trans->disk_res ? trans->disk_res->sectors : 0;
903         static int warned_disk_usage = 0;
904         bool warn = false;
905
906         percpu_down_read(&c->mark_lock);
907         preempt_disable();
908         struct bch_fs_usage_base *dst = &fs_usage_ptr(c, trans->journal_res.seq, false)->b;
909         struct bch_fs_usage_base *src = &trans->fs_usage_delta;
910
911         s64 added = src->btree + src->data + src->reserved;
912
913         /*
914          * Not allowed to reduce sectors_available except by getting a
915          * reservation:
916          */
917         s64 should_not_have_added = added - (s64) disk_res_sectors;
918         if (unlikely(should_not_have_added > 0)) {
919                 u64 old, new, v = atomic64_read(&c->sectors_available);
920
921                 do {
922                         old = v;
923                         new = max_t(s64, 0, old - should_not_have_added);
924                 } while ((v = atomic64_cmpxchg(&c->sectors_available,
925                                                old, new)) != old);
926
927                 added -= should_not_have_added;
928                 warn = true;
929         }
930
931         if (added > 0) {
932                 trans->disk_res->sectors -= added;
933                 this_cpu_sub(*c->online_reserved, added);
934         }
935
936         dst->hidden     += src->hidden;
937         dst->btree      += src->btree;
938         dst->data       += src->data;
939         dst->cached     += src->cached;
940         dst->reserved   += src->reserved;
941         dst->nr_inodes  += src->nr_inodes;
942
943         preempt_enable();
944         percpu_up_read(&c->mark_lock);
945
946         if (unlikely(warn) && !xchg(&warned_disk_usage, 1))
947                 bch2_trans_inconsistent(trans,
948                                         "disk usage increased %lli more than %llu sectors reserved",
949                                         should_not_have_added, disk_res_sectors);
950 }
951
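/*
 * Apply the queued replicas deltas, nr_inodes and persistent reservation
 * changes to the in-memory usage for this journal entry.  If a replicas
 * entry is missing from the table, revert whatever was already applied and
 * return -1.
 */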
952 int bch2_trans_fs_usage_apply(struct btree_trans *trans,
953                               struct replicas_delta_list *deltas)
954 {
955         struct bch_fs *c = trans->c;
956         struct replicas_delta *d, *d2;
957         struct replicas_delta *top = (void *) deltas->d + deltas->used;
958         struct bch_fs_usage *dst;
959         unsigned i;
960
961         percpu_down_read(&c->mark_lock);
962         preempt_disable();
963         dst = fs_usage_ptr(c, trans->journal_res.seq, false);
964
965         for (d = deltas->d; d != top; d = replicas_delta_next(d))
966                 if (__update_replicas(c, dst, &d->r, d->delta))
967                         goto need_mark;
968
969         dst->b.nr_inodes += deltas->nr_inodes;
970
971         for (i = 0; i < BCH_REPLICAS_MAX; i++) {
972                 dst->b.reserved                 += deltas->persistent_reserved[i];
973                 dst->persistent_reserved[i]     += deltas->persistent_reserved[i];
974         }
975
976         preempt_enable();
977         percpu_up_read(&c->mark_lock);
978         return 0;
979 need_mark:
980         /* revert changes: */
981         for (d2 = deltas->d; d2 != d; d2 = replicas_delta_next(d2))
982                 BUG_ON(__update_replicas(c, dst, &d2->r, -d2->delta));
983
984         preempt_enable();
985         percpu_up_read(&c->mark_lock);
986         return -1;
987 }
988
989 /* KEY_TYPE_extent: */
990
991 static int __mark_pointer(struct btree_trans *trans, struct bch_dev *ca,
992                           struct bkey_s_c k,
993                           const struct bch_extent_ptr *ptr,
994                           s64 sectors, enum bch_data_type ptr_data_type,
995                           struct bch_alloc_v4 *a)
996 {
997         u32 *dst_sectors = !ptr->cached
998                 ? &a->dirty_sectors
999                 : &a->cached_sectors;
1000         int ret = bch2_bucket_ref_update(trans, ca, k, ptr, sectors, ptr_data_type,
1001                                          a->gen, a->data_type, dst_sectors);
1002
1003         if (ret)
1004                 return ret;
1005
1006         alloc_data_type_set(a, ptr_data_type);
1007         return 0;
1008 }
1009
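/*
 * Trigger for a single extent pointer: compute the sector delta from the
 * backpointer, then either update the alloc key and backpointer in the
 * transaction (BTREE_TRIGGER_transactional) or, for gc, update the
 * in-memory bucket directly under the bucket lock.
 */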
1010 static int bch2_trigger_pointer(struct btree_trans *trans,
1011                         enum btree_id btree_id, unsigned level,
1012                         struct bkey_s_c k, struct extent_ptr_decoded p,
1013                         const union bch_extent_entry *entry,
1014                         s64 *sectors,
1015                         enum btree_iter_update_trigger_flags flags)
1016 {
1017         bool insert = !(flags & BTREE_TRIGGER_overwrite);
1018         struct printbuf buf = PRINTBUF;
1019         int ret = 0;
1020
1021         struct bch_fs *c = trans->c;
1022         struct bch_dev *ca = bch2_dev_tryget(c, p.ptr.dev);
1023         if (unlikely(!ca)) {
1024                 if (insert)
1025                         ret = -EIO;
1026                 goto err;
1027         }
1028
1029         struct bpos bucket;
1030         struct bch_backpointer bp;
1031         bch2_extent_ptr_to_bp(trans->c, ca, btree_id, level, k, p, entry, &bucket, &bp);
1032         *sectors = insert ? bp.bucket_len : -((s64) bp.bucket_len);
1033
1034         if (flags & BTREE_TRIGGER_transactional) {
1035                 struct bkey_i_alloc_v4 *a = bch2_trans_start_alloc_update(trans, bucket);
1036                 ret = PTR_ERR_OR_ZERO(a) ?:
1037                         __mark_pointer(trans, ca, k, &p.ptr, *sectors, bp.data_type, &a->v);
1038                 if (ret)
1039                         goto err;
1040
1041                 if (!p.ptr.cached) {
1042                         ret = bch2_bucket_backpointer_mod(trans, ca, bucket, bp, k, insert);
1043                         if (ret)
1044                                 goto err;
1045                 }
1046         }
1047
1048         if (flags & BTREE_TRIGGER_gc) {
1049                 percpu_down_read(&c->mark_lock);
1050                 struct bucket *g = gc_bucket(ca, bucket.offset);
1051                 if (bch2_fs_inconsistent_on(!g, c, "reference to invalid bucket on device %u\n  %s",
1052                                             p.ptr.dev,
1053                                             (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
1054                         ret = -EIO;
1055                         goto err_unlock;
1056                 }
1057
1058                 bucket_lock(g);
1059                 struct bch_alloc_v4 old = bucket_m_to_alloc(*g), new = old;
1060                 ret = __mark_pointer(trans, ca, k, &p.ptr, *sectors, bp.data_type, &new);
1061                 if (!ret) {
1062                         alloc_to_bucket(g, new);
1063                         bch2_dev_usage_update(c, ca, &old, &new, 0, true);
1064                 }
1065                 bucket_unlock(g);
1066 err_unlock:
1067                 percpu_up_read(&c->mark_lock);
1068         }
1069 err:
1070         bch2_dev_put(ca);
1071         printbuf_exit(&buf);
1072         return ret;
1073 }
1074
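/*
 * Account sectors referenced through an erasure coded pointer: bump the
 * stripe's block count and a replicas entry derived from the stripe, either
 * via a btree update plus the transaction's delta list, or directly against
 * the gc_stripes radix tree when running from gc.
 */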
1075 static int bch2_trigger_stripe_ptr(struct btree_trans *trans,
1076                                 struct bkey_s_c k,
1077                                 struct extent_ptr_decoded p,
1078                                 enum bch_data_type data_type,
1079                                 s64 sectors,
1080                                 enum btree_iter_update_trigger_flags flags)
1081 {
1082         if (flags & BTREE_TRIGGER_transactional) {
1083                 struct btree_iter iter;
1084                 struct bkey_i_stripe *s = bch2_bkey_get_mut_typed(trans, &iter,
1085                                 BTREE_ID_stripes, POS(0, p.ec.idx),
1086                                 BTREE_ITER_with_updates, stripe);
1087                 int ret = PTR_ERR_OR_ZERO(s);
1088                 if (unlikely(ret)) {
1089                         bch2_trans_inconsistent_on(bch2_err_matches(ret, ENOENT), trans,
1090                                 "pointer to nonexistent stripe %llu",
1091                                 (u64) p.ec.idx);
1092                         goto err;
1093                 }
1094
1095                 if (!bch2_ptr_matches_stripe(&s->v, p)) {
1096                         bch2_trans_inconsistent(trans,
1097                                 "stripe pointer doesn't match stripe %llu",
1098                                 (u64) p.ec.idx);
1099                         ret = -EIO;
1100                         goto err;
1101                 }
1102
1103                 stripe_blockcount_set(&s->v, p.ec.block,
1104                         stripe_blockcount_get(&s->v, p.ec.block) +
1105                         sectors);
1106
1107                 struct bch_replicas_padded r;
1108                 bch2_bkey_to_replicas(&r.e, bkey_i_to_s_c(&s->k_i));
1109                 r.e.data_type = data_type;
1110                 ret = bch2_update_replicas_list(trans, &r.e, sectors);
1111 err:
1112                 bch2_trans_iter_exit(trans, &iter);
1113                 return ret;
1114         }
1115
1116         if (flags & BTREE_TRIGGER_gc) {
1117                 struct bch_fs *c = trans->c;
1118
1119                 BUG_ON(!(flags & BTREE_TRIGGER_gc));
1120
1121                 struct gc_stripe *m = genradix_ptr_alloc(&c->gc_stripes, p.ec.idx, GFP_KERNEL);
1122                 if (!m) {
1123                         bch_err(c, "error allocating memory for gc_stripes, idx %llu",
1124                                 (u64) p.ec.idx);
1125                         return -BCH_ERR_ENOMEM_mark_stripe_ptr;
1126                 }
1127
1128                 mutex_lock(&c->ec_stripes_heap_lock);
1129
1130                 if (!m || !m->alive) {
1131                         mutex_unlock(&c->ec_stripes_heap_lock);
1132                         struct printbuf buf = PRINTBUF;
1133                         bch2_bkey_val_to_text(&buf, c, k);
1134                         bch_err_ratelimited(c, "pointer to nonexistent stripe %llu\n  while marking %s",
1135                                             (u64) p.ec.idx, buf.buf);
1136                         printbuf_exit(&buf);
1137                         bch2_inconsistent_error(c);
1138                         return -EIO;
1139                 }
1140
1141                 m->block_sectors[p.ec.block] += sectors;
1142
1143                 struct bch_replicas_padded r = m->r;
1144                 mutex_unlock(&c->ec_stripes_heap_lock);
1145
1146                 r.e.data_type = data_type;
1147                 bch2_update_replicas(c, k, &r.e, sectors, trans->journal_res.seq, true);
1148         }
1149
1150         return 0;
1151 }
1152
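/*
 * Shared trigger body for extents and btree node pointers: walk every
 * pointer, accounting cached pointers as cached sectors, erasure coded
 * pointers via bch2_trigger_stripe_ptr(), and the remaining dirty pointers
 * against a single replicas entry built up as we go.
 */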
1153 static int __trigger_extent(struct btree_trans *trans,
1154                             enum btree_id btree_id, unsigned level,
1155                             struct bkey_s_c k,
1156                             enum btree_iter_update_trigger_flags flags)
1157 {
1158         bool gc = flags & BTREE_TRIGGER_gc;
1159         struct bch_fs *c = trans->c;
1160         struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
1161         const union bch_extent_entry *entry;
1162         struct extent_ptr_decoded p;
1163         struct bch_replicas_padded r;
1164         enum bch_data_type data_type = bkey_is_btree_ptr(k.k)
1165                 ? BCH_DATA_btree
1166                 : BCH_DATA_user;
1167         s64 replicas_sectors = 0;
1168         int ret = 0;
1169
1170         r.e.data_type   = data_type;
1171         r.e.nr_devs     = 0;
1172         r.e.nr_required = 1;
1173
1174         bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
1175                 s64 disk_sectors = 0;
1176                 ret = bch2_trigger_pointer(trans, btree_id, level, k, p, entry, &disk_sectors, flags);
1177                 if (ret < 0)
1178                         return ret;
1179
1180                 bool stale = ret > 0;
1181
1182                 if (p.ptr.cached) {
1183                         if (!stale) {
1184                                 ret = !gc
1185                                         ? bch2_update_cached_sectors_list(trans, p.ptr.dev, disk_sectors)
1186                                         : update_cached_sectors(c, k, p.ptr.dev, disk_sectors, 0, true);
1187                                 bch2_fs_fatal_err_on(ret && gc, c, "%s: no replicas entry while updating cached sectors",
1188                                                      bch2_err_str(ret));
1189                                 if (ret)
1190                                         return ret;
1191                         }
1192                 } else if (!p.has_ec) {
1193                         replicas_sectors       += disk_sectors;
1194                         r.e.devs[r.e.nr_devs++] = p.ptr.dev;
1195                 } else {
1196                         ret = bch2_trigger_stripe_ptr(trans, k, p, data_type, disk_sectors, flags);
1197                         if (ret)
1198                                 return ret;
1199
1200                         /*
1201                          * There may be other dirty pointers in this extent, but
1202                          * if so they're not required for mounting if we have an
1203                          * erasure coded pointer in this extent:
1204                          */
1205                         r.e.nr_required = 0;
1206                 }
1207         }
1208
1209         if (r.e.nr_devs) {
1210                 ret = !gc
1211                         ? bch2_update_replicas_list(trans, &r.e, replicas_sectors)
1212                         : bch2_update_replicas(c, k, &r.e, replicas_sectors, 0, true);
1213                 if (unlikely(ret && gc)) {
1214                         struct printbuf buf = PRINTBUF;
1215
1216                         bch2_bkey_val_to_text(&buf, c, k);
1217                         bch2_fs_fatal_error(c, ": no replicas entry for %s", buf.buf);
1218                         printbuf_exit(&buf);
1219                 }
1220                 if (ret)
1221                         return ret;
1222         }
1223
1224         return 0;
1225 }
1226
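/*
 * Top-level extent trigger: nothing to do if the pointers didn't change;
 * otherwise maintain the rebalance_work btree when the key's rebalance
 * state flips, then run __trigger_extent() for the overwrite and the
 * insert.
 */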
1227 int bch2_trigger_extent(struct btree_trans *trans,
1228                         enum btree_id btree, unsigned level,
1229                         struct bkey_s_c old, struct bkey_s new,
1230                         enum btree_iter_update_trigger_flags flags)
1231 {
1232         struct bkey_ptrs_c new_ptrs = bch2_bkey_ptrs_c(new.s_c);
1233         struct bkey_ptrs_c old_ptrs = bch2_bkey_ptrs_c(old);
1234         unsigned new_ptrs_bytes = (void *) new_ptrs.end - (void *) new_ptrs.start;
1235         unsigned old_ptrs_bytes = (void *) old_ptrs.end - (void *) old_ptrs.start;
1236
1237         if (unlikely(flags & BTREE_TRIGGER_check_repair))
1238                 return bch2_check_fix_ptrs(trans, btree, level, new.s_c, flags);
1239
1240         /* if pointers aren't changing - nothing to do: */
1241         if (new_ptrs_bytes == old_ptrs_bytes &&
1242             !memcmp(new_ptrs.start,
1243                     old_ptrs.start,
1244                     new_ptrs_bytes))
1245                 return 0;
1246
1247         if (flags & BTREE_TRIGGER_transactional) {
1248                 struct bch_fs *c = trans->c;
1249                 int mod = (int) bch2_bkey_needs_rebalance(c, new.s_c) -
1250                           (int) bch2_bkey_needs_rebalance(c, old);
1251
1252                 if (mod) {
1253                         int ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_rebalance_work,
1254                                                               new.k->p, mod > 0);
1255                         if (ret)
1256                                 return ret;
1257                 }
1258         }
1259
1260         if (flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc))
1261                 return trigger_run_overwrite_then_insert(__trigger_extent, trans, btree, level, old, new, flags);
1262
1263         return 0;
1264 }
1265
1266 /* KEY_TYPE_reservation */
1267
1268 static int __trigger_reservation(struct btree_trans *trans,
1269                         enum btree_id btree_id, unsigned level, struct bkey_s_c k,
1270                         enum btree_iter_update_trigger_flags flags)
1271 {
1272         struct bch_fs *c = trans->c;
1273         unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas;
1274         s64 sectors = (s64) k.k->size * replicas;
1275
1276         if (flags & BTREE_TRIGGER_overwrite)
1277                 sectors = -sectors;
1278
1279         if (flags & BTREE_TRIGGER_transactional) {
1280                 int ret = bch2_replicas_deltas_realloc(trans, 0);
1281                 if (ret)
1282                         return ret;
1283
1284                 struct replicas_delta_list *d = trans->fs_usage_deltas;
1285                 replicas = min(replicas, ARRAY_SIZE(d->persistent_reserved));
1286
1287                 d->persistent_reserved[replicas - 1] += sectors;
1288         }
1289
1290         if (flags & BTREE_TRIGGER_gc) {
1291                 percpu_down_read(&c->mark_lock);
1292                 preempt_disable();
1293
1294                 struct bch_fs_usage *fs_usage = this_cpu_ptr(c->usage_gc);
1295
1296                 replicas = min(replicas, ARRAY_SIZE(fs_usage->persistent_reserved));
1297                 fs_usage->b.reserved                            += sectors;
1298                 fs_usage->persistent_reserved[replicas - 1]     += sectors;
1299
1300                 preempt_enable();
1301                 percpu_up_read(&c->mark_lock);
1302         }
1303
1304         return 0;
1305 }
1306
1307 int bch2_trigger_reservation(struct btree_trans *trans,
1308                           enum btree_id btree_id, unsigned level,
1309                           struct bkey_s_c old, struct bkey_s new,
1310                           enum btree_iter_update_trigger_flags flags)
1311 {
1312         return trigger_run_overwrite_then_insert(__trigger_reservation, trans, btree_id, level, old, new, flags);
1313 }
1314
1315 /* Mark superblocks: */
1316
1317 static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
1318                                     struct bch_dev *ca, u64 b,
1319                                     enum bch_data_type type,
1320                                     unsigned sectors)
1321 {
1322         struct bch_fs *c = trans->c;
1323         struct btree_iter iter;
1324         int ret = 0;
1325
1326         struct bkey_i_alloc_v4 *a =
1327                 bch2_trans_start_alloc_update_noupdate(trans, &iter, POS(ca->dev_idx, b));
1328         if (IS_ERR(a))
1329                 return PTR_ERR(a);
1330
1331         if (a->v.data_type && type && a->v.data_type != type) {
1332                 bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
1333                               BCH_FSCK_ERR_bucket_metadata_type_mismatch,
1334                         "bucket %llu:%llu gen %u different types of data in same bucket: %s, %s\n"
1335                         "while marking %s",
1336                         iter.pos.inode, iter.pos.offset, a->v.gen,
1337                         bch2_data_type_str(a->v.data_type),
1338                         bch2_data_type_str(type),
1339                         bch2_data_type_str(type));
1340                 ret = -EIO;
1341                 goto err;
1342         }
1343
1344         if (a->v.data_type      != type ||
1345             a->v.dirty_sectors  != sectors) {
1346                 a->v.data_type          = type;
1347                 a->v.dirty_sectors      = sectors;
1348                 ret = bch2_trans_update(trans, &iter, &a->k_i, 0);
1349         }
1350 err:
1351         bch2_trans_iter_exit(trans, &iter);
1352         return ret;
1353 }
1354
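/*
 * GC-path counterpart of the above: update the in-memory gc bucket directly
 * under bucket_lock and fold the old/new bucket state into device usage.
 * Missing buckets, type mismatches and sector count overflow are flagged as
 * filesystem inconsistencies and fail with -EIO.
 */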
1355 static int bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
1356                         u64 b, enum bch_data_type data_type, unsigned sectors,
1357                         enum btree_iter_update_trigger_flags flags)
1358 {
1359         percpu_down_read(&c->mark_lock);
1360         struct bucket *g = gc_bucket(ca, b);
1361         if (bch2_fs_inconsistent_on(!g, c, "reference to invalid bucket on device %u when marking metadata type %s",
1362                                     ca->dev_idx, bch2_data_type_str(data_type)))
1363                 goto err_unlock;
1364
1365         bucket_lock(g);
1366         struct bch_alloc_v4 old = bucket_m_to_alloc(*g);
1367
1368         if (bch2_fs_inconsistent_on(g->data_type &&
1369                         g->data_type != data_type, c,
1370                         "different types of data in same bucket: %s, %s",
1371                         bch2_data_type_str(g->data_type),
1372                         bch2_data_type_str(data_type)))
1373                 goto err;
1374
1375         if (bch2_fs_inconsistent_on((u64) g->dirty_sectors + sectors > ca->mi.bucket_size, c,
1376                         "bucket %u:%llu gen %u data type %s sector count overflow: %u + %u > bucket size",
1377                         ca->dev_idx, b, g->gen,
1378                         bch2_data_type_str(g->data_type ?: data_type),
1379                         g->dirty_sectors, sectors))
1380                 goto err;
1381
1382         g->data_type = data_type;
1383         g->dirty_sectors += sectors;
1384         struct bch_alloc_v4 new = bucket_m_to_alloc(*g);
1385         bch2_dev_usage_update(c, ca, &old, &new, 0, true);
             bucket_unlock(g);
1386         percpu_up_read(&c->mark_lock);
1387         return 0;
1388 err:
1389         bucket_unlock(g);
1390 err_unlock:
1391         percpu_up_read(&c->mark_lock);
1392         return -EIO;
1393 }
1394
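/*
 * Entry point for marking metadata buckets (only free, sb and journal types
 * are expected): buckets past ca->mi.nbuckets are ignored, the gc flag routes
 * to the in-memory path above, and the transactional flag runs
 * __bch2_trans_mark_metadata_bucket() via commit_do().
 */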
1395 int bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
1396                         struct bch_dev *ca, u64 b,
1397                         enum bch_data_type type, unsigned sectors,
1398                         enum btree_iter_update_trigger_flags flags)
1399 {
1400         BUG_ON(type != BCH_DATA_free &&
1401                type != BCH_DATA_sb &&
1402                type != BCH_DATA_journal);
1403
1404         /*
1405          * Backup superblock might be past the end of our normal usable space:
1406          */
1407         if (b >= ca->mi.nbuckets)
1408                 return 0;
1409
1410         if (flags & BTREE_TRIGGER_gc)
1411                 return bch2_mark_metadata_bucket(trans->c, ca, b, type, sectors, flags);
1412         else if (flags & BTREE_TRIGGER_transactional)
1413                 return commit_do(trans, NULL, NULL, 0,
1414                                  __bch2_trans_mark_metadata_bucket(trans, ca, b, type, sectors));
1415         else
1416                 BUG();
1417 }
1418
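/*
 * Mark a range of metadata sectors, batching per bucket: sectors accumulate
 * in *bucket/*bucket_sectors and are flushed with
 * bch2_trans_mark_metadata_bucket() when the range crosses into a new bucket;
 * the caller is responsible for flushing the final, still-pending bucket.
 */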
1419 static int bch2_trans_mark_metadata_sectors(struct btree_trans *trans,
1420                         struct bch_dev *ca, u64 start, u64 end,
1421                         enum bch_data_type type, u64 *bucket, unsigned *bucket_sectors,
1422                         enum btree_iter_update_trigger_flags flags)
1423 {
1424         do {
1425                 u64 b = sector_to_bucket(ca, start);
1426                 unsigned sectors =
1427                         min_t(u64, bucket_to_sector(ca, b + 1), end) - start;
1428
1429                 if (b != *bucket && *bucket_sectors) {
1430                         int ret = bch2_trans_mark_metadata_bucket(trans, ca, *bucket,
1431                                                         type, *bucket_sectors, flags);
1432                         if (ret)
1433                                 return ret;
1434
1435                         *bucket_sectors = 0;
1436                 }
1437
1438                 *bucket         = b;
1439                 *bucket_sectors += sectors;
1440                 start += sectors;
1441         } while (start < end);
1442
1443         return 0;
1444 }
1445
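/*
 * Mark everything the superblock layout of a device owns: the sectors below
 * BCH_SB_SECTOR when the primary superblock is part of the layout, each
 * superblock copy itself, and finally the device's journal buckets.
 */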
1446 static int __bch2_trans_mark_dev_sb(struct btree_trans *trans, struct bch_dev *ca,
1447                         enum btree_iter_update_trigger_flags flags)
1448 {
1449         struct bch_sb_layout *layout = &ca->disk_sb.sb->layout;
1450         u64 bucket = 0;
1451         unsigned i, bucket_sectors = 0;
1452         int ret;
1453
1454         for (i = 0; i < layout->nr_superblocks; i++) {
1455                 u64 offset = le64_to_cpu(layout->sb_offset[i]);
1456
1457                 if (offset == BCH_SB_SECTOR) {
1458                         ret = bch2_trans_mark_metadata_sectors(trans, ca,
1459                                                 0, BCH_SB_SECTOR,
1460                                                 BCH_DATA_sb, &bucket, &bucket_sectors, flags);
1461                         if (ret)
1462                                 return ret;
1463                 }
1464
1465                 ret = bch2_trans_mark_metadata_sectors(trans, ca, offset,
1466                                       offset + (1 << layout->sb_max_size_bits),
1467                                       BCH_DATA_sb, &bucket, &bucket_sectors, flags);
1468                 if (ret)
1469                         return ret;
1470         }
1471
1472         if (bucket_sectors) {
1473                 ret = bch2_trans_mark_metadata_bucket(trans, ca,
1474                                 bucket, BCH_DATA_sb, bucket_sectors, flags);
1475                 if (ret)
1476                         return ret;
1477         }
1478
1479         for (i = 0; i < ca->journal.nr; i++) {
1480                 ret = bch2_trans_mark_metadata_bucket(trans, ca,
1481                                 ca->journal.buckets[i],
1482                                 BCH_DATA_journal, ca->mi.bucket_size, flags);
1483                 if (ret)
1484                         return ret;
1485         }
1486
1487         return 0;
1488 }
1489
1490 int bch2_trans_mark_dev_sb(struct bch_fs *c, struct bch_dev *ca,
1491                         enum btree_iter_update_trigger_flags flags)
1492 {
1493         int ret = bch2_trans_run(c,
1494                 __bch2_trans_mark_dev_sb(trans, ca, flags));
1495         bch_err_fn(c, ret);
1496         return ret;
1497 }
1498
1499 int bch2_trans_mark_dev_sbs_flags(struct bch_fs *c,
1500                         enum btree_iter_update_trigger_flags flags)
1501 {
1502         for_each_online_member(c, ca) {
1503                 int ret = bch2_trans_mark_dev_sb(c, ca, flags);
1504                 if (ret) {
1505                         percpu_ref_put(&ca->io_ref);
1506                         return ret;
1507                 }
1508         }
1509
1510         return 0;
1511 }
1512
1513 int bch2_trans_mark_dev_sbs(struct bch_fs *c)
1514 {
1515         return bch2_trans_mark_dev_sbs_flags(c, BTREE_TRIGGER_transactional);
1516 }
1517
1518 /* Disk reservations: */
1519
1520 #define SECTORS_CACHE   1024
1521
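/*
 * Reservation fast path: each CPU keeps a small cache of pre-reserved
 * sectors, refilled from the global c->sectors_available counter in chunks of
 * up to SECTORS_CACHE, so most reservations avoid touching the shared atomic.
 * When the global pool can't cover a request we recalculate free space under
 * sectors_available_lock; with BCH_DISK_RESERVATION_NOFAIL the reservation
 * succeeds even if it exceeds the recalculated amount.
 */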
1522 int __bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res,
1523                               u64 sectors, int flags)
1524 {
1525         struct bch_fs_pcpu *pcpu;
1526         u64 old, v, get;
1527         s64 sectors_available;
1528         int ret;
1529
1530         percpu_down_read(&c->mark_lock);
1531         preempt_disable();
1532         pcpu = this_cpu_ptr(c->pcpu);
1533
1534         if (sectors <= pcpu->sectors_available)
1535                 goto out;
1536
1537         v = atomic64_read(&c->sectors_available);
1538         do {
1539                 old = v;
1540                 get = min((u64) sectors + SECTORS_CACHE, old);
1541
1542                 if (get < sectors) {
1543                         preempt_enable();
1544                         goto recalculate;
1545                 }
1546         } while ((v = atomic64_cmpxchg(&c->sectors_available,
1547                                        old, old - get)) != old);
1548
1549         pcpu->sectors_available         += get;
1550
1551 out:
1552         pcpu->sectors_available         -= sectors;
1553         this_cpu_add(*c->online_reserved, sectors);
1554         res->sectors                    += sectors;
1555
1556         preempt_enable();
1557         percpu_up_read(&c->mark_lock);
1558         return 0;
1559
1560 recalculate:
1561         mutex_lock(&c->sectors_available_lock);
1562
1563         percpu_u64_set(&c->pcpu->sectors_available, 0);
1564         sectors_available = avail_factor(__bch2_fs_usage_read_short(c).free);
1565
1566         if (sectors <= sectors_available ||
1567             (flags & BCH_DISK_RESERVATION_NOFAIL)) {
1568                 atomic64_set(&c->sectors_available,
1569                              max_t(s64, 0, sectors_available - sectors));
1570                 this_cpu_add(*c->online_reserved, sectors);
1571                 res->sectors                    += sectors;
1572                 ret = 0;
1573         } else {
1574                 atomic64_set(&c->sectors_available, sectors_available);
1575                 ret = -BCH_ERR_ENOSPC_disk_reservation;
1576         }
1577
1578         mutex_unlock(&c->sectors_available_lock);
1579         percpu_up_read(&c->mark_lock);
1580
1581         return ret;
1582 }
1583
1584 /* Startup/shutdown: */
1585
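/*
 * buckets_nouse is a per-device bitmap with one bit per bucket; it is
 * allocated zeroed for each member device here and freed via
 * kvfree_rcu_mightsleep().
 */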
1586 void bch2_buckets_nouse_free(struct bch_fs *c)
1587 {
1588         for_each_member_device(c, ca) {
1589                 kvfree_rcu_mightsleep(ca->buckets_nouse);
1590                 ca->buckets_nouse = NULL;
1591         }
1592 }
1593
1594 int bch2_buckets_nouse_alloc(struct bch_fs *c)
1595 {
1596         for_each_member_device(c, ca) {
1597                 BUG_ON(ca->buckets_nouse);
1598
1599                 ca->buckets_nouse = kvmalloc(BITS_TO_LONGS(ca->mi.nbuckets) *
1600                                             sizeof(unsigned long),
1601                                             GFP_KERNEL|__GFP_ZERO);
1602                 if (!ca->buckets_nouse) {
1603                         bch2_dev_put(ca);
1604                         return -BCH_ERR_ENOMEM_buckets_nouse;
1605                 }
1606         }
1607
1608         return 0;
1609 }
1610
1611 static void bucket_gens_free_rcu(struct rcu_head *rcu)
1612 {
1613         struct bucket_gens *buckets =
1614                 container_of(rcu, struct bucket_gens, rcu);
1615
1616         kvfree(buckets);
1617 }
1618
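/*
 * Resize (or initially allocate) a device's bucket generation array: build a
 * new bucket_gens, copy the old generations across when resizing (under
 * gc_lock, bucket_lock and mark_lock), publish the new array with
 * rcu_assign_pointer(), and free the previous one after a grace period via
 * call_rcu().
 */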
1619 int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
1620 {
1621         struct bucket_gens *bucket_gens = NULL, *old_bucket_gens = NULL;
1622         bool resize = ca->bucket_gens != NULL;
1623         int ret;
1624
1625         BUG_ON(resize && ca->buckets_nouse);
1626
1627         if (!(bucket_gens       = kvmalloc(sizeof(struct bucket_gens) + nbuckets,
1628                                            GFP_KERNEL|__GFP_ZERO))) {
1629                 ret = -BCH_ERR_ENOMEM_bucket_gens;
1630                 goto err;
1631         }
1632
1633         bucket_gens->first_bucket = ca->mi.first_bucket;
1634         bucket_gens->nbuckets   = nbuckets;
1635         bucket_gens->nbuckets_minus_first =
1636                 bucket_gens->nbuckets - bucket_gens->first_bucket;
1637
1638         if (resize) {
1639                 down_write(&c->gc_lock);
1640                 down_write(&ca->bucket_lock);
1641                 percpu_down_write(&c->mark_lock);
1642         }
1643
1644         old_bucket_gens = rcu_dereference_protected(ca->bucket_gens, 1);
1645
1646         if (resize) {
1647                 size_t n = min(bucket_gens->nbuckets, old_bucket_gens->nbuckets);
1648
1649                 memcpy(bucket_gens->b,
1650                        old_bucket_gens->b,
1651                        n);
1652         }
1653
1654         rcu_assign_pointer(ca->bucket_gens, bucket_gens);
1655         bucket_gens     = old_bucket_gens;
1656
1657         nbuckets = ca->mi.nbuckets;
1658
1659         if (resize) {
1660                 percpu_up_write(&c->mark_lock);
1661                 up_write(&ca->bucket_lock);
1662                 up_write(&c->gc_lock);
1663         }
1664
1665         ret = 0;
1666 err:
1667         if (bucket_gens)
1668                 call_rcu(&bucket_gens->rcu, bucket_gens_free_rcu);
1669
1670         return ret;
1671 }
1672
1673 void bch2_dev_buckets_free(struct bch_dev *ca)
1674 {
1675         kvfree(ca->buckets_nouse);
1676         kvfree(rcu_dereference_protected(ca->bucket_gens, 1));
1677
1678         for (unsigned i = 0; i < ARRAY_SIZE(ca->usage); i++)
1679                 free_percpu(ca->usage[i]);
1680         kfree(ca->usage_base);
1681 }
1682
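/*
 * Allocate a device's usage counters (usage_base plus the percpu usage
 * array), then size the bucket_gens array for ca->mi.nbuckets via
 * bch2_dev_buckets_resize().
 */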
1683 int bch2_dev_buckets_alloc(struct bch_fs *c, struct bch_dev *ca)
1684 {
1685         ca->usage_base = kzalloc(sizeof(struct bch_dev_usage), GFP_KERNEL);
1686         if (!ca->usage_base)
1687                 return -BCH_ERR_ENOMEM_usage_init;
1688
1689         for (unsigned i = 0; i < ARRAY_SIZE(ca->usage); i++) {
1690                 ca->usage[i] = alloc_percpu(struct bch_dev_usage);
1691                 if (!ca->usage[i])
1692                         return -BCH_ERR_ENOMEM_usage_init;
1693         }
1694
1695         return bch2_dev_buckets_resize(c, ca, ca->mi.nbuckets);
1696 }