// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
#include "alloc_background.h"
#include "alloc_foreground.h"
#include "backpointers.h"
#include "bkey_buf.h"
#include "btree_cache.h"
#include "btree_io.h"
#include "btree_key_cache.h"
#include "btree_update.h"
#include "btree_update_interior.h"
#include "btree_gc.h"
#include "btree_write_buffer.h"
#include "buckets.h"
#include "buckets_waiting_for_journal.h"
#include "clock.h"
#include "debug.h"
#include "disk_accounting.h"
#include "ec.h"
#include "error.h"
#include "lru.h"
#include "recovery.h"
#include "trace.h"
#include "varint.h"

#include <linux/kthread.h>
#include <linux/math64.h>
#include <linux/random.h>
#include <linux/rculist.h>
#include <linux/rcupdate.h>
#include <linux/sched/task.h>
#include <linux/sort.h>
#include <linux/jiffies.h>

static void bch2_discard_one_bucket_fast(struct bch_dev *, u64);

/* Persistent alloc info: */

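/*
 * Alloc keys have been through several on-disk versions: v1 packs one
 * fixed-width little-endian field per bit set in bch_alloc->fields; v2 and
 * v3 store fields as varints behind a field count, with v3 adding
 * journal_seq and the need_discard/need_inc_gen flags; v4 is the current
 * fixed-layout version used for all in-memory work. The helpers below
 * decode the older versions into struct bkey_alloc_unpacked.
 */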
static const unsigned BCH_ALLOC_V1_FIELD_BYTES[] = {
#define x(name, bits) [BCH_ALLOC_FIELD_V1_##name] = bits / 8,
        BCH_ALLOC_FIELDS_V1()
#undef x
};

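/*
 * In-memory unpacked form of the pre-v4 alloc key formats; fields not
 * present in an older key simply decode to 0:
 */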
struct bkey_alloc_unpacked {
        u64             journal_seq;
        u8              gen;
        u8              oldest_gen;
        u8              data_type;
        bool            need_discard:1;
        bool            need_inc_gen:1;
#define x(_name, _bits) u##_bits _name;
        BCH_ALLOC_FIELDS_V2()
#undef  x
};

static inline u64 alloc_field_v1_get(const struct bch_alloc *a,
                                     const void **p, unsigned field)
{
        unsigned bytes = BCH_ALLOC_V1_FIELD_BYTES[field];
        u64 v;

        if (!(a->fields & (1 << field)))
                return 0;

        switch (bytes) {
        case 1:
                v = *((const u8 *) *p);
                break;
        case 2:
                v = le16_to_cpup(*p);
                break;
        case 4:
                v = le32_to_cpup(*p);
                break;
        case 8:
                v = le64_to_cpup(*p);
                break;
        default:
                BUG();
        }

        *p += bytes;
        return v;
}

static void bch2_alloc_unpack_v1(struct bkey_alloc_unpacked *out,
                                 struct bkey_s_c k)
{
        const struct bch_alloc *in = bkey_s_c_to_alloc(k).v;
        const void *d = in->data;
        unsigned idx = 0;

        out->gen = in->gen;

#define x(_name, _bits) out->_name = alloc_field_v1_get(in, &d, idx++);
        BCH_ALLOC_FIELDS_V1()
#undef  x
}

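/*
 * v2/v3 values store each field as a varint, with nr_fields recording how
 * many were written: fields past nr_fields decode to 0, which is how newly
 * added fields read back from old keys. The "v != out->_name" check below
 * catches decoded values that don't fit the (narrower) unpacked field.
 */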
static int bch2_alloc_unpack_v2(struct bkey_alloc_unpacked *out,
                                struct bkey_s_c k)
{
        struct bkey_s_c_alloc_v2 a = bkey_s_c_to_alloc_v2(k);
        const u8 *in = a.v->data;
        const u8 *end = bkey_val_end(a);
        unsigned fieldnr = 0;
        int ret;
        u64 v;

        out->gen        = a.v->gen;
        out->oldest_gen = a.v->oldest_gen;
        out->data_type  = a.v->data_type;

#define x(_name, _bits)                                                 \
        if (fieldnr < a.v->nr_fields) {                                 \
                ret = bch2_varint_decode_fast(in, end, &v);             \
                if (ret < 0)                                            \
                        return ret;                                     \
                in += ret;                                              \
        } else {                                                        \
                v = 0;                                                  \
        }                                                               \
        out->_name = v;                                                 \
        if (v != out->_name)                                            \
                return -1;                                              \
        fieldnr++;

        BCH_ALLOC_FIELDS_V2()
#undef  x
        return 0;
}

static int bch2_alloc_unpack_v3(struct bkey_alloc_unpacked *out,
                                struct bkey_s_c k)
{
        struct bkey_s_c_alloc_v3 a = bkey_s_c_to_alloc_v3(k);
        const u8 *in = a.v->data;
        const u8 *end = bkey_val_end(a);
        unsigned fieldnr = 0;
        int ret;
        u64 v;

        out->gen        = a.v->gen;
        out->oldest_gen = a.v->oldest_gen;
        out->data_type  = a.v->data_type;
        out->need_discard = BCH_ALLOC_V3_NEED_DISCARD(a.v);
        out->need_inc_gen = BCH_ALLOC_V3_NEED_INC_GEN(a.v);
        out->journal_seq = le64_to_cpu(a.v->journal_seq);

#define x(_name, _bits)                                                 \
        if (fieldnr < a.v->nr_fields) {                                 \
                ret = bch2_varint_decode_fast(in, end, &v);             \
                if (ret < 0)                                            \
                        return ret;                                     \
                in += ret;                                              \
        } else {                                                        \
                v = 0;                                                  \
        }                                                               \
        out->_name = v;                                                 \
        if (v != out->_name)                                            \
                return -1;                                              \
        fieldnr++;

        BCH_ALLOC_FIELDS_V2()
#undef  x
        return 0;
}

static struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c k)
{
        struct bkey_alloc_unpacked ret = { .gen = 0 };

        switch (k.k->type) {
        case KEY_TYPE_alloc:
                bch2_alloc_unpack_v1(&ret, k);
                break;
        case KEY_TYPE_alloc_v2:
                bch2_alloc_unpack_v2(&ret, k);
                break;
        case KEY_TYPE_alloc_v3:
                bch2_alloc_unpack_v3(&ret, k);
                break;
        }

        return ret;
}

static unsigned bch_alloc_v1_val_u64s(const struct bch_alloc *a)
{
        unsigned i, bytes = offsetof(struct bch_alloc, data);

        for (i = 0; i < ARRAY_SIZE(BCH_ALLOC_V1_FIELD_BYTES); i++)
                if (a->fields & (1 << i))
                        bytes += BCH_ALLOC_V1_FIELD_BYTES[i];

        return DIV_ROUND_UP(bytes, sizeof(u64));
}

int bch2_alloc_v1_validate(struct bch_fs *c, struct bkey_s_c k,
                           struct bkey_validate_context from)
{
        struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k);
        int ret = 0;

        /* allow for unknown fields */
        bkey_fsck_err_on(bkey_val_u64s(a.k) < bch_alloc_v1_val_u64s(a.v),
                         c, alloc_v1_val_size_bad,
                         "incorrect value size (%zu < %u)",
                         bkey_val_u64s(a.k), bch_alloc_v1_val_u64s(a.v));
fsck_err:
        return ret;
}

int bch2_alloc_v2_validate(struct bch_fs *c, struct bkey_s_c k,
                           struct bkey_validate_context from)
{
        struct bkey_alloc_unpacked u;
        int ret = 0;

        bkey_fsck_err_on(bch2_alloc_unpack_v2(&u, k),
                         c, alloc_v2_unpack_error,
                         "unpack error");
fsck_err:
        return ret;
}

int bch2_alloc_v3_validate(struct bch_fs *c, struct bkey_s_c k,
                           struct bkey_validate_context from)
{
        struct bkey_alloc_unpacked u;
        int ret = 0;

        bkey_fsck_err_on(bch2_alloc_unpack_v3(&u, k),
                         c, alloc_v2_unpack_error,
                         "unpack error");
fsck_err:
        return ret;
}

int bch2_alloc_v4_validate(struct bch_fs *c, struct bkey_s_c k,
                           struct bkey_validate_context from)
{
        struct bch_alloc_v4 a;
        int ret = 0;

        bkey_val_copy(&a, bkey_s_c_to_alloc_v4(k));

        bkey_fsck_err_on(alloc_v4_u64s_noerror(&a) > bkey_val_u64s(k.k),
                         c, alloc_v4_val_size_bad,
                         "bad val size (%u > %zu)",
                         alloc_v4_u64s_noerror(&a), bkey_val_u64s(k.k));

        bkey_fsck_err_on(!BCH_ALLOC_V4_BACKPOINTERS_START(&a) &&
                         BCH_ALLOC_V4_NR_BACKPOINTERS(&a),
                         c, alloc_v4_backpointers_start_bad,
                         "invalid backpointers_start");

        bkey_fsck_err_on(alloc_data_type(a, a.data_type) != a.data_type,
                         c, alloc_key_data_type_bad,
                         "invalid data type (got %u should be %u)",
                         a.data_type, alloc_data_type(a, a.data_type));

        for (unsigned i = 0; i < 2; i++)
                bkey_fsck_err_on(a.io_time[i] > LRU_TIME_MAX,
                                 c, alloc_key_io_time_bad,
                                 "invalid io_time[%s]: %llu, max %llu",
                                 i == READ ? "read" : "write",
                                 a.io_time[i], LRU_TIME_MAX);

        unsigned stripe_sectors = BCH_ALLOC_V4_BACKPOINTERS_START(&a) * sizeof(u64) >
                offsetof(struct bch_alloc_v4, stripe_sectors)
                ? a.stripe_sectors
                : 0;

        switch (a.data_type) {
        case BCH_DATA_free:
        case BCH_DATA_need_gc_gens:
        case BCH_DATA_need_discard:
                bkey_fsck_err_on(stripe_sectors ||
                                 a.dirty_sectors ||
                                 a.cached_sectors ||
                                 a.stripe,
                                 c, alloc_key_empty_but_have_data,
                                 "empty data type free but have data %u.%u.%u %u",
                                 stripe_sectors,
                                 a.dirty_sectors,
                                 a.cached_sectors,
                                 a.stripe);
                break;
        case BCH_DATA_sb:
        case BCH_DATA_journal:
        case BCH_DATA_btree:
        case BCH_DATA_user:
        case BCH_DATA_parity:
                bkey_fsck_err_on(!a.dirty_sectors &&
                                 !stripe_sectors,
                                 c, alloc_key_dirty_sectors_0,
                                 "data_type %s but dirty_sectors==0",
                                 bch2_data_type_str(a.data_type));
                break;
        case BCH_DATA_cached:
                bkey_fsck_err_on(!a.cached_sectors ||
                                 a.dirty_sectors ||
                                 stripe_sectors ||
                                 a.stripe,
                                 c, alloc_key_cached_inconsistency,
                                 "data type inconsistency");

                bkey_fsck_err_on(!a.io_time[READ] &&
                                 c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_to_lru_refs,
                                 c, alloc_key_cached_but_read_time_zero,
                                 "cached bucket with read_time == 0");
                break;
        case BCH_DATA_stripe:
                break;
        }
fsck_err:
        return ret;
}

void bch2_alloc_v4_swab(struct bkey_s k)
{
        struct bch_alloc_v4 *a = bkey_s_to_alloc_v4(k).v;

        a->journal_seq_nonempty = swab64(a->journal_seq_nonempty);
        a->journal_seq_empty    = swab64(a->journal_seq_empty);
        a->flags                = swab32(a->flags);
        a->dirty_sectors        = swab32(a->dirty_sectors);
        a->cached_sectors       = swab32(a->cached_sectors);
        a->io_time[0]           = swab64(a->io_time[0]);
        a->io_time[1]           = swab64(a->io_time[1]);
        a->stripe               = swab32(a->stripe);
        a->nr_external_backpointers = swab32(a->nr_external_backpointers);
        a->stripe_sectors       = swab32(a->stripe_sectors);
}

void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
{
        struct bch_alloc_v4 _a;
        const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &_a);
        struct bch_dev *ca = c ? bch2_dev_bucket_tryget_noerror(c, k.k->p) : NULL;

        prt_newline(out);
        printbuf_indent_add(out, 2);

        prt_printf(out, "gen %u oldest_gen %u data_type ", a->gen, a->oldest_gen);
        bch2_prt_data_type(out, a->data_type);
        prt_newline(out);
        prt_printf(out, "journal_seq_nonempty %llu\n",  a->journal_seq_nonempty);
        prt_printf(out, "journal_seq_empty    %llu\n",  a->journal_seq_empty);
        prt_printf(out, "need_discard         %llu\n",  BCH_ALLOC_V4_NEED_DISCARD(a));
        prt_printf(out, "need_inc_gen         %llu\n",  BCH_ALLOC_V4_NEED_INC_GEN(a));
        prt_printf(out, "dirty_sectors        %u\n",    a->dirty_sectors);
        prt_printf(out, "stripe_sectors       %u\n",    a->stripe_sectors);
        prt_printf(out, "cached_sectors       %u\n",    a->cached_sectors);
        prt_printf(out, "stripe               %u\n",    a->stripe);
        prt_printf(out, "stripe_redundancy    %u\n",    a->stripe_redundancy);
        prt_printf(out, "io_time[READ]        %llu\n",  a->io_time[READ]);
        prt_printf(out, "io_time[WRITE]       %llu\n",  a->io_time[WRITE]);

        if (ca)
                prt_printf(out, "fragmentation     %llu\n",     alloc_lru_idx_fragmentation(*a, ca));
        prt_printf(out, "bp_start          %llu\n", BCH_ALLOC_V4_BACKPOINTERS_START(a));
        printbuf_indent_sub(out, 2);

        bch2_dev_put(ca);
}

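/*
 * Normalize any alloc key version to the current v4 layout: for v4 keys
 * the value is copied and the backpointers region is moved to
 * BCH_ALLOC_V4_U64s (zeroing the gap); older versions are unpacked and
 * rebuilt field by field:
 */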
void __bch2_alloc_to_v4(struct bkey_s_c k, struct bch_alloc_v4 *out)
{
        if (k.k->type == KEY_TYPE_alloc_v4) {
                void *src, *dst;

                *out = *bkey_s_c_to_alloc_v4(k).v;

                src = alloc_v4_backpointers(out);
                SET_BCH_ALLOC_V4_BACKPOINTERS_START(out, BCH_ALLOC_V4_U64s);
                dst = alloc_v4_backpointers(out);

                if (src < dst)
                        memset(src, 0, dst - src);

                SET_BCH_ALLOC_V4_NR_BACKPOINTERS(out, 0);
        } else {
                struct bkey_alloc_unpacked u = bch2_alloc_unpack(k);

                *out = (struct bch_alloc_v4) {
                        .journal_seq_nonempty   = u.journal_seq,
                        .flags                  = u.need_discard,
                        .gen                    = u.gen,
                        .oldest_gen             = u.oldest_gen,
                        .data_type              = u.data_type,
                        .stripe_redundancy      = u.stripe_redundancy,
                        .dirty_sectors          = u.dirty_sectors,
                        .cached_sectors         = u.cached_sectors,
                        .io_time[READ]          = u.read_time,
                        .io_time[WRITE]         = u.write_time,
                        .stripe                 = u.stripe,
                };

                SET_BCH_ALLOC_V4_BACKPOINTERS_START(out, BCH_ALLOC_V4_U64s);
        }
}

static noinline struct bkey_i_alloc_v4 *
__bch2_alloc_to_v4_mut(struct btree_trans *trans, struct bkey_s_c k)
{
        struct bkey_i_alloc_v4 *ret;

        ret = bch2_trans_kmalloc(trans, max(bkey_bytes(k.k), sizeof(struct bkey_i_alloc_v4)));
        if (IS_ERR(ret))
                return ret;

        if (k.k->type == KEY_TYPE_alloc_v4) {
                void *src, *dst;

                bkey_reassemble(&ret->k_i, k);

                src = alloc_v4_backpointers(&ret->v);
                SET_BCH_ALLOC_V4_BACKPOINTERS_START(&ret->v, BCH_ALLOC_V4_U64s);
                dst = alloc_v4_backpointers(&ret->v);

                if (src < dst)
                        memset(src, 0, dst - src);

                SET_BCH_ALLOC_V4_NR_BACKPOINTERS(&ret->v, 0);
                set_alloc_v4_u64s(ret);
        } else {
                bkey_alloc_v4_init(&ret->k_i);
                ret->k.p = k.k->p;
                bch2_alloc_to_v4(k, &ret->v);
        }
        return ret;
}

static inline struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut_inlined(struct btree_trans *trans, struct bkey_s_c k)
{
        struct bkey_s_c_alloc_v4 a;

        if (likely(k.k->type == KEY_TYPE_alloc_v4) &&
            ((a = bkey_s_c_to_alloc_v4(k), true) &&
             BCH_ALLOC_V4_NR_BACKPOINTERS(a.v) == 0))
                return bch2_bkey_make_mut_noupdate_typed(trans, k, alloc_v4);

        return __bch2_alloc_to_v4_mut(trans, k);
}

struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut(struct btree_trans *trans, struct bkey_s_c k)
{
        return bch2_alloc_to_v4_mut_inlined(trans, k);
}

struct bkey_i_alloc_v4 *
bch2_trans_start_alloc_update_noupdate(struct btree_trans *trans, struct btree_iter *iter,
                                       struct bpos pos)
{
        struct bkey_s_c k = bch2_bkey_get_iter(trans, iter, BTREE_ID_alloc, pos,
                                               BTREE_ITER_with_updates|
                                               BTREE_ITER_cached|
                                               BTREE_ITER_intent);
        int ret = bkey_err(k);
        if (unlikely(ret))
                return ERR_PTR(ret);

        struct bkey_i_alloc_v4 *a = bch2_alloc_to_v4_mut_inlined(trans, k);
        ret = PTR_ERR_OR_ZERO(a);
        if (unlikely(ret))
                goto err;
        return a;
err:
        bch2_trans_iter_exit(trans, iter);
        return ERR_PTR(ret);
}

__flatten
struct bkey_i_alloc_v4 *bch2_trans_start_alloc_update(struct btree_trans *trans, struct bpos pos,
                                                      enum btree_iter_update_trigger_flags flags)
{
        struct btree_iter iter;
        struct bkey_i_alloc_v4 *a = bch2_trans_start_alloc_update_noupdate(trans, &iter, pos);
        int ret = PTR_ERR_OR_ZERO(a);
        if (ret)
                return ERR_PTR(ret);

        ret = bch2_trans_update(trans, &iter, &a->k_i, flags);
        bch2_trans_iter_exit(trans, &iter);
        return unlikely(ret) ? ERR_PTR(ret) : a;
}

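/*
 * Each bucket_gens key packs the generation numbers of
 * KEY_TYPE_BUCKET_GENS_NR consecutive buckets (currently 256, one u8 each)
 * into a single value: e.g. with the current 8 bit packing, bucket 0x12345
 * maps to key offset 0x123, array slot 0x45.
 */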
static struct bpos alloc_gens_pos(struct bpos pos, unsigned *offset)
{
        *offset = pos.offset & KEY_TYPE_BUCKET_GENS_MASK;

        pos.offset >>= KEY_TYPE_BUCKET_GENS_BITS;
        return pos;
}

static struct bpos bucket_gens_pos_to_alloc(struct bpos pos, unsigned offset)
{
        pos.offset <<= KEY_TYPE_BUCKET_GENS_BITS;
        pos.offset += offset;
        return pos;
}

static unsigned alloc_gen(struct bkey_s_c k, unsigned offset)
{
        return k.k->type == KEY_TYPE_bucket_gens
                ? bkey_s_c_to_bucket_gens(k).v->gens[offset]
                : 0;
}

int bch2_bucket_gens_validate(struct bch_fs *c, struct bkey_s_c k,
                              struct bkey_validate_context from)
{
        int ret = 0;

        bkey_fsck_err_on(bkey_val_bytes(k.k) != sizeof(struct bch_bucket_gens),
                         c, bucket_gens_val_size_bad,
                         "bad val size (%zu != %zu)",
                         bkey_val_bytes(k.k), sizeof(struct bch_bucket_gens));
fsck_err:
        return ret;
}

void bch2_bucket_gens_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
{
        struct bkey_s_c_bucket_gens g = bkey_s_c_to_bucket_gens(k);
        unsigned i;

        for (i = 0; i < ARRAY_SIZE(g.v->gens); i++) {
                if (i)
                        prt_char(out, ' ');
                prt_printf(out, "%u", g.v->gens[i]);
        }
}

int bch2_bucket_gens_init(struct bch_fs *c)
{
        struct btree_trans *trans = bch2_trans_get(c);
        struct bkey_i_bucket_gens g;
        bool have_bucket_gens_key = false;
        int ret;

        ret = for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN,
                                 BTREE_ITER_prefetch, k, ({
                /*
                 * Not a fsck error because this is checked/repaired by
                 * bch2_check_alloc_key() which runs later:
                 */
                if (!bch2_dev_bucket_exists(c, k.k->p))
                        continue;

                struct bch_alloc_v4 a;
                u8 gen = bch2_alloc_to_v4(k, &a)->gen;
                unsigned offset;
                struct bpos pos = alloc_gens_pos(iter.pos, &offset);
                int ret2 = 0;

                if (have_bucket_gens_key && !bkey_eq(g.k.p, pos)) {
                        ret2 =  bch2_btree_insert_trans(trans, BTREE_ID_bucket_gens, &g.k_i, 0) ?:
                                bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
                        if (ret2)
                                goto iter_err;
                        have_bucket_gens_key = false;
                }

                if (!have_bucket_gens_key) {
                        bkey_bucket_gens_init(&g.k_i);
                        g.k.p = pos;
                        have_bucket_gens_key = true;
                }

                g.v.gens[offset] = gen;
iter_err:
                ret2;
        }));

        if (have_bucket_gens_key && !ret)
                ret = commit_do(trans, NULL, NULL,
                                BCH_TRANS_COMMIT_no_enospc,
                        bch2_btree_insert_trans(trans, BTREE_ID_bucket_gens, &g.k_i, 0));

        bch2_trans_put(trans);

        bch_err_fn(c, ret);
        return ret;
}

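/*
 * Read bucket generation numbers into the in-memory bucket_gen() array at
 * startup: from the bucket_gens btree if the superblock says it has been
 * initialized, otherwise from the alloc btree directly:
 */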
int bch2_alloc_read(struct bch_fs *c)
{
        struct btree_trans *trans = bch2_trans_get(c);
        struct bch_dev *ca = NULL;
        int ret;

        if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_bucket_gens) {
                ret = for_each_btree_key(trans, iter, BTREE_ID_bucket_gens, POS_MIN,
                                         BTREE_ITER_prefetch, k, ({
                        u64 start = bucket_gens_pos_to_alloc(k.k->p, 0).offset;
                        u64 end = bucket_gens_pos_to_alloc(bpos_nosnap_successor(k.k->p), 0).offset;

                        if (k.k->type != KEY_TYPE_bucket_gens)
                                continue;

                        ca = bch2_dev_iterate(c, ca, k.k->p.inode);
                        /*
                         * Not a fsck error because this is checked/repaired by
                         * bch2_check_alloc_key() which runs later:
                         */
                        if (!ca) {
                                bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0));
                                continue;
                        }

                        const struct bch_bucket_gens *g = bkey_s_c_to_bucket_gens(k).v;

                        for (u64 b = max_t(u64, ca->mi.first_bucket, start);
                             b < min_t(u64, ca->mi.nbuckets, end);
                             b++)
                                *bucket_gen(ca, b) = g->gens[b & KEY_TYPE_BUCKET_GENS_MASK];
                        0;
                }));
        } else {
                ret = for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN,
                                         BTREE_ITER_prefetch, k, ({
                        ca = bch2_dev_iterate(c, ca, k.k->p.inode);
                        /*
                         * Not a fsck error because this is checked/repaired by
                         * bch2_check_alloc_key() which runs later:
                         */
                        if (!ca) {
                                bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0));
                                continue;
                        }

                        if (k.k->p.offset < ca->mi.first_bucket) {
                                bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode, ca->mi.first_bucket));
                                continue;
                        }

                        if (k.k->p.offset >= ca->mi.nbuckets) {
                                bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0));
                                continue;
                        }

                        struct bch_alloc_v4 a;
                        *bucket_gen(ca, k.k->p.offset) = bch2_alloc_to_v4(k, &a)->gen;
                        0;
                }));
        }

        bch2_dev_put(ca);
        bch2_trans_put(trans);

        bch_err_fn(c, ret);
        return ret;
}

/* Free space/discard btree: */

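/*
 * The need_discard and freespace btrees are indexes over the alloc btree:
 * a bucket has a need_discard entry iff its data type is
 * BCH_DATA_need_discard, and a freespace entry (at a position that also
 * encodes generation bits, see alloc_freespace_pos()) iff it is
 * BCH_DATA_free. The helpers below report, and optionally repair,
 * mismatches between an alloc key and those indexes.
 */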
static int __need_discard_or_freespace_err(struct btree_trans *trans,
                                           struct bkey_s_c alloc_k,
                                           bool set, bool discard, bool repair)
{
        struct bch_fs *c = trans->c;
        enum bch_fsck_flags flags = FSCK_CAN_IGNORE|(repair ? FSCK_CAN_FIX : 0);
        enum bch_sb_error_id err_id = discard
                ? BCH_FSCK_ERR_need_discard_key_wrong
                : BCH_FSCK_ERR_freespace_key_wrong;
        enum btree_id btree = discard ? BTREE_ID_need_discard : BTREE_ID_freespace;
        struct printbuf buf = PRINTBUF;

        bch2_bkey_val_to_text(&buf, c, alloc_k);

        int ret = __bch2_fsck_err(NULL, trans, flags, err_id,
                                  "bucket incorrectly %sset in %s btree\n"
                                  "  %s",
                                  set ? "" : "un",
                                  bch2_btree_id_str(btree),
                                  buf.buf);
        if (ret == -BCH_ERR_fsck_ignore ||
            ret == -BCH_ERR_fsck_errors_not_fixed)
                ret = 0;

        printbuf_exit(&buf);
        return ret;
}

#define need_discard_or_freespace_err(...)              \
        fsck_err_wrap(__need_discard_or_freespace_err(__VA_ARGS__))

#define need_discard_or_freespace_err_on(cond, ...)             \
        (unlikely(cond) ? need_discard_or_freespace_err(__VA_ARGS__) : false)

static int bch2_bucket_do_index(struct btree_trans *trans,
                                struct bch_dev *ca,
                                struct bkey_s_c alloc_k,
                                const struct bch_alloc_v4 *a,
                                bool set)
{
        enum btree_id btree;
        struct bpos pos;

        if (a->data_type != BCH_DATA_free &&
            a->data_type != BCH_DATA_need_discard)
                return 0;

        switch (a->data_type) {
        case BCH_DATA_free:
                btree = BTREE_ID_freespace;
                pos = alloc_freespace_pos(alloc_k.k->p, *a);
                break;
        case BCH_DATA_need_discard:
                btree = BTREE_ID_need_discard;
                pos = alloc_k.k->p;
                break;
        default:
                return 0;
        }

        struct btree_iter iter;
        struct bkey_s_c old = bch2_bkey_get_iter(trans, &iter, btree, pos, BTREE_ITER_intent);
        int ret = bkey_err(old);
        if (ret)
                return ret;

        need_discard_or_freespace_err_on(ca->mi.freespace_initialized &&
                                         !old.k->type != set,
                                         trans, alloc_k, set,
                                         btree == BTREE_ID_need_discard, false);

        ret = bch2_btree_bit_mod_iter(trans, &iter, set);
fsck_err:
        bch2_trans_iter_exit(trans, &iter);
        return ret;
}

static noinline int bch2_bucket_gen_update(struct btree_trans *trans,
                                           struct bpos bucket, u8 gen)
{
        struct btree_iter iter;
        unsigned offset;
        struct bpos pos = alloc_gens_pos(bucket, &offset);
        struct bkey_i_bucket_gens *g;
        struct bkey_s_c k;
        int ret;

        g = bch2_trans_kmalloc(trans, sizeof(*g));
        ret = PTR_ERR_OR_ZERO(g);
        if (ret)
                return ret;

        k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_bucket_gens, pos,
                               BTREE_ITER_intent|
                               BTREE_ITER_with_updates);
        ret = bkey_err(k);
        if (ret)
                return ret;

        if (k.k->type != KEY_TYPE_bucket_gens) {
                bkey_bucket_gens_init(&g->k_i);
                g->k.p = iter.pos;
        } else {
                bkey_reassemble(&g->k_i, k);
        }

        g->v.gens[offset] = gen;

        ret = bch2_trans_update(trans, &iter, &g->k_i, 0);
        bch2_trans_iter_exit(trans, &iter);
        return ret;
}

static inline int bch2_dev_data_type_accounting_mod(struct btree_trans *trans, struct bch_dev *ca,
                                                    enum bch_data_type data_type,
                                                    s64 delta_buckets,
                                                    s64 delta_sectors,
                                                    s64 delta_fragmented, unsigned flags)
{
        struct disk_accounting_pos acc = {
                .type = BCH_DISK_ACCOUNTING_dev_data_type,
                .dev_data_type.dev              = ca->dev_idx,
                .dev_data_type.data_type        = data_type,
        };
        s64 d[3] = { delta_buckets, delta_sectors, delta_fragmented };

        return bch2_disk_accounting_mod(trans, &acc, d, 3, flags & BTREE_TRIGGER_gc);
}

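/*
 * Reconcile the per-device (dev, data_type) counters - the d[3] triple
 * above is { buckets, sectors, fragmented sectors } - with an alloc key
 * changing from *old to *new, including the synthetic BCH_DATA_unstriped
 * counter:
 */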
int bch2_alloc_key_to_dev_counters(struct btree_trans *trans, struct bch_dev *ca,
                                   const struct bch_alloc_v4 *old,
                                   const struct bch_alloc_v4 *new,
                                   unsigned flags)
{
        s64 old_sectors = bch2_bucket_sectors(*old);
        s64 new_sectors = bch2_bucket_sectors(*new);
        if (old->data_type != new->data_type) {
                int ret = bch2_dev_data_type_accounting_mod(trans, ca, new->data_type,
                                 1,  new_sectors,  bch2_bucket_sectors_fragmented(ca, *new), flags) ?:
                          bch2_dev_data_type_accounting_mod(trans, ca, old->data_type,
                                -1, -old_sectors, -bch2_bucket_sectors_fragmented(ca, *old), flags);
                if (ret)
                        return ret;
        } else if (old_sectors != new_sectors) {
                int ret = bch2_dev_data_type_accounting_mod(trans, ca, new->data_type,
                                         0,
                                         new_sectors - old_sectors,
                                         bch2_bucket_sectors_fragmented(ca, *new) -
                                         bch2_bucket_sectors_fragmented(ca, *old), flags);
                if (ret)
                        return ret;
        }

        s64 old_unstriped = bch2_bucket_sectors_unstriped(*old);
        s64 new_unstriped = bch2_bucket_sectors_unstriped(*new);
        if (old_unstriped != new_unstriped) {
                int ret = bch2_dev_data_type_accounting_mod(trans, ca, BCH_DATA_unstriped,
                                         !!new_unstriped - !!old_unstriped,
                                         new_unstriped - old_unstriped,
                                         0,
                                         flags);
                if (ret)
                        return ret;
        }

        return 0;
}

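/*
 * The alloc key trigger. The transactional phase keeps derived state in
 * sync: the need_discard/freespace indexes, the read and fragmentation
 * LRUs, the bucket_gens key and device counters. The atomic phase tracks
 * the journal sequence numbers of empty <-> nonempty transitions, so
 * buckets aren't reused before their contents are flushed, and kicks off
 * discards, cached-bucket invalidation and gc_gens as buckets change state:
 */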
int bch2_trigger_alloc(struct btree_trans *trans,
                       enum btree_id btree, unsigned level,
                       struct bkey_s_c old, struct bkey_s new,
                       enum btree_iter_update_trigger_flags flags)
{
        struct bch_fs *c = trans->c;
        struct printbuf buf = PRINTBUF;
        int ret = 0;

        struct bch_dev *ca = bch2_dev_bucket_tryget(c, new.k->p);
        if (!ca)
                return -EIO;

        struct bch_alloc_v4 old_a_convert;
        const struct bch_alloc_v4 *old_a = bch2_alloc_to_v4(old, &old_a_convert);

        struct bch_alloc_v4 *new_a;
        if (likely(new.k->type == KEY_TYPE_alloc_v4)) {
                new_a = bkey_s_to_alloc_v4(new).v;
        } else {
                BUG_ON(!(flags & (BTREE_TRIGGER_gc|BTREE_TRIGGER_check_repair)));

                struct bkey_i_alloc_v4 *new_ka = bch2_alloc_to_v4_mut_inlined(trans, new.s_c);
                ret = PTR_ERR_OR_ZERO(new_ka);
                if (unlikely(ret))
                        goto err;
                new_a = &new_ka->v;
        }

        if (flags & BTREE_TRIGGER_transactional) {
                alloc_data_type_set(new_a, new_a->data_type);

                int is_empty_delta = (int) data_type_is_empty(new_a->data_type) -
                                     (int) data_type_is_empty(old_a->data_type);

                if (is_empty_delta < 0) {
                        new_a->io_time[READ] = bch2_current_io_time(c, READ);
                        new_a->io_time[WRITE] = bch2_current_io_time(c, WRITE);
                        SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, true);
                        SET_BCH_ALLOC_V4_NEED_DISCARD(new_a, true);
                }

                if (data_type_is_empty(new_a->data_type) &&
                    BCH_ALLOC_V4_NEED_INC_GEN(new_a) &&
                    !bch2_bucket_is_open_safe(c, new.k->p.inode, new.k->p.offset)) {
                        new_a->gen++;
                        SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, false);
                        alloc_data_type_set(new_a, new_a->data_type);
                }

                if (old_a->data_type != new_a->data_type ||
                    (new_a->data_type == BCH_DATA_free &&
                     alloc_freespace_genbits(*old_a) != alloc_freespace_genbits(*new_a))) {
                        ret =   bch2_bucket_do_index(trans, ca, old, old_a, false) ?:
                                bch2_bucket_do_index(trans, ca, new.s_c, new_a, true);
                        if (ret)
                                goto err;
                }

                if (new_a->data_type == BCH_DATA_cached &&
                    !new_a->io_time[READ])
                        new_a->io_time[READ] = bch2_current_io_time(c, READ);

                u64 old_lru = alloc_lru_idx_read(*old_a);
                u64 new_lru = alloc_lru_idx_read(*new_a);
                if (old_lru != new_lru) {
                        ret = bch2_lru_change(trans, new.k->p.inode,
                                              bucket_to_u64(new.k->p),
                                              old_lru, new_lru);
                        if (ret)
                                goto err;
                }

                old_lru = alloc_lru_idx_fragmentation(*old_a, ca);
                new_lru = alloc_lru_idx_fragmentation(*new_a, ca);
                if (old_lru != new_lru) {
                        ret = bch2_lru_change(trans,
                                        BCH_LRU_FRAGMENTATION_START,
                                        bucket_to_u64(new.k->p),
                                        old_lru, new_lru);
                        if (ret)
                                goto err;
                }

                if (old_a->gen != new_a->gen) {
                        ret = bch2_bucket_gen_update(trans, new.k->p, new_a->gen);
                        if (ret)
                                goto err;
                }

                if ((flags & BTREE_TRIGGER_bucket_invalidate) &&
                    old_a->cached_sectors) {
                        ret = bch2_mod_dev_cached_sectors(trans, ca->dev_idx,
                                         -((s64) old_a->cached_sectors),
                                         flags & BTREE_TRIGGER_gc);
                        if (ret)
                                goto err;
                }

                ret = bch2_alloc_key_to_dev_counters(trans, ca, old_a, new_a, flags);
                if (ret)
                        goto err;
        }

        if ((flags & BTREE_TRIGGER_atomic) && (flags & BTREE_TRIGGER_insert)) {
                u64 transaction_seq = trans->journal_res.seq;
                BUG_ON(!transaction_seq);

                if (log_fsck_err_on(transaction_seq && new_a->journal_seq_nonempty > transaction_seq,
                                    trans, alloc_key_journal_seq_in_future,
                                    "bucket journal seq in future (currently at %llu)\n%s",
                                    journal_cur_seq(&c->journal),
                                    (bch2_bkey_val_to_text(&buf, c, new.s_c), buf.buf)))
                        new_a->journal_seq_nonempty = transaction_seq;

                int is_empty_delta = (int) data_type_is_empty(new_a->data_type) -
                                     (int) data_type_is_empty(old_a->data_type);

                /*
                 * Record journal sequence number of empty -> nonempty transition:
                 * Note that there may be multiple empty -> nonempty
                 * transitions, data in a bucket may be overwritten while we're
                 * still writing to it - so be careful to only record the first:
                 */
                if (is_empty_delta < 0 &&
                    new_a->journal_seq_empty <= c->journal.flushed_seq_ondisk) {
                        new_a->journal_seq_nonempty     = transaction_seq;
                        new_a->journal_seq_empty        = 0;
                }

                /*
                 * Bucket becomes empty: mark it as waiting for a journal flush,
                 * unless updates since empty -> nonempty transition were never
                 * flushed - we may need to ask the journal not to flush
                 * intermediate sequence numbers:
                 */
                if (is_empty_delta > 0) {
                        if (new_a->journal_seq_nonempty == transaction_seq ||
                            bch2_journal_noflush_seq(&c->journal,
                                                     new_a->journal_seq_nonempty,
                                                     transaction_seq)) {
                                new_a->journal_seq_nonempty = new_a->journal_seq_empty = 0;
                        } else {
                                new_a->journal_seq_empty = transaction_seq;

                                ret = bch2_set_bucket_needs_journal_commit(&c->buckets_waiting_for_journal,
                                                                           c->journal.flushed_seq_ondisk,
                                                                           new.k->p.inode, new.k->p.offset,
                                                                           transaction_seq);
                                if (bch2_fs_fatal_err_on(ret, c,
                                                "setting bucket_needs_journal_commit: %s",
                                                bch2_err_str(ret)))
                                        goto err;
                        }
                }

                if (new_a->gen != old_a->gen) {
                        rcu_read_lock();
                        u8 *gen = bucket_gen(ca, new.k->p.offset);
                        if (unlikely(!gen)) {
                                rcu_read_unlock();
                                goto invalid_bucket;
                        }
                        *gen = new_a->gen;
                        rcu_read_unlock();
                }

#define eval_state(_a, expr)            ({ const struct bch_alloc_v4 *a = _a; expr; })
#define statechange(expr)               !eval_state(old_a, expr) && eval_state(new_a, expr)
#define bucket_flushed(a)               (a->journal_seq_empty <= c->journal.flushed_seq_ondisk)

                if (statechange(a->data_type == BCH_DATA_free) &&
                    bucket_flushed(new_a))
                        closure_wake_up(&c->freelist_wait);

                if (statechange(a->data_type == BCH_DATA_need_discard) &&
                    !bch2_bucket_is_open_safe(c, new.k->p.inode, new.k->p.offset) &&
                    bucket_flushed(new_a))
                        bch2_discard_one_bucket_fast(ca, new.k->p.offset);

                if (statechange(a->data_type == BCH_DATA_cached) &&
                    !bch2_bucket_is_open(c, new.k->p.inode, new.k->p.offset) &&
                    should_invalidate_buckets(ca, bch2_dev_usage_read(ca)))
                        bch2_dev_do_invalidates(ca);

                if (statechange(a->data_type == BCH_DATA_need_gc_gens))
                        bch2_gc_gens_async(c);
        }

        if ((flags & BTREE_TRIGGER_gc) && (flags & BTREE_TRIGGER_insert)) {
                rcu_read_lock();
                struct bucket *g = gc_bucket(ca, new.k->p.offset);
                if (unlikely(!g)) {
                        rcu_read_unlock();
                        goto invalid_bucket;
                }
                g->gen_valid    = 1;
                g->gen          = new_a->gen;
                rcu_read_unlock();
        }
err:
fsck_err:
        printbuf_exit(&buf);
        bch2_dev_put(ca);
        return ret;
invalid_bucket:
        bch2_fs_inconsistent(c, "reference to invalid bucket\n  %s",
                             (bch2_bkey_val_to_text(&buf, c, new.s_c), buf.buf));
        ret = -EIO;
        goto err;
}

/*
 * This synthesizes deleted extents for holes, similar to BTREE_ITER_slots for
 * extents style btrees, but works on non-extents btrees:
 */
static struct bkey_s_c bch2_get_key_or_hole(struct btree_iter *iter, struct bpos end, struct bkey *hole)
{
        struct bkey_s_c k = bch2_btree_iter_peek_slot(iter);

        if (bkey_err(k))
                return k;

        if (k.k->type) {
                return k;
        } else {
                struct btree_iter iter2;
                struct bpos next;

                bch2_trans_copy_iter(&iter2, iter);

                struct btree_path *path = btree_iter_path(iter->trans, iter);
                if (!bpos_eq(path->l[0].b->key.k.p, SPOS_MAX))
                        end = bkey_min(end, bpos_nosnap_successor(path->l[0].b->key.k.p));

                end = bkey_min(end, POS(iter->pos.inode, iter->pos.offset + U32_MAX - 1));

                /*
                 * btree node min/max is a closed interval, upto takes a half
                 * open interval:
                 */
                k = bch2_btree_iter_peek_max(&iter2, end);
                next = iter2.pos;
                bch2_trans_iter_exit(iter->trans, &iter2);

                BUG_ON(next.offset >= iter->pos.offset + U32_MAX);

                if (bkey_err(k))
                        return k;

                bkey_init(hole);
                hole->p = iter->pos;

                bch2_key_resize(hole, next.offset - iter->pos.offset);
                return (struct bkey_s_c) { hole, NULL };
        }
}

static bool next_bucket(struct bch_fs *c, struct bch_dev **ca, struct bpos *bucket)
{
        if (*ca) {
                if (bucket->offset < (*ca)->mi.first_bucket)
                        bucket->offset = (*ca)->mi.first_bucket;

                if (bucket->offset < (*ca)->mi.nbuckets)
                        return true;

                bch2_dev_put(*ca);
                *ca = NULL;
                bucket->inode++;
                bucket->offset = 0;
        }

        rcu_read_lock();
        *ca = __bch2_next_dev_idx(c, bucket->inode, NULL);
        if (*ca) {
                *bucket = POS((*ca)->dev_idx, (*ca)->mi.first_bucket);
                bch2_dev_get(*ca);
        }
        rcu_read_unlock();

        return *ca != NULL;
}

static struct bkey_s_c bch2_get_key_or_real_bucket_hole(struct btree_iter *iter,
                                        struct bch_dev **ca, struct bkey *hole)
{
        struct bch_fs *c = iter->trans->c;
        struct bkey_s_c k;
again:
        k = bch2_get_key_or_hole(iter, POS_MAX, hole);
        if (bkey_err(k))
                return k;

        *ca = bch2_dev_iterate_noerror(c, *ca, k.k->p.inode);

        if (!k.k->type) {
                struct bpos hole_start = bkey_start_pos(k.k);

                if (!*ca || !bucket_valid(*ca, hole_start.offset)) {
                        if (!next_bucket(c, ca, &hole_start))
                                return bkey_s_c_null;

                        bch2_btree_iter_set_pos(iter, hole_start);
                        goto again;
                }

                if (k.k->p.offset > (*ca)->mi.nbuckets)
                        bch2_key_resize(hole, (*ca)->mi.nbuckets - hole_start.offset);
        }

        return k;
}

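/*
 * fsck: check one alloc key against the need_discard, freespace and
 * bucket_gens btrees, repairing those indexes to match the alloc key:
 */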
static noinline_for_stack
int bch2_check_alloc_key(struct btree_trans *trans,
                         struct bkey_s_c alloc_k,
                         struct btree_iter *alloc_iter,
                         struct btree_iter *discard_iter,
                         struct btree_iter *freespace_iter,
                         struct btree_iter *bucket_gens_iter)
{
        struct bch_fs *c = trans->c;
        struct bch_alloc_v4 a_convert;
        const struct bch_alloc_v4 *a;
        unsigned gens_offset;
        struct bkey_s_c k;
        struct printbuf buf = PRINTBUF;
        int ret = 0;

        struct bch_dev *ca = bch2_dev_bucket_tryget_noerror(c, alloc_k.k->p);
        if (fsck_err_on(!ca,
                        trans, alloc_key_to_missing_dev_bucket,
                        "alloc key for invalid device:bucket %llu:%llu",
                        alloc_k.k->p.inode, alloc_k.k->p.offset))
                ret = bch2_btree_delete_at(trans, alloc_iter, 0);
        if (!ca)
                return ret;

        if (!ca->mi.freespace_initialized)
                goto out;

        a = bch2_alloc_to_v4(alloc_k, &a_convert);

        bch2_btree_iter_set_pos(discard_iter, alloc_k.k->p);
        k = bch2_btree_iter_peek_slot(discard_iter);
        ret = bkey_err(k);
        if (ret)
                goto err;

        bool is_discarded = a->data_type == BCH_DATA_need_discard;
        if (need_discard_or_freespace_err_on(!!k.k->type != is_discarded,
                                             trans, alloc_k, !is_discarded, true, true)) {
                ret = bch2_btree_bit_mod_iter(trans, discard_iter, is_discarded);
                if (ret)
                        goto err;
        }

        bch2_btree_iter_set_pos(freespace_iter, alloc_freespace_pos(alloc_k.k->p, *a));
        k = bch2_btree_iter_peek_slot(freespace_iter);
        ret = bkey_err(k);
        if (ret)
                goto err;

        bool is_free = a->data_type == BCH_DATA_free;
        if (need_discard_or_freespace_err_on(!!k.k->type != is_free,
                                             trans, alloc_k, !is_free, false, true)) {
                ret = bch2_btree_bit_mod_iter(trans, freespace_iter, is_free);
                if (ret)
                        goto err;
        }

        bch2_btree_iter_set_pos(bucket_gens_iter, alloc_gens_pos(alloc_k.k->p, &gens_offset));
        k = bch2_btree_iter_peek_slot(bucket_gens_iter);
        ret = bkey_err(k);
        if (ret)
                goto err;

        if (fsck_err_on(a->gen != alloc_gen(k, gens_offset),
                        trans, bucket_gens_key_wrong,
                        "incorrect gen in bucket_gens btree (got %u should be %u)\n"
                        "  %s",
                        alloc_gen(k, gens_offset), a->gen,
                        (printbuf_reset(&buf),
                         bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
                struct bkey_i_bucket_gens *g =
                        bch2_trans_kmalloc(trans, sizeof(*g));

                ret = PTR_ERR_OR_ZERO(g);
                if (ret)
                        goto err;

                if (k.k->type == KEY_TYPE_bucket_gens) {
                        bkey_reassemble(&g->k_i, k);
                } else {
                        bkey_bucket_gens_init(&g->k_i);
                        g->k.p = alloc_gens_pos(alloc_k.k->p, &gens_offset);
                }

                g->v.gens[gens_offset] = a->gen;

                ret = bch2_trans_update(trans, bucket_gens_iter, &g->k_i, 0);
                if (ret)
                        goto err;
        }
out:
err:
fsck_err:
        bch2_dev_put(ca);
        printbuf_exit(&buf);
        return ret;
}

static noinline_for_stack
int bch2_check_alloc_hole_freespace(struct btree_trans *trans,
                                    struct bch_dev *ca,
                                    struct bpos start,
                                    struct bpos *end,
                                    struct btree_iter *freespace_iter)
{
        struct bkey_s_c k;
        struct printbuf buf = PRINTBUF;
        int ret;

        if (!ca->mi.freespace_initialized)
                return 0;

        bch2_btree_iter_set_pos(freespace_iter, start);

        k = bch2_btree_iter_peek_slot(freespace_iter);
        ret = bkey_err(k);
        if (ret)
                goto err;

        *end = bkey_min(k.k->p, *end);

        if (fsck_err_on(k.k->type != KEY_TYPE_set,
                        trans, freespace_hole_missing,
                        "hole in alloc btree missing in freespace btree\n"
                        "  device %llu buckets %llu-%llu",
                        freespace_iter->pos.inode,
                        freespace_iter->pos.offset,
                        end->offset)) {
                struct bkey_i *update =
                        bch2_trans_kmalloc(trans, sizeof(*update));

                ret = PTR_ERR_OR_ZERO(update);
                if (ret)
                        goto err;

                bkey_init(&update->k);
                update->k.type  = KEY_TYPE_set;
                update->k.p     = freespace_iter->pos;
                bch2_key_resize(&update->k,
                                min_t(u64, U32_MAX, end->offset -
                                      freespace_iter->pos.offset));

                ret = bch2_trans_update(trans, freespace_iter, update, 0);
                if (ret)
                        goto err;
        }
err:
fsck_err:
        printbuf_exit(&buf);
        return ret;
}

1296 static noinline_for_stack
1297 int bch2_check_alloc_hole_bucket_gens(struct btree_trans *trans,
1298                                       struct bpos start,
1299                                       struct bpos *end,
1300                                       struct btree_iter *bucket_gens_iter)
1301 {
1302         struct bkey_s_c k;
1303         struct printbuf buf = PRINTBUF;
1304         unsigned i, gens_offset, gens_end_offset;
1305         int ret;
1306
1307         bch2_btree_iter_set_pos(bucket_gens_iter, alloc_gens_pos(start, &gens_offset));
1308
1309         k = bch2_btree_iter_peek_slot(bucket_gens_iter);
1310         ret = bkey_err(k);
1311         if (ret)
1312                 goto err;
1313
1314         if (bkey_cmp(alloc_gens_pos(start, &gens_offset),
1315                      alloc_gens_pos(*end,  &gens_end_offset)))
1316                 gens_end_offset = KEY_TYPE_BUCKET_GENS_NR;
1317
1318         if (k.k->type == KEY_TYPE_bucket_gens) {
1319                 struct bkey_i_bucket_gens g;
1320                 bool need_update = false;
1321
1322                 bkey_reassemble(&g.k_i, k);
1323
1324                 for (i = gens_offset; i < gens_end_offset; i++) {
1325                         if (fsck_err_on(g.v.gens[i], trans,
1326                                         bucket_gens_hole_wrong,
1327                                         "hole in alloc btree at %llu:%llu with nonzero gen in bucket_gens btree (%u)",
1328                                         bucket_gens_pos_to_alloc(k.k->p, i).inode,
1329                                         bucket_gens_pos_to_alloc(k.k->p, i).offset,
1330                                         g.v.gens[i])) {
1331                                 g.v.gens[i] = 0;
1332                                 need_update = true;
1333                         }
1334                 }
1335
1336                 if (need_update) {
1337                         struct bkey_i *u = bch2_trans_kmalloc(trans, sizeof(g));
1338
1339                         ret = PTR_ERR_OR_ZERO(u);
1340                         if (ret)
1341                                 goto err;
1342
1343                         memcpy(u, &g, sizeof(g));
1344
1345                         ret = bch2_trans_update(trans, bucket_gens_iter, u, 0);
1346                         if (ret)
1347                                 goto err;
1348                 }
1349         }
1350
1351         *end = bkey_min(*end, bucket_gens_pos_to_alloc(bpos_nosnap_successor(k.k->p), 0));
1352 err:
1353 fsck_err:
1354         printbuf_exit(&buf);
1355         return ret;
1356 }
1357
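/*
 * Deferred repair for bad need_discard/freespace keys: when such a key is
 * found from the allocator path we can't repair it synchronously (the
 * transaction commit would recurse back into the allocator), so a work item
 * rechecks and repairs the key from the write_ref workqueue instead:
 */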
1358 struct check_discard_freespace_key_async {
1359         struct work_struct      work;
1360         struct bch_fs           *c;
1361         struct bbpos            pos;
1362 };
1363
1364 static int bch2_recheck_discard_freespace_key(struct btree_trans *trans, struct bbpos pos)
1365 {
1366         struct btree_iter iter;
1367         struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, pos.btree, pos.pos, 0);
1368         int ret = bkey_err(k);
1369         if (ret)
1370                 return ret;
1371
1372         u8 gen;
1373         ret = k.k->type != KEY_TYPE_set
1374                 ? bch2_check_discard_freespace_key(trans, &iter, &gen, false)
1375                 : 0;
1376         bch2_trans_iter_exit(trans, &iter);
1377         return ret;
1378 }
1379
1380 static void check_discard_freespace_key_work(struct work_struct *work)
1381 {
1382         struct check_discard_freespace_key_async *w =
1383                 container_of(work, struct check_discard_freespace_key_async, work);
1384
1385         bch2_trans_do(w->c, bch2_recheck_discard_freespace_key(trans, w->pos));
1386         bch2_write_ref_put(w->c, BCH_WRITE_REF_check_discard_freespace_key);
1387         kfree(w);
1388 }
1389
1390 int bch2_check_discard_freespace_key(struct btree_trans *trans, struct btree_iter *iter, u8 *gen,
1391                                      bool async_repair)
1392 {
1393         struct bch_fs *c = trans->c;
1394         enum bch_data_type state = iter->btree_id == BTREE_ID_need_discard
1395                 ? BCH_DATA_need_discard
1396                 : BCH_DATA_free;
1397         struct printbuf buf = PRINTBUF;
1398
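        /*
         * need_discard/freespace keys encode the bucket number in the low 56
         * bits of the key offset; freespace keys additionally carry the
         * generation bits in the high 8 bits, which must match
         * alloc_freespace_genbits() of the corresponding alloc key:
         */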
1399         struct bpos bucket = iter->pos;
1400         bucket.offset &= ~(~0ULL << 56);
1401         u64 genbits = iter->pos.offset & (~0ULL << 56);
1402
1403         struct btree_iter alloc_iter;
1404         struct bkey_s_c alloc_k = bch2_bkey_get_iter(trans, &alloc_iter,
1405                                                      BTREE_ID_alloc, bucket,
1406                                                      async_repair ? BTREE_ITER_cached : 0);
1407         int ret = bkey_err(alloc_k);
1408         if (ret)
1409                 return ret;
1410
1411         if (!bch2_dev_bucket_exists(c, bucket)) {
1412                 if (fsck_err(trans, need_discard_freespace_key_to_invalid_dev_bucket,
1413                              "entry in %s btree for nonexistent dev:bucket %llu:%llu",
1414                              bch2_btree_id_str(iter->btree_id), bucket.inode, bucket.offset))
1415                         goto delete;
1416                 ret = 1;
1417                 goto out;
1418         }
1419
1420         struct bch_alloc_v4 a_convert;
1421         const struct bch_alloc_v4 *a = bch2_alloc_to_v4(alloc_k, &a_convert);
1422
1423         if (a->data_type != state ||
1424             (state == BCH_DATA_free &&
1425              genbits != alloc_freespace_genbits(*a))) {
1426                 if (fsck_err(trans, need_discard_freespace_key_bad,
1427                              "%s\n  incorrectly set at %s:%llu:%llu:0 (free %u, genbits %llu should be %llu)",
1428                              (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf),
1429                              bch2_btree_id_str(iter->btree_id),
1430                              iter->pos.inode,
1431                              iter->pos.offset,
1432                              a->data_type == state,
1433                              genbits >> 56, alloc_freespace_genbits(*a) >> 56))
1434                         goto delete;
1435                 ret = 1;
1436                 goto out;
1437         }
1438
1439         *gen = a->gen;
1440 out:
1441 fsck_err:
1442         bch2_set_btree_iter_dontneed(&alloc_iter);
1443         bch2_trans_iter_exit(trans, &alloc_iter);
1444         printbuf_exit(&buf);
1445         return ret;
1446 delete:
1447         if (!async_repair) {
1448                 ret =   bch2_btree_bit_mod_iter(trans, iter, false) ?:
1449                         bch2_trans_commit(trans, NULL, NULL,
1450                                 BCH_TRANS_COMMIT_no_enospc) ?:
1451                         -BCH_ERR_transaction_restart_commit;
1452                 goto out;
1453         } else {
1454                 /*
1455                  * We can't repair here when called from the allocator path: the
1456                  * commit will recurse back into the allocator
1457                  */
1458                 struct check_discard_freespace_key_async *w =
1459                         kzalloc(sizeof(*w), GFP_KERNEL);
1460                 if (!w)
1461                         goto out;
1462
1463                 if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_check_discard_freespace_key)) {
1464                         kfree(w);
1465                         goto out;
1466                 }
1467
1468                 INIT_WORK(&w->work, check_discard_freespace_key_work);
1469                 w->c = c;
1470                 w->pos = BBPOS(iter->btree_id, iter->pos);
1471                 queue_work(c->write_ref_wq, &w->work);
1472                 goto out;
1473         }
1474 }
1475
1476 static int bch2_check_discard_freespace_key_fsck(struct btree_trans *trans, struct btree_iter *iter)
1477 {
1478         u8 gen;
1479         int ret = bch2_check_discard_freespace_key(trans, iter, &gen, false);
1480         return ret < 0 ? ret : 0;
1481 }
1482
1483 /*
1484  * We've already checked that generation numbers in the bucket_gens btree are
1485  * valid for buckets that exist; this just checks for keys for nonexistent
1486  * buckets.
1487  */
1488 static noinline_for_stack
1489 int bch2_check_bucket_gens_key(struct btree_trans *trans,
1490                                struct btree_iter *iter,
1491                                struct bkey_s_c k)
1492 {
1493         struct bch_fs *c = trans->c;
1494         struct bkey_i_bucket_gens g;
1495         u64 start = bucket_gens_pos_to_alloc(k.k->p, 0).offset;
1496         u64 end = bucket_gens_pos_to_alloc(bpos_nosnap_successor(k.k->p), 0).offset;
1497         u64 b;
1498         bool need_update = false;
1499         struct printbuf buf = PRINTBUF;
1500         int ret = 0;
1501
1502         BUG_ON(k.k->type != KEY_TYPE_bucket_gens);
1503         bkey_reassemble(&g.k_i, k);
1504
1505         struct bch_dev *ca = bch2_dev_tryget_noerror(c, k.k->p.inode);
1506         if (!ca) {
1507                 if (fsck_err(trans, bucket_gens_to_invalid_dev,
1508                              "bucket_gens key for invalid device:\n  %s",
1509                              (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
1510                         ret = bch2_btree_delete_at(trans, iter, 0);
1511                 goto out;
1512         }
1513
1514         if (fsck_err_on(end <= ca->mi.first_bucket ||
1515                         start >= ca->mi.nbuckets,
1516                         trans, bucket_gens_to_invalid_buckets,
1517                         "bucket_gens key for invalid buckets:\n  %s",
1518                         (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
1519                 ret = bch2_btree_delete_at(trans, iter, 0);
1520                 goto out;
1521         }
1522
1523         for (b = start; b < ca->mi.first_bucket; b++)
1524                 if (fsck_err_on(g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK],
1525                                 trans, bucket_gens_nonzero_for_invalid_buckets,
1526                                 "bucket_gens key has nonzero gen for invalid bucket")) {
1527                         g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK] = 0;
1528                         need_update = true;
1529                 }
1530
1531         for (b = ca->mi.nbuckets; b < end; b++)
1532                 if (fsck_err_on(g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK],
1533                                 trans, bucket_gens_nonzero_for_invalid_buckets,
1534                                 "bucket_gens key has nonzero gen for invalid bucket")) {
1535                         g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK] = 0;
1536                         need_update = true;
1537                 }
1538
1539         if (need_update) {
1540                 struct bkey_i *u = bch2_trans_kmalloc(trans, sizeof(g));
1541
1542                 ret = PTR_ERR_OR_ZERO(u);
1543                 if (ret)
1544                         goto out;
1545
1546                 memcpy(u, &g, sizeof(g));
1547                 ret = bch2_trans_update(trans, iter, u, 0);
1548         }
1549 out:
1550 fsck_err:
1551         bch2_dev_put(ca);
1552         printbuf_exit(&buf);
1553         return ret;
1554 }
1555
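/*
 * Walk the alloc btree in one pass, with parallel iterators on the
 * need_discard, freespace and bucket_gens btrees, checking that all four
 * agree for both live keys and holes; then check the secondary btrees for
 * stray keys in the other direction:
 */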
1556 int bch2_check_alloc_info(struct bch_fs *c)
1557 {
1558         struct btree_trans *trans = bch2_trans_get(c);
1559         struct btree_iter iter, discard_iter, freespace_iter, bucket_gens_iter;
1560         struct bch_dev *ca = NULL;
1561         struct bkey hole;
1562         struct bkey_s_c k;
1563         int ret = 0;
1564
1565         bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, POS_MIN,
1566                              BTREE_ITER_prefetch);
1567         bch2_trans_iter_init(trans, &discard_iter, BTREE_ID_need_discard, POS_MIN,
1568                              BTREE_ITER_prefetch);
1569         bch2_trans_iter_init(trans, &freespace_iter, BTREE_ID_freespace, POS_MIN,
1570                              BTREE_ITER_prefetch);
1571         bch2_trans_iter_init(trans, &bucket_gens_iter, BTREE_ID_bucket_gens, POS_MIN,
1572                              BTREE_ITER_prefetch);
1573
1574         while (1) {
1575                 struct bpos next;
1576
1577                 bch2_trans_begin(trans);
1578
1579                 k = bch2_get_key_or_real_bucket_hole(&iter, &ca, &hole);
1580                 ret = bkey_err(k);
1581                 if (ret)
1582                         goto bkey_err;
1583
1584                 if (!k.k)
1585                         break;
1586
1587                 if (k.k->type) {
1588                         next = bpos_nosnap_successor(k.k->p);
1589
1590                         ret = bch2_check_alloc_key(trans,
1591                                                    k, &iter,
1592                                                    &discard_iter,
1593                                                    &freespace_iter,
1594                                                    &bucket_gens_iter);
1595                         if (ret)
1596                                 goto bkey_err;
1597                 } else {
1598                         next = k.k->p;
1599
1600                         ret = bch2_check_alloc_hole_freespace(trans, ca,
1601                                                     bkey_start_pos(k.k),
1602                                                     &next,
1603                                                     &freespace_iter) ?:
1604                                 bch2_check_alloc_hole_bucket_gens(trans,
1605                                                     bkey_start_pos(k.k),
1606                                                     &next,
1607                                                     &bucket_gens_iter);
1608                         if (ret)
1609                                 goto bkey_err;
1610                 }
1611
1612                 ret = bch2_trans_commit(trans, NULL, NULL,
1613                                         BCH_TRANS_COMMIT_no_enospc);
1614                 if (ret)
1615                         goto bkey_err;
1616
1617                 bch2_btree_iter_set_pos(&iter, next);
1618 bkey_err:
1619                 if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
1620                         continue;
1621                 if (ret)
1622                         break;
1623         }
1624         bch2_trans_iter_exit(trans, &bucket_gens_iter);
1625         bch2_trans_iter_exit(trans, &freespace_iter);
1626         bch2_trans_iter_exit(trans, &discard_iter);
1627         bch2_trans_iter_exit(trans, &iter);
1628         bch2_dev_put(ca);
1629         ca = NULL;
1630
1631         if (ret < 0)
1632                 goto err;
1633
1634         ret = for_each_btree_key(trans, iter,
1635                         BTREE_ID_need_discard, POS_MIN,
1636                         BTREE_ITER_prefetch, k,
1637                 bch2_check_discard_freespace_key_fsck(trans, &iter));
1638         if (ret)
1639                 goto err;
1640
1641         bch2_trans_iter_init(trans, &iter, BTREE_ID_freespace, POS_MIN,
1642                              BTREE_ITER_prefetch);
1643         while (1) {
1644                 bch2_trans_begin(trans);
1645                 k = bch2_btree_iter_peek(&iter);
1646                 if (!k.k)
1647                         break;
1648
1649                 ret = bkey_err(k) ?:
1650                         bch2_check_discard_freespace_key_fsck(trans, &iter);
1651                 if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
1652                         ret = 0;
1653                         continue;
1654                 }
1655                 if (ret) {
1656                         struct printbuf buf = PRINTBUF;
1657                         bch2_bkey_val_to_text(&buf, c, k);
1658
1659                         bch_err(c, "while checking %s", buf.buf);
1660                         printbuf_exit(&buf);
1661                         break;
1662                 }
1663
1664                 bch2_btree_iter_set_pos(&iter, bpos_nosnap_successor(iter.pos));
1665         }
1666         bch2_trans_iter_exit(trans, &iter);
1667         if (ret)
1668                 goto err;
1669
1670         ret = for_each_btree_key_commit(trans, iter,
1671                         BTREE_ID_bucket_gens, POS_MIN,
1672                         BTREE_ITER_prefetch, k,
1673                         NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
1674                 bch2_check_bucket_gens_key(trans, &iter, k));
1675 err:
1676         bch2_trans_put(trans);
1677         bch_err_fn(c, ret);
1678         return ret;
1679 }
1680
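/*
 * Check that an alloc key has the lru entries it should: a fragmentation lru
 * entry when alloc_lru_idx_fragmentation() is nonzero, and, for cached
 * buckets, an entry in the device's read-time lru keyed off io_time[READ]:
 */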
1681 static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans,
1682                                        struct btree_iter *alloc_iter,
1683                                        struct bkey_buf *last_flushed)
1684 {
1685         struct bch_fs *c = trans->c;
1686         struct bch_alloc_v4 a_convert;
1687         const struct bch_alloc_v4 *a;
1688         struct bkey_s_c alloc_k;
1689         struct printbuf buf = PRINTBUF;
1690         int ret;
1691
1692         alloc_k = bch2_btree_iter_peek(alloc_iter);
1693         if (!alloc_k.k)
1694                 return 0;
1695
1696         ret = bkey_err(alloc_k);
1697         if (ret)
1698                 return ret;
1699
1700         struct bch_dev *ca = bch2_dev_tryget_noerror(c, alloc_k.k->p.inode);
1701         if (!ca)
1702                 return 0;
1703
1704         a = bch2_alloc_to_v4(alloc_k, &a_convert);
1705
1706         u64 lru_idx = alloc_lru_idx_fragmentation(*a, ca);
1707         if (lru_idx) {
1708                 ret = bch2_lru_check_set(trans, BCH_LRU_FRAGMENTATION_START,
1709                                          lru_idx, alloc_k, last_flushed);
1710                 if (ret)
1711                         goto err;
1712         }
1713
1714         if (a->data_type != BCH_DATA_cached)
1715                 goto err;
1716
1717         if (fsck_err_on(!a->io_time[READ],
1718                         trans, alloc_key_cached_but_read_time_zero,
1719                         "cached bucket with read_time 0\n"
1720                         "  %s",
1721                 (printbuf_reset(&buf),
1722                  bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
1723                 struct bkey_i_alloc_v4 *a_mut =
1724                         bch2_alloc_to_v4_mut(trans, alloc_k);
1725                 ret = PTR_ERR_OR_ZERO(a_mut);
1726                 if (ret)
1727                         goto err;
1728
1729                 a_mut->v.io_time[READ] = bch2_current_io_time(c, READ);
1730                 ret = bch2_trans_update(trans, alloc_iter,
1731                                         &a_mut->k_i, BTREE_TRIGGER_norun);
1732                 if (ret)
1733                         goto err;
1734
1735                 a = &a_mut->v;
1736         }
1737
1738         ret = bch2_lru_check_set(trans, alloc_k.k->p.inode, a->io_time[READ],
1739                                  alloc_k, last_flushed);
1740         if (ret)
1741                 goto err;
1742 err:
1743 fsck_err:
1744         bch2_dev_put(ca);
1745         printbuf_exit(&buf);
1746         return ret;
1747 }
1748
1749 int bch2_check_alloc_to_lru_refs(struct bch_fs *c)
1750 {
1751         struct bkey_buf last_flushed;
1752
1753         bch2_bkey_buf_init(&last_flushed);
1754         bkey_init(&last_flushed.k->k);
1755
1756         int ret = bch2_trans_run(c,
1757                 for_each_btree_key_commit(trans, iter, BTREE_ID_alloc,
1758                                 POS_MIN, BTREE_ITER_prefetch, k,
1759                                 NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
1760                         bch2_check_alloc_to_lru_ref(trans, &iter, &last_flushed)));
1761
1762         bch2_bkey_buf_exit(&last_flushed, c);
1763         bch_err_fn(c, ret);
1764         return ret;
1765 }
1766
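/*
 * Track buckets with a discard in flight, so the normal and fast discard
 * paths never operate on the same bucket concurrently; returns an EEXIST
 * error if the bucket is already queued:
 */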
1767 static int discard_in_flight_add(struct bch_dev *ca, u64 bucket, bool in_progress)
1768 {
1769         int ret;
1770
1771         mutex_lock(&ca->discard_buckets_in_flight_lock);
1772         darray_for_each(ca->discard_buckets_in_flight, i)
1773                 if (i->bucket == bucket) {
1774                         ret = -BCH_ERR_EEXIST_discard_in_flight_add;
1775                         goto out;
1776                 }
1777
1778         ret = darray_push(&ca->discard_buckets_in_flight, ((struct discard_in_flight) {
1779                            .in_progress = in_progress,
1780                            .bucket      = bucket,
1781         }));
1782 out:
1783         mutex_unlock(&ca->discard_buckets_in_flight_lock);
1784         return ret;
1785 }
1786
1787 static void discard_in_flight_remove(struct bch_dev *ca, u64 bucket)
1788 {
1789         mutex_lock(&ca->discard_buckets_in_flight_lock);
1790         darray_for_each(ca->discard_buckets_in_flight, i)
1791                 if (i->bucket == bucket) {
1792                         BUG_ON(!i->in_progress);
1793                         darray_remove_item(&ca->discard_buckets_in_flight, i);
1794                         goto found;
1795                 }
1796         BUG();
1797 found:
1798         mutex_unlock(&ca->discard_buckets_in_flight_lock);
1799 }
1800
1801 struct discard_buckets_state {
1802         u64             seen;
1803         u64             open;
1804         u64             need_journal_commit;
1805         u64             discarded;
1806 };
1807
1808 static int bch2_discard_one_bucket(struct btree_trans *trans,
1809                                    struct bch_dev *ca,
1810                                    struct btree_iter *need_discard_iter,
1811                                    struct bpos *discard_pos_done,
1812                                    struct discard_buckets_state *s,
1813                                    bool fastpath)
1814 {
1815         struct bch_fs *c = trans->c;
1816         struct bpos pos = need_discard_iter->pos;
1817         struct btree_iter iter = { NULL };
1818         struct bkey_s_c k;
1819         struct bkey_i_alloc_v4 *a;
1820         struct printbuf buf = PRINTBUF;
1821         bool discard_locked = false;
1822         int ret = 0;
1823
1824         if (bch2_bucket_is_open_safe(c, pos.inode, pos.offset)) {
1825                 s->open++;
1826                 goto out;
1827         }
1828
1829         u64 seq_ready = bch2_bucket_journal_seq_ready(&c->buckets_waiting_for_journal,
1830                                                       pos.inode, pos.offset);
1831         if (seq_ready > c->journal.flushed_seq_ondisk) {
1832                 if (seq_ready > c->journal.flushing_seq)
1833                         s->need_journal_commit++;
1834                 goto out;
1835         }
1836
1837         k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_alloc,
1838                                need_discard_iter->pos,
1839                                BTREE_ITER_cached);
1840         ret = bkey_err(k);
1841         if (ret)
1842                 goto out;
1843
1844         a = bch2_alloc_to_v4_mut(trans, k);
1845         ret = PTR_ERR_OR_ZERO(a);
1846         if (ret)
1847                 goto out;
1848
1849         if (a->v.data_type != BCH_DATA_need_discard) {
1850                 if (need_discard_or_freespace_err(trans, k, true, true, true)) {
1851                         ret = bch2_btree_bit_mod_iter(trans, need_discard_iter, false);
1852                         if (ret)
1853                                 goto out;
1854                         goto commit;
1855                 }
1856
1857                 goto out;
1858         }
1859
1860         if (!fastpath) {
1861                 if (discard_in_flight_add(ca, iter.pos.offset, true))
1862                         goto out;
1863
1864                 discard_locked = true;
1865         }
1866
1867         if (!bkey_eq(*discard_pos_done, iter.pos)) {
1868                 s->discarded++;
1869                 *discard_pos_done = iter.pos;
1870
1871                 if (ca->mi.discard && !c->opts.nochanges) {
1872                         /*
1873                          * This works without any other locks because this is the only
1874                          * thread that removes items from the need_discard tree
1875                          */
1876                         bch2_trans_unlock_long(trans);
1877                         blkdev_issue_discard(ca->disk_sb.bdev,
1878                                              k.k->p.offset * ca->mi.bucket_size,
1879                                              ca->mi.bucket_size,
1880                                              GFP_KERNEL);
1881                         ret = bch2_trans_relock_notrace(trans);
1882                         if (ret)
1883                                 goto out;
1884                 }
1885         }
1886
1887         SET_BCH_ALLOC_V4_NEED_DISCARD(&a->v, false);
1888         alloc_data_type_set(&a->v, a->v.data_type);
1889
1890         ret = bch2_trans_update(trans, &iter, &a->k_i, 0);
1891         if (ret)
1892                 goto out;
1893 commit:
1894         ret = bch2_trans_commit(trans, NULL, NULL,
1895                                 BCH_WATERMARK_btree|
1896                                 BCH_TRANS_COMMIT_no_enospc);
1897         if (ret)
1898                 goto out;
1899
1900         count_event(c, bucket_discard);
1901 out:
1902 fsck_err:
1903         if (discard_locked)
1904                 discard_in_flight_remove(ca, iter.pos.offset);
1905         if (!ret)
1906                 s->seen++;
1907         bch2_trans_iter_exit(trans, &iter);
1908         printbuf_exit(&buf);
1909         return ret;
1910 }
1911
1912 static void bch2_do_discards_work(struct work_struct *work)
1913 {
1914         struct bch_dev *ca = container_of(work, struct bch_dev, discard_work);
1915         struct bch_fs *c = ca->fs;
1916         struct discard_buckets_state s = {};
1917         struct bpos discard_pos_done = POS_MAX;
1918         int ret;
1919
1920         /*
1921          * We're doing the commit in bch2_discard_one_bucket instead of using
1922          * for_each_btree_key_commit() so that we can increment counters after
1923          * successful commit:
1924          */
1925         ret = bch2_trans_run(c,
1926                 for_each_btree_key_max(trans, iter,
1927                                    BTREE_ID_need_discard,
1928                                    POS(ca->dev_idx, 0),
1929                                    POS(ca->dev_idx, U64_MAX), 0, k,
1930                         bch2_discard_one_bucket(trans, ca, &iter, &discard_pos_done, &s, false)));
1931
1932         if (s.need_journal_commit > dev_buckets_available(ca, BCH_WATERMARK_normal))
1933                 bch2_journal_flush_async(&c->journal, NULL);
1934
1935         trace_discard_buckets(c, s.seen, s.open, s.need_journal_commit, s.discarded,
1936                               bch2_err_str(ret));
1937
1938         percpu_ref_put(&ca->io_ref);
1939         bch2_write_ref_put(c, BCH_WRITE_REF_discard);
1940 }
1941
1942 void bch2_dev_do_discards(struct bch_dev *ca)
1943 {
1944         struct bch_fs *c = ca->fs;
1945
1946         if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_discard))
1947                 return;
1948
1949         if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE))
1950                 goto put_write_ref;
1951
1952         if (queue_work(c->write_ref_wq, &ca->discard_work))
1953                 return;
1954
1955         percpu_ref_put(&ca->io_ref);
1956 put_write_ref:
1957         bch2_write_ref_put(c, BCH_WRITE_REF_discard);
1958 }
1959
1960 void bch2_do_discards(struct bch_fs *c)
1961 {
1962         for_each_member_device(c, ca)
1963                 bch2_dev_do_discards(ca);
1964 }
1965
1966 static int bch2_do_discards_fast_one(struct btree_trans *trans,
1967                                      struct bch_dev *ca,
1968                                      u64 bucket,
1969                                      struct bpos *discard_pos_done,
1970                                      struct discard_buckets_state *s)
1971 {
1972         struct btree_iter need_discard_iter;
1973         struct bkey_s_c discard_k = bch2_bkey_get_iter(trans, &need_discard_iter,
1974                                         BTREE_ID_need_discard, POS(ca->dev_idx, bucket), 0);
1975         int ret = bkey_err(discard_k);
1976         if (ret)
1977                 return ret;
1978
1979         if (log_fsck_err_on(discard_k.k->type != KEY_TYPE_set,
1980                             trans, discarding_bucket_not_in_need_discard_btree,
1981                             "attempting to discard bucket %u:%llu not in need_discard btree",
1982                             ca->dev_idx, bucket))
1983                 goto out;
1984
1985         ret = bch2_discard_one_bucket(trans, ca, &need_discard_iter, discard_pos_done, s, true);
1986 out:
1987 fsck_err:
1988         bch2_trans_iter_exit(trans, &need_discard_iter);
1989         return ret;
1990 }
1991
1992 static void bch2_do_discards_fast_work(struct work_struct *work)
1993 {
1994         struct bch_dev *ca = container_of(work, struct bch_dev, discard_fast_work);
1995         struct bch_fs *c = ca->fs;
1996         struct discard_buckets_state s = {};
1997         struct bpos discard_pos_done = POS_MAX;
1998         struct btree_trans *trans = bch2_trans_get(c);
1999         int ret = 0;
2000
2001         while (1) {
2002                 bool got_bucket = false;
2003                 u64 bucket;
2004
2005                 mutex_lock(&ca->discard_buckets_in_flight_lock);
2006                 darray_for_each(ca->discard_buckets_in_flight, i) {
2007                         if (i->in_progress)
2008                                 continue;
2009
2010                         got_bucket = true;
2011                         bucket = i->bucket;
2012                         i->in_progress = true;
2013                         break;
2014                 }
2015                 mutex_unlock(&ca->discard_buckets_in_flight_lock);
2016
2017                 if (!got_bucket)
2018                         break;
2019
2020                 ret = lockrestart_do(trans,
2021                         bch2_do_discards_fast_one(trans, ca, bucket, &discard_pos_done, &s));
2022                 bch_err_fn(c, ret);
2023
2024                 discard_in_flight_remove(ca, bucket);
2025
2026                 if (ret)
2027                         break;
2028         }
2029
2030         trace_discard_buckets_fast(c, s.seen, s.open, s.need_journal_commit, s.discarded, bch2_err_str(ret));
2031
2032         bch2_trans_put(trans);
2033         percpu_ref_put(&ca->io_ref);
2034         bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast);
2035 }
2036
2037 static void bch2_discard_one_bucket_fast(struct bch_dev *ca, u64 bucket)
2038 {
2039         struct bch_fs *c = ca->fs;
2040
2041         if (discard_in_flight_add(ca, bucket, false))
2042                 return;
2043
2044         if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_discard_fast))
2045                 return;
2046
2047         if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE))
2048                 goto put_ref;
2049
2050         if (queue_work(c->write_ref_wq, &ca->discard_fast_work))
2051                 return;
2052
2053         percpu_ref_put(&ca->io_ref);
2054 put_ref:
2055         bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast);
2056 }
2057
2058 static int invalidate_one_bucket(struct btree_trans *trans,
2059                                  struct btree_iter *lru_iter,
2060                                  struct bkey_s_c lru_k,
2061                                  s64 *nr_to_invalidate)
2062 {
2063         struct bch_fs *c = trans->c;
2064         struct bkey_i_alloc_v4 *a = NULL;
2065         struct printbuf buf = PRINTBUF;
2066         struct bpos bucket = u64_to_bucket(lru_k.k->p.offset);
2067         unsigned cached_sectors;
2068         int ret = 0;
2069
2070         if (*nr_to_invalidate <= 0)
2071                 return 1;
2072
2073         if (!bch2_dev_bucket_exists(c, bucket)) {
2074                 if (fsck_err(trans, lru_entry_to_invalid_bucket,
2075                              "lru key points to nonexistent device:bucket %llu:%llu",
2076                              bucket.inode, bucket.offset))
2077                         return bch2_btree_bit_mod_buffered(trans, BTREE_ID_lru, lru_iter->pos, false);
2078                 goto out;
2079         }
2080
2081         if (bch2_bucket_is_open_safe(c, bucket.inode, bucket.offset))
2082                 return 0;
2083
2084         a = bch2_trans_start_alloc_update(trans, bucket, BTREE_TRIGGER_bucket_invalidate);
2085         ret = PTR_ERR_OR_ZERO(a);
2086         if (ret)
2087                 goto out;
2088
2089         /* We expect harmless races here due to the btree write buffer: */
2090         if (lru_pos_time(lru_iter->pos) != alloc_lru_idx_read(a->v))
2091                 goto out;
2092
2093         BUG_ON(a->v.data_type != BCH_DATA_cached);
2094         BUG_ON(a->v.dirty_sectors);
2095
2096         if (!a->v.cached_sectors)
2097                 bch_err(c, "invalidating empty bucket, confused");
2098
2099         cached_sectors = a->v.cached_sectors;
2100
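        /*
         * Incrementing the bucket gen invalidates any remaining cached-data
         * pointers into this bucket; it can then be reused as an empty
         * bucket:
         */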
2101         SET_BCH_ALLOC_V4_NEED_INC_GEN(&a->v, false);
2102         a->v.gen++;
2103         a->v.data_type          = 0;
2104         a->v.dirty_sectors      = 0;
2105         a->v.stripe_sectors     = 0;
2106         a->v.cached_sectors     = 0;
2107         a->v.io_time[READ]      = bch2_current_io_time(c, READ);
2108         a->v.io_time[WRITE]     = bch2_current_io_time(c, WRITE);
2109
2110         ret = bch2_trans_commit(trans, NULL, NULL,
2111                                 BCH_WATERMARK_btree|
2112                                 BCH_TRANS_COMMIT_no_enospc);
2113         if (ret)
2114                 goto out;
2115
2116         trace_and_count(c, bucket_invalidate, c, bucket.inode, bucket.offset, cached_sectors);
2117         --*nr_to_invalidate;
2118 out:
2119 fsck_err:
2120         printbuf_exit(&buf);
2121         return ret;
2122 }
2123
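/*
 * The caller positions @iter partway through the device's lru range; wrap
 * around to the start of the range once when we hit the end, so no entries
 * are skipped:
 */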
2124 static struct bkey_s_c next_lru_key(struct btree_trans *trans, struct btree_iter *iter,
2125                                     struct bch_dev *ca, bool *wrapped)
2126 {
2127         struct bkey_s_c k;
2128 again:
2129         k = bch2_btree_iter_peek_max(iter, lru_pos(ca->dev_idx, U64_MAX, LRU_TIME_MAX));
2130         if (!k.k && !*wrapped) {
2131                 bch2_btree_iter_set_pos(iter, lru_pos(ca->dev_idx, 0, 0));
2132                 *wrapped = true;
2133                 goto again;
2134         }
2135
2136         return k;
2137 }
2138
2139 static void bch2_do_invalidates_work(struct work_struct *work)
2140 {
2141         struct bch_dev *ca = container_of(work, struct bch_dev, invalidate_work);
2142         struct bch_fs *c = ca->fs;
2143         struct btree_trans *trans = bch2_trans_get(c);
2144         int ret = 0;
2145
2146         ret = bch2_btree_write_buffer_tryflush(trans);
2147         if (ret)
2148                 goto err;
2149
2150         s64 nr_to_invalidate =
2151                 should_invalidate_buckets(ca, bch2_dev_usage_read(ca));
2152         struct btree_iter iter;
2153         bool wrapped = false;
2154
2155         bch2_trans_iter_init(trans, &iter, BTREE_ID_lru,
2156                              lru_pos(ca->dev_idx, 0,
2157                                      ((bch2_current_io_time(c, READ) + U32_MAX) &
2158                                       LRU_TIME_MAX)), 0);
2159
2160         while (true) {
2161                 bch2_trans_begin(trans);
2162
2163                 struct bkey_s_c k = next_lru_key(trans, &iter, ca, &wrapped);
2164                 ret = bkey_err(k);
2165                 if (ret)
2166                         goto restart_err;
2167                 if (!k.k)
2168                         break;
2169
2170                 ret = invalidate_one_bucket(trans, &iter, k, &nr_to_invalidate);
2171 restart_err:
2172                 if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
2173                         continue;
2174                 if (ret)
2175                         break;
2176
2177                 bch2_btree_iter_advance(&iter);
2178         }
2179         bch2_trans_iter_exit(trans, &iter);
2180 err:
2181         bch2_trans_put(trans);
2182         percpu_ref_put(&ca->io_ref);
2183         bch2_write_ref_put(c, BCH_WRITE_REF_invalidate);
2184 }
2185
2186 void bch2_dev_do_invalidates(struct bch_dev *ca)
2187 {
2188         struct bch_fs *c = ca->fs;
2189
2190         if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_invalidate))
2191                 return;
2192
2193         if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE))
2194                 goto put_ref;
2195
2196         if (queue_work(c->write_ref_wq, &ca->invalidate_work))
2197                 return;
2198
2199         percpu_ref_put(&ca->io_ref);
2200 put_ref:
2201         bch2_write_ref_put(c, BCH_WRITE_REF_invalidate);
2202 }
2203
2204 void bch2_do_invalidates(struct bch_fs *c)
2205 {
2206         for_each_member_device(c, ca)
2207                 bch2_dev_do_invalidates(ca);
2208 }
2209
2210 int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca,
2211                             u64 bucket_start, u64 bucket_end)
2212 {
2213         struct btree_trans *trans = bch2_trans_get(c);
2214         struct btree_iter iter;
2215         struct bkey_s_c k;
2216         struct bkey hole;
2217         struct bpos end = POS(ca->dev_idx, bucket_end);
2218         struct bch_member *m;
2219         unsigned long last_updated = jiffies;
2220         int ret;
2221
2222         BUG_ON(bucket_start > bucket_end);
2223         BUG_ON(bucket_end > ca->mi.nbuckets);
2224
2225         bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
2226                 POS(ca->dev_idx, max_t(u64, ca->mi.first_bucket, bucket_start)),
2227                 BTREE_ITER_prefetch);
2228         /*
2229          * Scan the alloc btree for every bucket on @ca, and add buckets to the
2230          * freespace/need_discard/need_gc_gens btrees as needed:
2231          */
2232         while (1) {
2233                 if (time_after(jiffies, last_updated + HZ * 10)) {
2234                         bch_info(ca, "%s: currently at %llu/%llu",
2235                                  __func__, iter.pos.offset, ca->mi.nbuckets);
2236                         last_updated = jiffies;
2237                 }
2238
2239                 bch2_trans_begin(trans);
2240
2241                 if (bkey_ge(iter.pos, end)) {
2242                         ret = 0;
2243                         break;
2244                 }
2245
2246                 k = bch2_get_key_or_hole(&iter, end, &hole);
2247                 ret = bkey_err(k);
2248                 if (ret)
2249                         goto bkey_err;
2250
2251                 if (k.k->type) {
2252                         /*
2253                          * We process live keys in the alloc btree one at a
2254                          * time:
2255                          */
2256                         struct bch_alloc_v4 a_convert;
2257                         const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &a_convert);
2258
2259                         ret =   bch2_bucket_do_index(trans, ca, k, a, true) ?:
2260                                 bch2_trans_commit(trans, NULL, NULL,
2261                                                   BCH_TRANS_COMMIT_no_enospc);
2262                         if (ret)
2263                                 goto bkey_err;
2264
2265                         bch2_btree_iter_advance(&iter);
2266                 } else {
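                        /*
                         * A hole in the alloc btree is a range of empty
                         * buckets: we can cover the whole range with a single
                         * KEY_TYPE_set extent in the freespace btree:
                         */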
2267                         struct bkey_i *freespace;
2268
2269                         freespace = bch2_trans_kmalloc(trans, sizeof(*freespace));
2270                         ret = PTR_ERR_OR_ZERO(freespace);
2271                         if (ret)
2272                                 goto bkey_err;
2273
2274                         bkey_init(&freespace->k);
2275                         freespace->k.type       = KEY_TYPE_set;
2276                         freespace->k.p          = k.k->p;
2277                         freespace->k.size       = k.k->size;
2278
2279                         ret = bch2_btree_insert_trans(trans, BTREE_ID_freespace, freespace, 0) ?:
2280                                 bch2_trans_commit(trans, NULL, NULL,
2281                                                   BCH_TRANS_COMMIT_no_enospc);
2282                         if (ret)
2283                                 goto bkey_err;
2284
2285                         bch2_btree_iter_set_pos(&iter, k.k->p);
2286                 }
2287 bkey_err:
2288                 if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
2289                         continue;
2290                 if (ret)
2291                         break;
2292         }
2293
2294         bch2_trans_iter_exit(trans, &iter);
2295         bch2_trans_put(trans);
2296
2297         if (ret < 0) {
2298                 bch_err_msg(ca, ret, "initializing free space");
2299                 return ret;
2300         }
2301
2302         mutex_lock(&c->sb_lock);
2303         m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
2304         SET_BCH_MEMBER_FREESPACE_INITIALIZED(m, true);
2305         mutex_unlock(&c->sb_lock);
2306
2307         return 0;
2308 }
2309
2310 int bch2_fs_freespace_init(struct bch_fs *c)
2311 {
2312         int ret = 0;
2313         bool doing_init = false;
2314
2315         /*
2316          * We can crash during the device add path, so we need to check this on
2317          * every mount:
2318          */
2319
2320         for_each_member_device(c, ca) {
2321                 if (ca->mi.freespace_initialized)
2322                         continue;
2323
2324                 if (!doing_init) {
2325                         bch_info(c, "initializing freespace");
2326                         doing_init = true;
2327                 }
2328
2329                 ret = bch2_dev_freespace_init(c, ca, 0, ca->mi.nbuckets);
2330                 if (ret) {
2331                         bch2_dev_put(ca);
2332                         bch_err_fn(c, ret);
2333                         return ret;
2334                 }
2335         }
2336
2337         if (doing_init) {
2338                 mutex_lock(&c->sb_lock);
2339                 bch2_write_super(c);
2340                 mutex_unlock(&c->sb_lock);
2341                 bch_verbose(c, "done initializing freespace");
2342         }
2343
2344         return 0;
2345 }
2346
2347 /* device removal */
2348
2349 int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca)
2350 {
2351         struct bpos start       = POS(ca->dev_idx, 0);
2352         struct bpos end         = POS(ca->dev_idx, U64_MAX);
2353         int ret;
2354
2355         /*
2356          * We clear the LRU and need_discard btrees first so that we don't race
2357          * with bch2_do_invalidates() and bch2_do_discards()
2358          */
2359         ret =   bch2_dev_remove_stripes(c, ca->dev_idx) ?:
2360                 bch2_btree_delete_range(c, BTREE_ID_lru, start, end,
2361                                         BTREE_TRIGGER_norun, NULL) ?:
2362                 bch2_btree_delete_range(c, BTREE_ID_need_discard, start, end,
2363                                         BTREE_TRIGGER_norun, NULL) ?:
2364                 bch2_btree_delete_range(c, BTREE_ID_freespace, start, end,
2365                                         BTREE_TRIGGER_norun, NULL) ?:
2366                 bch2_btree_delete_range(c, BTREE_ID_backpointers, start, end,
2367                                         BTREE_TRIGGER_norun, NULL) ?:
2368                 bch2_btree_delete_range(c, BTREE_ID_bucket_gens, start, end,
2369                                         BTREE_TRIGGER_norun, NULL) ?:
2370                 bch2_btree_delete_range(c, BTREE_ID_alloc, start, end,
2371                                         BTREE_TRIGGER_norun, NULL) ?:
2372                 bch2_dev_usage_remove(c, ca->dev_idx);
2373         bch_err_msg(ca, ret, "removing dev alloc info");
2374         return ret;
2375 }
2376
2377 /* Bucket IO clocks: */
2378
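/*
 * Bucket IO clocks record, per bucket, the last read/write time in IO clock
 * time (a clock that advances with IO done, not wallclock time);
 * io_time[READ] is what the cached-data lru is keyed off of:
 */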
2379 static int __bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev,
2380                                 size_t bucket_nr, int rw)
2381 {
2382         struct bch_fs *c = trans->c;
2383
2384         struct btree_iter iter;
2385         struct bkey_i_alloc_v4 *a =
2386                 bch2_trans_start_alloc_update_noupdate(trans, &iter, POS(dev, bucket_nr));
2387         int ret = PTR_ERR_OR_ZERO(a);
2388         if (ret)
2389                 return ret;
2390
2391         u64 now = bch2_current_io_time(c, rw);
2392         if (a->v.io_time[rw] == now)
2393                 goto out;
2394
2395         a->v.io_time[rw] = now;
2396
2397         ret   = bch2_trans_update(trans, &iter, &a->k_i, 0) ?:
2398                 bch2_trans_commit(trans, NULL, NULL, 0);
2399 out:
2400         bch2_trans_iter_exit(trans, &iter);
2401         return ret;
2402 }
2403
2404 int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev,
2405                               size_t bucket_nr, int rw)
2406 {
2407         if (bch2_trans_relock(trans))
2408                 bch2_trans_begin(trans);
2409
2410         return nested_lockrestart_do(trans, __bch2_bucket_io_time_reset(trans, dev, bucket_nr, rw));
2411 }
2412
2413 /* Startup/shutdown (ro/rw): */
2414
2415 void bch2_recalc_capacity(struct bch_fs *c)
2416 {
2417         u64 capacity = 0, reserved_sectors = 0, gc_reserve;
2418         unsigned bucket_size_max = 0;
2419         unsigned long ra_pages = 0;
2420
2421         lockdep_assert_held(&c->state_lock);
2422
2423         for_each_online_member(c, ca) {
2424                 struct backing_dev_info *bdi = ca->disk_sb.bdev->bd_disk->bdi;
2425
2426                 ra_pages += bdi->ra_pages;
2427         }
2428
2429         bch2_set_ra_pages(c, ra_pages);
2430
2431         for_each_rw_member(c, ca) {
2432                 u64 dev_reserve = 0;
2433
2434                 /*
2435                  * We need to reserve buckets (from the number
2436                  * of currently available buckets) against
2437                  * foreground writes so that mainly copygc can
2438                  * make forward progress.
2439                  *
2440                  * We need enough to refill the various reserves
2441                  * from scratch - copygc will use its entire
2442                  * reserve all at once, then run again when
2443                  * its reserve is refilled (from the formerly
2444                  * available buckets).
2445                  *
2446                  * This reserve is just used when considering if
2447                  * allocations for foreground writes must wait -
2448                  * not -ENOSPC calculations.
2449                  */
2450
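                /*
                 * Illustrative example (made-up numbers): with 2^20 buckets
                 * of 1MiB each, the copygc term alone (nbuckets >> 6)
                 * reserves 2^14 buckets = 16GiB, and reserved_sectors gets
                 * twice the per-device reserve added below:
                 */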
2451                 dev_reserve += ca->nr_btree_reserve * 2;
2452                 dev_reserve += ca->mi.nbuckets >> 6; /* copygc reserve */
2453
2454                 dev_reserve += 1;       /* btree write point */
2455                 dev_reserve += 1;       /* copygc write point */
2456                 dev_reserve += 1;       /* rebalance write point */
2457
2458                 dev_reserve *= ca->mi.bucket_size;
2459
2460                 capacity += bucket_to_sector(ca, ca->mi.nbuckets -
2461                                              ca->mi.first_bucket);
2462
2463                 reserved_sectors += dev_reserve * 2;
2464
2465                 bucket_size_max = max_t(unsigned, bucket_size_max,
2466                                         ca->mi.bucket_size);
2467         }
2468
2469         gc_reserve = c->opts.gc_reserve_bytes
2470                 ? c->opts.gc_reserve_bytes >> 9
2471                 : div64_u64(capacity * c->opts.gc_reserve_percent, 100);
2472
2473         reserved_sectors = max(gc_reserve, reserved_sectors);
2474
2475         reserved_sectors = min(reserved_sectors, capacity);
2476
2477         c->reserved = reserved_sectors;
2478         c->capacity = capacity - reserved_sectors;
2479
2480         c->bucket_size_max = bucket_size_max;
2481
2482         /* Wake up in case someone was waiting for buckets */
2483         closure_wake_up(&c->freelist_wait);
2484 }
2485
2486 u64 bch2_min_rw_member_capacity(struct bch_fs *c)
2487 {
2488         u64 ret = U64_MAX;
2489
2490         for_each_rw_member(c, ca)
2491                 ret = min(ret, ca->mi.nbuckets * ca->mi.bucket_size);
2492         return ret;
2493 }
2494
2495 static bool bch2_dev_has_open_write_point(struct bch_fs *c, struct bch_dev *ca)
2496 {
2497         struct open_bucket *ob;
2498         bool ret = false;
2499
2500         for (ob = c->open_buckets;
2501              ob < c->open_buckets + ARRAY_SIZE(c->open_buckets);
2502              ob++) {
2503                 spin_lock(&ob->lock);
2504                 if (ob->valid && !ob->on_partial_list &&
2505                     ob->dev == ca->dev_idx)
2506                         ret = true;
2507                 spin_unlock(&ob->lock);
2508         }
2509
2510         return ret;
2511 }
2512
2513 /* device goes ro: */
2514 void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca)
2515 {
2516         lockdep_assert_held(&c->state_lock);
2517
2518         /* First, remove device from allocation groups: */
2519
2520         for (unsigned i = 0; i < ARRAY_SIZE(c->rw_devs); i++)
2521                 clear_bit(ca->dev_idx, c->rw_devs[i].d);
2522
2523         c->rw_devs_change_count++;
2524
2525         /*
2526          * Capacity is calculated based off of devices in allocation groups:
2527          */
2528         bch2_recalc_capacity(c);
2529
2530         bch2_open_buckets_stop(c, ca, false);
2531
2532         /*
2533          * Wake up threads that were blocked on allocation, so they can notice
2534          * the device can no longer be removed and the capacity has changed:
2535          */
2536         closure_wake_up(&c->freelist_wait);
2537
2538         /*
2539          * journal_res_get() can block waiting for free space in the journal -
2540          * it needs to notice there may not be devices to allocate from anymore:
2541          */
2542         wake_up(&c->journal.wait);
2543
2544         /* Now wait for any in flight writes: */
2545
2546         closure_wait_event(&c->open_buckets_wait,
2547                            !bch2_dev_has_open_write_point(c, ca));
2548 }
2549
2550 /* device goes rw: */
2551 void bch2_dev_allocator_add(struct bch_fs *c, struct bch_dev *ca)
2552 {
2553         lockdep_assert_held(&c->state_lock);
2554
2555         for (unsigned i = 0; i < ARRAY_SIZE(c->rw_devs); i++)
2556                 if (ca->mi.data_allowed & (1 << i))
2557                         set_bit(ca->dev_idx, c->rw_devs[i].d);
2558
2559         c->rw_devs_change_count++;
2560 }
2561
2562 void bch2_dev_allocator_background_exit(struct bch_dev *ca)
2563 {
2564         darray_exit(&ca->discard_buckets_in_flight);
2565 }
2566
2567 void bch2_dev_allocator_background_init(struct bch_dev *ca)
2568 {
2569         mutex_init(&ca->discard_buckets_in_flight_lock);
2570         INIT_WORK(&ca->discard_work, bch2_do_discards_work);
2571         INIT_WORK(&ca->discard_fast_work, bch2_do_discards_fast_work);
2572         INIT_WORK(&ca->invalidate_work, bch2_do_invalidates_work);
2573 }
2574
2575 void bch2_fs_allocator_background_init(struct bch_fs *c)
2576 {
2577         spin_lock_init(&c->freelist_lock);
2578 }