fs/bcachefs/ec.c
1 // SPDX-License-Identifier: GPL-2.0
2
3 /* erasure coding */
4
5 #include "bcachefs.h"
6 #include "alloc_background.h"
7 #include "alloc_foreground.h"
8 #include "backpointers.h"
9 #include "bkey_buf.h"
10 #include "bset.h"
11 #include "btree_gc.h"
12 #include "btree_update.h"
13 #include "btree_write_buffer.h"
14 #include "buckets.h"
15 #include "checksum.h"
16 #include "disk_accounting.h"
17 #include "disk_groups.h"
18 #include "ec.h"
19 #include "error.h"
20 #include "io_read.h"
21 #include "io_write.h"
22 #include "keylist.h"
23 #include "recovery.h"
24 #include "replicas.h"
25 #include "super-io.h"
26 #include "util.h"
27
28 #include <linux/sort.h>
29
30 #ifdef __KERNEL__
31
32 #include <linux/raid/pq.h>
33 #include <linux/raid/xor.h>
34
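/*
 * Single-failure recovery is plain XOR parity: p = d0 ^ d1 ^ ... ^ dn,
 * so any one missing block equals the XOR of all the others.
 *
 * Example with two data blocks:
 *   p  = d0 ^ d1		0b1010 ^ 0b0110 = 0b1100
 *   d1 = d0 ^ p		0b1010 ^ 0b1100 = 0b0110
 *
 * raid5_recov() below swaps the failed block into slot 0, seeds it with
 * data[1], then XORs in the remaining blocks in batches of up to
 * MAX_XOR_BLOCKS.
 */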
35 static void raid5_recov(unsigned disks, unsigned failed_idx,
36                         size_t size, void **data)
37 {
38         unsigned i = 2, nr;
39
40         BUG_ON(failed_idx >= disks);
41
42         swap(data[0], data[failed_idx]);
43         memcpy(data[0], data[1], size);
44
45         while (i < disks) {
46                 nr = min_t(unsigned, disks - i, MAX_XOR_BLOCKS);
47                 xor_blocks(nr, size, data[0], data + i);
48                 i += nr;
49         }
50
51         swap(data[0], data[failed_idx]);
52 }
53
54 static void raid_gen(int nd, int np, size_t size, void **v)
55 {
56         if (np >= 1)
57                 raid5_recov(nd + np, nd, size, v);
58         if (np >= 2)
59                 raid6_call.gen_syndrome(nd + np, size, v);
60         BUG_ON(np > 2);
61 }
62
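/*
 * Recovery dispatch on which blocks failed (ir[] holds the failed block
 * indices in ascending order):
 *  1 failure,  data or p	-> XOR reconstruct over data + p
 *  1 failure,  q		-> regenerate the syndrome
 *  2 failures, data + data	-> raid6_2data_recov()
 *  2 failures, data + p	-> raid6_datap_recov()
 *  2 failures, data + q	-> XOR-recover the data block, regen q
 *  2 failures, p + q		-> recompute both from the data blocks
 */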
63 static void raid_rec(int nr, int *ir, int nd, int np, size_t size, void **v)
64 {
65         switch (nr) {
66         case 0:
67                 break;
68         case 1:
69                 if (ir[0] < nd + 1)
70                         raid5_recov(nd + 1, ir[0], size, v);
71                 else
72                         raid6_call.gen_syndrome(nd + np, size, v);
73                 break;
74         case 2:
75                 if (ir[1] < nd) {
76                         /* data+data failure. */
77                         raid6_2data_recov(nd + np, size, ir[0], ir[1], v);
78                 } else if (ir[0] < nd) {
79                         /* data + p/q failure */
80
81                         if (ir[1] == nd) /* data + p failure */
82                                 raid6_datap_recov(nd + np, size, ir[0], v);
83                         else { /* data + q failure */
84                                 raid5_recov(nd + 1, ir[0], size, v);
85                                 raid6_call.gen_syndrome(nd + np, size, v);
86                         }
87                 } else {
88                         raid_gen(nd, np, size, v);
89                 }
90                 break;
91         default:
92                 BUG();
93         }
94 }
95
96 #else
97
98 #include <raid/raid.h>
99
100 #endif
101
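/*
 * Per-block I/O state: one or more of these are allocated per stripe
 * block in ec_block_io(). The embedded bio must come last so that
 * ec_block_endio() can recover the containing ec_bio with container_of().
 */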
102 struct ec_bio {
103         struct bch_dev          *ca;
104         struct ec_stripe_buf    *buf;
105         size_t                  idx;
106         struct bio              bio;
107 };
108
109 /* Stripes btree keys: */
110
111 int bch2_stripe_validate(struct bch_fs *c, struct bkey_s_c k,
112                          enum bch_validate_flags flags)
113 {
114         const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
115         int ret = 0;
116
117         bkey_fsck_err_on(bkey_eq(k.k->p, POS_MIN) ||
118                          bpos_gt(k.k->p, POS(0, U32_MAX)),
119                          c, stripe_pos_bad,
120                          "stripe at bad pos");
121
122         bkey_fsck_err_on(bkey_val_u64s(k.k) < stripe_val_u64s(s),
123                          c, stripe_val_size_bad,
124                          "incorrect value size (%zu < %u)",
125                          bkey_val_u64s(k.k), stripe_val_u64s(s));
126
127         bkey_fsck_err_on(s->csum_granularity_bits >= 64,
128                          c, stripe_csum_granularity_bad,
129                          "invalid csum granularity (%u >= 64)",
130                          s->csum_granularity_bits);
131
132         ret = bch2_bkey_ptrs_validate(c, k, flags);
133 fsck_err:
134         return ret;
135 }
136
137 void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c,
138                          struct bkey_s_c k)
139 {
140         const struct bch_stripe *sp = bkey_s_c_to_stripe(k).v;
141         struct bch_stripe s = {};
142
143         memcpy(&s, sp, min(sizeof(s), bkey_val_bytes(k.k)));
144
145         unsigned nr_data = s.nr_blocks - s.nr_redundant;
146
147         prt_printf(out, "algo %u sectors %u blocks %u:%u csum ",
148                    s.algorithm,
149                    le16_to_cpu(s.sectors),
150                    nr_data,
151                    s.nr_redundant);
152         bch2_prt_csum_type(out, s.csum_type);
153         prt_str(out, " gran ");
154         if (s.csum_granularity_bits < 64)
155                 prt_printf(out, "%llu", 1ULL << s.csum_granularity_bits);
156         else
157                 prt_printf(out, "(invalid shift %u)", s.csum_granularity_bits);
158
159         if (s.disk_label) {
160                 prt_str(out, " label");
161                 bch2_disk_path_to_text(out, c, s.disk_label - 1);
162         }
163
164         for (unsigned i = 0; i < s.nr_blocks; i++) {
165                 const struct bch_extent_ptr *ptr = sp->ptrs + i;
166
167                 if ((void *) ptr >= bkey_val_end(k))
168                         break;
169
170                 prt_char(out, ' ');
171                 bch2_extent_ptr_to_text(out, c, ptr);
172
173                 if (s.csum_type < BCH_CSUM_NR &&
174                     i < nr_data &&
175                     stripe_blockcount_offset(&s, i) < bkey_val_bytes(k.k))
176                         prt_printf(out, "#%u", stripe_blockcount_get(sp, i));
177         }
178 }
179
180 /* Triggers: */
181
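/*
 * Stripe triggers keep the alloc btree in sync with the stripes btree:
 * every bucket referenced by a stripe records the stripe index and
 * redundancy in its alloc key, parity buckets are additionally charged
 * the full stripe's sectors as dirty data, and deleting the stripe
 * reverses both.
 */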
182 static int __mark_stripe_bucket(struct btree_trans *trans,
183                                 struct bch_dev *ca,
184                                 struct bkey_s_c_stripe s,
185                                 unsigned ptr_idx, bool deleting,
186                                 struct bpos bucket,
187                                 struct bch_alloc_v4 *a,
188                                 enum btree_iter_update_trigger_flags flags)
189 {
190         const struct bch_extent_ptr *ptr = s.v->ptrs + ptr_idx;
191         unsigned nr_data = s.v->nr_blocks - s.v->nr_redundant;
192         bool parity = ptr_idx >= nr_data;
193         enum bch_data_type data_type = parity ? BCH_DATA_parity : BCH_DATA_stripe;
194         s64 sectors = parity ? le16_to_cpu(s.v->sectors) : 0;
195         struct printbuf buf = PRINTBUF;
196         int ret = 0;
197
198         struct bch_fs *c = trans->c;
199         if (deleting)
200                 sectors = -sectors;
201
202         if (!deleting) {
203                 if (bch2_trans_inconsistent_on(a->stripe ||
204                                                a->stripe_redundancy, trans,
205                                 "bucket %llu:%llu gen %u data type %s dirty_sectors %u: multiple stripes using same bucket (%u, %llu)\n%s",
206                                 bucket.inode, bucket.offset, a->gen,
207                                 bch2_data_type_str(a->data_type),
208                                 a->dirty_sectors,
209                                 a->stripe, s.k->p.offset,
210                                 (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
211                         ret = -BCH_ERR_mark_stripe;
212                         goto err;
213                 }
214
215                 if (bch2_trans_inconsistent_on(parity && bch2_bucket_sectors_total(*a), trans,
216                                 "bucket %llu:%llu gen %u data type %s dirty_sectors %u cached_sectors %u: data already in parity bucket\n%s",
217                                 bucket.inode, bucket.offset, a->gen,
218                                 bch2_data_type_str(a->data_type),
219                                 a->dirty_sectors,
220                                 a->cached_sectors,
221                                 (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
222                         ret = -BCH_ERR_mark_stripe;
223                         goto err;
224                 }
225         } else {
226                 if (bch2_trans_inconsistent_on(a->stripe != s.k->p.offset ||
227                                                a->stripe_redundancy != s.v->nr_redundant, trans,
228                                 "bucket %llu:%llu gen %u: not marked as stripe when deleting stripe (got %u)\n%s",
229                                 bucket.inode, bucket.offset, a->gen,
230                                 a->stripe,
231                                 (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
232                         ret = -BCH_ERR_mark_stripe;
233                         goto err;
234                 }
235
236                 if (bch2_trans_inconsistent_on(a->data_type != data_type, trans,
237                                 "bucket %llu:%llu gen %u data type %s: wrong data type when stripe, should be %s\n%s",
238                                 bucket.inode, bucket.offset, a->gen,
239                                 bch2_data_type_str(a->data_type),
240                                 bch2_data_type_str(data_type),
241                                 (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
242                         ret = -BCH_ERR_mark_stripe;
243                         goto err;
244                 }
245
246                 if (bch2_trans_inconsistent_on(parity &&
247                                                (a->dirty_sectors != -sectors ||
248                                                 a->cached_sectors), trans,
249                                 "bucket %llu:%llu gen %u dirty_sectors %u cached_sectors %u: wrong sectors when deleting parity block of stripe\n%s",
250                                 bucket.inode, bucket.offset, a->gen,
251                                 a->dirty_sectors,
252                                 a->cached_sectors,
253                                 (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
254                         ret = -BCH_ERR_mark_stripe;
255                         goto err;
256                 }
257         }
258
259         if (sectors) {
260                 ret = bch2_bucket_ref_update(trans, ca, s.s_c, ptr, sectors, data_type,
261                                              a->gen, a->data_type, &a->dirty_sectors);
262                 if (ret)
263                         goto err;
264         }
265
266         if (!deleting) {
267                 a->stripe               = s.k->p.offset;
268                 a->stripe_redundancy    = s.v->nr_redundant;
269                 alloc_data_type_set(a, data_type);
270         } else {
271                 a->stripe               = 0;
272                 a->stripe_redundancy    = 0;
273                 alloc_data_type_set(a, BCH_DATA_user);
274         }
275 err:
276         printbuf_exit(&buf);
277         return ret;
278 }
279
280 static int mark_stripe_bucket(struct btree_trans *trans,
281                               struct bkey_s_c_stripe s,
282                               unsigned ptr_idx, bool deleting,
283                               enum btree_iter_update_trigger_flags flags)
284 {
285         struct bch_fs *c = trans->c;
286         const struct bch_extent_ptr *ptr = s.v->ptrs + ptr_idx;
287         struct printbuf buf = PRINTBUF;
288         int ret = 0;
289
290         struct bch_dev *ca = bch2_dev_tryget(c, ptr->dev);
291         if (unlikely(!ca)) {
292                 if (ptr->dev != BCH_SB_MEMBER_INVALID && !(flags & BTREE_TRIGGER_overwrite))
293                         ret = -BCH_ERR_mark_stripe;
294                 goto err;
295         }
296
297         struct bpos bucket = PTR_BUCKET_POS(ca, ptr);
298
299         if (flags & BTREE_TRIGGER_transactional) {
300                 struct bkey_i_alloc_v4 *a =
301                         bch2_trans_start_alloc_update(trans, bucket, 0);
302                 ret = PTR_ERR_OR_ZERO(a) ?:
303                         __mark_stripe_bucket(trans, ca, s, ptr_idx, deleting, bucket, &a->v, flags);
304         }
305
306         if (flags & BTREE_TRIGGER_gc) {
307                 percpu_down_read(&c->mark_lock);
308                 struct bucket *g = gc_bucket(ca, bucket.offset);
309                 if (bch2_fs_inconsistent_on(!g, c, "reference to invalid bucket on device %u\n  %s",
310                                             ptr->dev,
311                                             (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
312                         ret = -BCH_ERR_mark_stripe;
313                         goto err_unlock;
314                 }
315
316                 bucket_lock(g);
317                 struct bch_alloc_v4 old = bucket_m_to_alloc(*g), new = old;
318                 ret = __mark_stripe_bucket(trans, ca, s, ptr_idx, deleting, bucket, &new, flags);
319                 alloc_to_bucket(g, new);
320                 bucket_unlock(g);
321 err_unlock:
322                 percpu_up_read(&c->mark_lock);
323                 if (!ret)
324                         ret = bch2_alloc_key_to_dev_counters(trans, ca, &old, &new, flags);
325         }
326 err:
327         bch2_dev_put(ca);
328         printbuf_exit(&buf);
329         return ret;
330 }
331
332 static int mark_stripe_buckets(struct btree_trans *trans,
333                                struct bkey_s_c old, struct bkey_s_c new,
334                                enum btree_iter_update_trigger_flags flags)
335 {
336         const struct bch_stripe *old_s = old.k->type == KEY_TYPE_stripe
337                 ? bkey_s_c_to_stripe(old).v : NULL;
338         const struct bch_stripe *new_s = new.k->type == KEY_TYPE_stripe
339                 ? bkey_s_c_to_stripe(new).v : NULL;
340
341         BUG_ON(old_s && new_s && old_s->nr_blocks != new_s->nr_blocks);
342
343         unsigned nr_blocks = new_s ? new_s->nr_blocks : old_s->nr_blocks;
344
345         for (unsigned i = 0; i < nr_blocks; i++) {
346                 if (new_s && old_s &&
347                     !memcmp(&new_s->ptrs[i],
348                             &old_s->ptrs[i],
349                             sizeof(new_s->ptrs[i])))
350                         continue;
351
352                 if (new_s) {
353                         int ret = mark_stripe_bucket(trans,
354                                         bkey_s_c_to_stripe(new), i, false, flags);
355                         if (ret)
356                                 return ret;
357                 }
358
359                 if (old_s) {
360                         int ret = mark_stripe_bucket(trans,
361                                         bkey_s_c_to_stripe(old), i, true, flags);
362                         if (ret)
363                                 return ret;
364                 }
365         }
366
367         return 0;
368 }
369
370 static inline void stripe_to_mem(struct stripe *m, const struct bch_stripe *s)
371 {
372         m->sectors      = le16_to_cpu(s->sectors);
373         m->algorithm    = s->algorithm;
374         m->nr_blocks    = s->nr_blocks;
375         m->nr_redundant = s->nr_redundant;
376         m->disk_label   = s->disk_label;
377         m->blocks_nonempty = 0;
378
379         for (unsigned i = 0; i < s->nr_blocks; i++)
380                 m->blocks_nonempty += !!stripe_blockcount_get(s, i);
381 }
382
383 int bch2_trigger_stripe(struct btree_trans *trans,
384                         enum btree_id btree, unsigned level,
385                         struct bkey_s_c old, struct bkey_s _new,
386                         enum btree_iter_update_trigger_flags flags)
387 {
388         struct bkey_s_c new = _new.s_c;
389         struct bch_fs *c = trans->c;
390         u64 idx = new.k->p.offset;
391         const struct bch_stripe *old_s = old.k->type == KEY_TYPE_stripe
392                 ? bkey_s_c_to_stripe(old).v : NULL;
393         const struct bch_stripe *new_s = new.k->type == KEY_TYPE_stripe
394                 ? bkey_s_c_to_stripe(new).v : NULL;
395
396         if (unlikely(flags & BTREE_TRIGGER_check_repair))
397                 return bch2_check_fix_ptrs(trans, btree, level, _new.s_c, flags);
398
399         BUG_ON(new_s && old_s &&
400                (new_s->nr_blocks        != old_s->nr_blocks ||
401                 new_s->nr_redundant     != old_s->nr_redundant));
402
404         if (flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc)) {
405                 /*
406                  * If the pointers aren't changing, we don't need to do anything:
407                  */
408                 if (new_s && old_s &&
409                     new_s->nr_blocks    == old_s->nr_blocks &&
410                     new_s->nr_redundant == old_s->nr_redundant &&
411                     !memcmp(old_s->ptrs, new_s->ptrs,
412                             new_s->nr_blocks * sizeof(struct bch_extent_ptr)))
413                         return 0;
414
415                 struct gc_stripe *gc = NULL;
416                 if (flags & BTREE_TRIGGER_gc) {
417                         gc = genradix_ptr_alloc(&c->gc_stripes, idx, GFP_KERNEL);
418                         if (!gc) {
419                                 bch_err(c, "error allocating memory for gc_stripes, idx %llu", idx);
420                                 return -BCH_ERR_ENOMEM_mark_stripe;
421                         }
422
423                         /*
424                          * This will be wrong when we bring back runtime gc: we should
425                          * be unmarking the old key and then marking the new key
426                          *
427                          * Also: when we bring back runtime gc, locking
428                          */
429                         gc->alive       = true;
430                         gc->sectors     = le16_to_cpu(new_s->sectors);
431                         gc->nr_blocks   = new_s->nr_blocks;
432                         gc->nr_redundant        = new_s->nr_redundant;
433
434                         for (unsigned i = 0; i < new_s->nr_blocks; i++)
435                                 gc->ptrs[i] = new_s->ptrs[i];
436
437                         /*
438                          * gc recalculates this field from stripe ptr
439                          * references:
440                          */
441                         memset(gc->block_sectors, 0, sizeof(gc->block_sectors));
442                 }
443
444                 if (new_s) {
445                         s64 sectors = (u64) le16_to_cpu(new_s->sectors) * new_s->nr_redundant;
446
447                         struct disk_accounting_pos acc = {
448                                 .type = BCH_DISK_ACCOUNTING_replicas,
449                         };
450                         bch2_bkey_to_replicas(&acc.replicas, new);
451                         int ret = bch2_disk_accounting_mod(trans, &acc, &sectors, 1, gc);
452                         if (ret)
453                                 return ret;
454
455                         if (gc)
456                                 memcpy(&gc->r.e, &acc.replicas, replicas_entry_bytes(&acc.replicas));
457                 }
458
459                 if (old_s) {
460                         s64 sectors = -((s64) le16_to_cpu(old_s->sectors)) * old_s->nr_redundant;
461
462                         struct disk_accounting_pos acc = {
463                                 .type = BCH_DISK_ACCOUNTING_replicas,
464                         };
465                         bch2_bkey_to_replicas(&acc.replicas, old);
466                         int ret = bch2_disk_accounting_mod(trans, &acc, &sectors, 1, gc);
467                         if (ret)
468                                 return ret;
469                 }
470
471                 int ret = mark_stripe_buckets(trans, old, new, flags);
472                 if (ret)
473                         return ret;
474         }
475
476         if (flags & BTREE_TRIGGER_atomic) {
477                 struct stripe *m = genradix_ptr(&c->stripes, idx);
478
479                 if (!m) {
480                         struct printbuf buf1 = PRINTBUF;
481                         struct printbuf buf2 = PRINTBUF;
482
483                         bch2_bkey_val_to_text(&buf1, c, old);
484                         bch2_bkey_val_to_text(&buf2, c, new);
485                         bch_err_ratelimited(c, "error marking nonexistent stripe %llu while marking\n"
486                                             "old %s\n"
487                                             "new %s", idx, buf1.buf, buf2.buf);
488                         printbuf_exit(&buf2);
489                         printbuf_exit(&buf1);
490                         bch2_inconsistent_error(c);
491                         return -1;
492                 }
493
494                 if (!new_s) {
495                         bch2_stripes_heap_del(c, m, idx);
496
497                         memset(m, 0, sizeof(*m));
498                 } else {
499                         stripe_to_mem(m, new_s);
500
501                         if (!old_s)
502                                 bch2_stripes_heap_insert(c, m, idx);
503                         else
504                                 bch2_stripes_heap_update(c, m, idx);
505                 }
506         }
507
508         return 0;
509 }
510
511 /* returns the matching extent ptr; the block nr within the stripe is returned via *block: */
512 static const struct bch_extent_ptr *bkey_matches_stripe(struct bch_stripe *s,
513                                                 struct bkey_s_c k, unsigned *block)
514 {
515         struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
516         unsigned i, nr_data = s->nr_blocks - s->nr_redundant;
517
518         bkey_for_each_ptr(ptrs, ptr)
519                 for (i = 0; i < nr_data; i++)
520                         if (__bch2_ptr_matches_stripe(&s->ptrs[i], ptr,
521                                                       le16_to_cpu(s->sectors))) {
522                                 *block = i;
523                                 return ptr;
524                         }
525
526         return NULL;
527 }
528
529 static bool extent_has_stripe_ptr(struct bkey_s_c k, u64 idx)
530 {
531         switch (k.k->type) {
532         case KEY_TYPE_extent: {
533                 struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
534                 const union bch_extent_entry *entry;
535
536                 extent_for_each_entry(e, entry)
537                         if (extent_entry_type(entry) ==
538                             BCH_EXTENT_ENTRY_stripe_ptr &&
539                             entry->stripe_ptr.idx == idx)
540                                 return true;
541
542                 break;
543         }
544         }
545
546         return false;
547 }
548
549 /* Stripe bufs: */
550
551 static void ec_stripe_buf_exit(struct ec_stripe_buf *buf)
552 {
553         if (buf->key.k.type == KEY_TYPE_stripe) {
554                 struct bkey_i_stripe *s = bkey_i_to_stripe(&buf->key);
555                 unsigned i;
556
557                 for (i = 0; i < s->v.nr_blocks; i++) {
558                         kvfree(buf->data[i]);
559                         buf->data[i] = NULL;
560                 }
561         }
562 }
563
564 /* XXX: this is a non-mempoolified memory allocation: */
565 static int ec_stripe_buf_init(struct ec_stripe_buf *buf,
566                               unsigned offset, unsigned size)
567 {
568         struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
569         unsigned csum_granularity = 1U << v->csum_granularity_bits;
570         unsigned end = offset + size;
571         unsigned i;
572
573         BUG_ON(end > le16_to_cpu(v->sectors));
574
575         offset  = round_down(offset, csum_granularity);
576         end     = min_t(unsigned, le16_to_cpu(v->sectors),
577                         round_up(end, csum_granularity));
578
579         buf->offset     = offset;
580         buf->size       = end - offset;
581
582         memset(buf->valid, 0xFF, sizeof(buf->valid));
583
584         for (i = 0; i < v->nr_blocks; i++) {
585                 buf->data[i] = kvmalloc(buf->size << 9, GFP_KERNEL);
586                 if (!buf->data[i])
587                         goto err;
588         }
589
590         return 0;
591 err:
592         ec_stripe_buf_exit(buf);
593         return -BCH_ERR_ENOMEM_stripe_buf;
594 }
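
/*
 * Example: with csum_granularity_bits == 3 (8-sector granules), a read
 * of offset 5, size 10 is widened to the range [0, 16), so the buffer
 * always covers whole granules and every checksum can be verified.
 */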
595
596 /* Checksumming: */
597
598 static struct bch_csum ec_block_checksum(struct ec_stripe_buf *buf,
599                                          unsigned block, unsigned offset)
600 {
601         struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
602         unsigned csum_granularity = 1 << v->csum_granularity_bits;
603         unsigned end = buf->offset + buf->size;
604         unsigned len = min(csum_granularity, end - offset);
605
606         BUG_ON(offset >= end);
607         BUG_ON(offset <  buf->offset);
608         BUG_ON(offset & (csum_granularity - 1));
609         BUG_ON(offset + len != le16_to_cpu(v->sectors) &&
610                (len & (csum_granularity - 1)));
611
612         return bch2_checksum(NULL, v->csum_type,
613                              null_nonce(),
614                              buf->data[block] + ((offset - buf->offset) << 9),
615                              len << 9);
616 }
617
618 static void ec_generate_checksums(struct ec_stripe_buf *buf)
619 {
620         struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
621         unsigned i, j, csums_per_device = stripe_csums_per_device(v);
622
623         if (!v->csum_type)
624                 return;
625
626         BUG_ON(buf->offset);
627         BUG_ON(buf->size != le16_to_cpu(v->sectors));
628
629         for (i = 0; i < v->nr_blocks; i++)
630                 for (j = 0; j < csums_per_device; j++)
631                         stripe_csum_set(v, i, j,
632                                 ec_block_checksum(buf, i, j << v->csum_granularity_bits));
633 }
634
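/*
 * Verify one checksum per granule per block; on a mismatch the whole
 * block is dropped from buf->valid so that reconstruct read rebuilds it
 * from the remaining blocks.
 */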
635 static void ec_validate_checksums(struct bch_fs *c, struct ec_stripe_buf *buf)
636 {
637         struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
638         unsigned csum_granularity = 1 << v->csum_granularity_bits;
639         unsigned i;
640
641         if (!v->csum_type)
642                 return;
643
644         for (i = 0; i < v->nr_blocks; i++) {
645                 unsigned offset = buf->offset;
646                 unsigned end = buf->offset + buf->size;
647
648                 if (!test_bit(i, buf->valid))
649                         continue;
650
651                 while (offset < end) {
652                         unsigned j = offset >> v->csum_granularity_bits;
653                         unsigned len = min(csum_granularity, end - offset);
654                         struct bch_csum want = stripe_csum_get(v, i, j);
655                         struct bch_csum got = ec_block_checksum(buf, i, offset);
656
657                         if (bch2_crc_cmp(want, got)) {
658                                 struct bch_dev *ca = bch2_dev_tryget(c, v->ptrs[i].dev);
659                                 if (ca) {
660                                         struct printbuf err = PRINTBUF;
661
662                                         prt_str(&err, "stripe ");
663                                         bch2_csum_err_msg(&err, v->csum_type, want, got);
664                                         prt_printf(&err, "  for %ps at %u of\n  ", (void *) _RET_IP_, i);
665                                         bch2_bkey_val_to_text(&err, c, bkey_i_to_s_c(&buf->key));
666                                         bch_err_ratelimited(ca, "%s", err.buf);
667                                         printbuf_exit(&err);
668
669                                         bch2_io_error(ca, BCH_MEMBER_ERROR_checksum);
670                                 }
671
672                                 clear_bit(i, buf->valid);
673                                 break;
674                         }
675
676                         offset += len;
677                 }
678         }
679 }
680
681 /* Erasure coding: */
682
683 static void ec_generate_ec(struct ec_stripe_buf *buf)
684 {
685         struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
686         unsigned nr_data = v->nr_blocks - v->nr_redundant;
687         unsigned bytes = le16_to_cpu(v->sectors) << 9;
688
689         raid_gen(nr_data, v->nr_redundant, bytes, buf->data);
690 }
691
692 static unsigned ec_nr_failed(struct ec_stripe_buf *buf)
693 {
694         struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
695
696         return v->nr_blocks - bitmap_weight(buf->valid, v->nr_blocks);
697 }
698
699 static int ec_do_recov(struct bch_fs *c, struct ec_stripe_buf *buf)
700 {
701         struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
702         unsigned i, failed[BCH_BKEY_PTRS_MAX], nr_failed = 0;
703         unsigned nr_data = v->nr_blocks - v->nr_redundant;
704         unsigned bytes = buf->size << 9;
705
706         if (ec_nr_failed(buf) > v->nr_redundant) {
707                 bch_err_ratelimited(c,
708                         "error doing reconstruct read: unable to read enough blocks");
709                 return -1;
710         }
711
712         for (i = 0; i < nr_data; i++)
713                 if (!test_bit(i, buf->valid))
714                         failed[nr_failed++] = i;
715
716         raid_rec(nr_failed, failed, nr_data, v->nr_redundant, bytes, buf->data);
717         return 0;
718 }
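
/*
 * Note: only failed _data_ blocks are handed to raid_rec() - on the
 * read path a lost p/q block doesn't need reconstructing, and fresh
 * parity is regenerated from scratch by ec_generate_ec() anyway.
 */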
719
720 /* IO: */
721
722 static void ec_block_endio(struct bio *bio)
723 {
724         struct ec_bio *ec_bio = container_of(bio, struct ec_bio, bio);
725         struct bch_stripe *v = &bkey_i_to_stripe(&ec_bio->buf->key)->v;
726         struct bch_extent_ptr *ptr = &v->ptrs[ec_bio->idx];
727         struct bch_dev *ca = ec_bio->ca;
728         struct closure *cl = bio->bi_private;
729
730         if (bch2_dev_io_err_on(bio->bi_status, ca,
731                                bio_data_dir(bio)
732                                ? BCH_MEMBER_ERROR_write
733                                : BCH_MEMBER_ERROR_read,
734                                "erasure coding %s error: %s",
735                                bio_data_dir(bio) ? "write" : "read",
736                                bch2_blk_status_to_str(bio->bi_status)))
737                 clear_bit(ec_bio->idx, ec_bio->buf->valid);
738
739         int stale = dev_ptr_stale(ca, ptr);
740         if (stale) {
741                 bch_err_ratelimited(ca->fs,
742                                     "error %s stripe: stale/invalid pointer (%i) after io",
743                                     bio_data_dir(bio) == READ ? "reading from" : "writing to",
744                                     stale);
745                 clear_bit(ec_bio->idx, ec_bio->buf->valid);
746         }
747
748         bio_put(&ec_bio->bio);
749         percpu_ref_put(&ca->io_ref);
750         closure_put(cl);
751 }
752
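/*
 * Submit reads/writes for one stripe block: the (bucket sized) buffer
 * can exceed what a single bio can carry, so it's issued as a chain of
 * ec_bios, each holding a closure ref and a device io ref that
 * ec_block_endio() drops on completion.
 */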
753 static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf,
754                         blk_opf_t opf, unsigned idx, struct closure *cl)
755 {
756         struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
757         unsigned offset = 0, bytes = buf->size << 9;
758         struct bch_extent_ptr *ptr = &v->ptrs[idx];
759         enum bch_data_type data_type = idx < v->nr_blocks - v->nr_redundant
760                 ? BCH_DATA_user
761                 : BCH_DATA_parity;
762         int rw = op_is_write(opf);
763
764         struct bch_dev *ca = bch2_dev_get_ioref(c, ptr->dev, rw);
765         if (!ca) {
766                 clear_bit(idx, buf->valid);
767                 return;
768         }
769
770         int stale = dev_ptr_stale(ca, ptr);
771         if (stale) {
772                 bch_err_ratelimited(c,
773                                     "error %s stripe: stale pointer (%i)",
774                                     rw == READ ? "reading from" : "writing to",
775                                     stale);
776                 clear_bit(idx, buf->valid);
777                 percpu_ref_put(&ca->io_ref);
778                 return;
778         }
779
781         this_cpu_add(ca->io_done->sectors[rw][data_type], buf->size);
782
783         while (offset < bytes) {
784                 unsigned nr_iovecs = min_t(size_t, BIO_MAX_VECS,
785                                            DIV_ROUND_UP(bytes, PAGE_SIZE));
786                 unsigned b = min_t(size_t, bytes - offset,
787                                    nr_iovecs << PAGE_SHIFT);
788                 struct ec_bio *ec_bio;
789
790                 ec_bio = container_of(bio_alloc_bioset(ca->disk_sb.bdev,
791                                                        nr_iovecs,
792                                                        opf,
793                                                        GFP_KERNEL,
794                                                        &c->ec_bioset),
795                                       struct ec_bio, bio);
796
797                 ec_bio->ca                      = ca;
798                 ec_bio->buf                     = buf;
799                 ec_bio->idx                     = idx;
800
801                 ec_bio->bio.bi_iter.bi_sector   = ptr->offset + buf->offset + (offset >> 9);
802                 ec_bio->bio.bi_end_io           = ec_block_endio;
803                 ec_bio->bio.bi_private          = cl;
804
805                 bch2_bio_map(&ec_bio->bio, buf->data[idx] + offset, b);
806
807                 closure_get(cl);
808                 percpu_ref_get(&ca->io_ref);
809
810                 submit_bio(&ec_bio->bio);
811
812                 offset += b;
813         }
814
815         percpu_ref_put(&ca->io_ref);
816 }
817
818 static int get_stripe_key_trans(struct btree_trans *trans, u64 idx,
819                                 struct ec_stripe_buf *stripe)
820 {
821         struct btree_iter iter;
822         struct bkey_s_c k;
823         int ret;
824
825         k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes,
826                                POS(0, idx), BTREE_ITER_slots);
827         ret = bkey_err(k);
828         if (ret)
829                 goto err;
830         if (k.k->type != KEY_TYPE_stripe) {
831                 ret = -ENOENT;
832                 goto err;
833         }
834         bkey_reassemble(&stripe->key, k);
835 err:
836         bch2_trans_iter_exit(trans, &iter);
837         return ret;
838 }
839
840 /* recovery read path: */
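/*
 * Look up the stripe the extent points into, read every stripe block
 * (over just the range we need, widened to checksum granularity),
 * verify checksums, recover the missing data with raid_rec(), then copy
 * the requested range into the original read bio.
 */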
841 int bch2_ec_read_extent(struct btree_trans *trans, struct bch_read_bio *rbio,
842                         struct bkey_s_c orig_k)
843 {
844         struct bch_fs *c = trans->c;
845         struct ec_stripe_buf *buf = NULL;
846         struct closure cl;
847         struct bch_stripe *v;
848         unsigned i, offset;
849         const char *msg = NULL;
850         struct printbuf msgbuf = PRINTBUF;
851         int ret = 0;
852
853         closure_init_stack(&cl);
854
855         BUG_ON(!rbio->pick.has_ec);
856
857         buf = kzalloc(sizeof(*buf), GFP_NOFS);
858         if (!buf)
859                 return -BCH_ERR_ENOMEM_ec_read_extent;
860
861         ret = lockrestart_do(trans, get_stripe_key_trans(trans, rbio->pick.ec.idx, buf));
862         if (ret) {
863                 msg = "stripe not found";
864                 goto err;
865         }
866
867         v = &bkey_i_to_stripe(&buf->key)->v;
868
869         if (!bch2_ptr_matches_stripe(v, rbio->pick)) {
870                 msg = "pointer doesn't match stripe";
871                 goto err;
872         }
873
874         offset = rbio->bio.bi_iter.bi_sector - v->ptrs[rbio->pick.ec.block].offset;
875         if (offset + bio_sectors(&rbio->bio) > le16_to_cpu(v->sectors)) {
876                 msg = "read is bigger than stripe";
877                 goto err;
878         }
879
880         ret = ec_stripe_buf_init(buf, offset, bio_sectors(&rbio->bio));
881         if (ret) {
882                 msg = "-ENOMEM";
883                 goto err;
884         }
885
886         for (i = 0; i < v->nr_blocks; i++)
887                 ec_block_io(c, buf, REQ_OP_READ, i, &cl);
888
889         closure_sync(&cl);
890
891         if (ec_nr_failed(buf) > v->nr_redundant) {
892                 msg = "unable to read enough blocks";
893                 goto err;
894         }
895
896         ec_validate_checksums(c, buf);
897
898         ret = ec_do_recov(c, buf);
899         if (ret)
900                 goto err;
901
902         memcpy_to_bio(&rbio->bio, rbio->bio.bi_iter,
903                       buf->data[rbio->pick.ec.block] + ((offset - buf->offset) << 9));
904 out:
905         ec_stripe_buf_exit(buf);
906         kfree(buf);
907         return ret;
908 err:
909         bch2_bkey_val_to_text(&msgbuf, c, orig_k);
910         bch_err_ratelimited(c,
911                             "error doing reconstruct read: %s\n  %s", msg, msgbuf.buf);
912         printbuf_exit(&msgbuf);
913         ret = -BCH_ERR_stripe_reconstruct;
914         goto out;
915 }
916
917 /* stripe bucket accounting: */
918
919 static int __ec_stripe_mem_alloc(struct bch_fs *c, size_t idx, gfp_t gfp)
920 {
921         ec_stripes_heap n, *h = &c->ec_stripes_heap;
922
923         if (idx >= h->size) {
924                 if (!init_heap(&n, max(1024UL, roundup_pow_of_two(idx + 1)), gfp))
925                         return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc;
926
927                 mutex_lock(&c->ec_stripes_heap_lock);
928                 if (n.size > h->size) {
929                         memcpy(n.data, h->data, h->nr * sizeof(h->data[0]));
930                         n.nr = h->nr;
931                         swap(*h, n);
932                 }
933                 mutex_unlock(&c->ec_stripes_heap_lock);
934
935                 free_heap(&n);
936         }
937
938         if (!genradix_ptr_alloc(&c->stripes, idx, gfp))
939                 return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc;
940
941         if (c->gc_pos.phase != GC_PHASE_not_running &&
942             !genradix_ptr_alloc(&c->gc_stripes, idx, gfp))
943                 return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc;
944
945         return 0;
946 }
947
948 static int ec_stripe_mem_alloc(struct btree_trans *trans,
949                                struct btree_iter *iter)
950 {
951         return allocate_dropping_locks_errcode(trans,
952                         __ec_stripe_mem_alloc(trans->c, iter->pos.offset, _gfp));
953 }
954
955 /*
956  * Hash table of open stripes:
957  * Stripes that are being created or modified are kept in a hash table, so that
958  * stripe deletion can skip them.
959  */
960
961 static bool __bch2_stripe_is_open(struct bch_fs *c, u64 idx)
962 {
963         unsigned hash = hash_64(idx, ilog2(ARRAY_SIZE(c->ec_stripes_new)));
964         struct ec_stripe_new *s;
965
966         hlist_for_each_entry(s, &c->ec_stripes_new[hash], hash)
967                 if (s->idx == idx)
968                         return true;
969         return false;
970 }
971
972 static bool bch2_stripe_is_open(struct bch_fs *c, u64 idx)
973 {
974         bool ret = false;
975
976         spin_lock(&c->ec_stripes_new_lock);
977         ret = __bch2_stripe_is_open(c, idx);
978         spin_unlock(&c->ec_stripes_new_lock);
979
980         return ret;
981 }
982
983 static bool bch2_try_open_stripe(struct bch_fs *c,
984                                  struct ec_stripe_new *s,
985                                  u64 idx)
986 {
987         bool ret;
988
989         spin_lock(&c->ec_stripes_new_lock);
990         ret = !__bch2_stripe_is_open(c, idx);
991         if (ret) {
992                 unsigned hash = hash_64(idx, ilog2(ARRAY_SIZE(c->ec_stripes_new)));
993
994                 s->idx = idx;
995                 hlist_add_head(&s->hash, &c->ec_stripes_new[hash]);
996         }
997         spin_unlock(&c->ec_stripes_new_lock);
998
999         return ret;
1000 }
1001
1002 static void bch2_stripe_close(struct bch_fs *c, struct ec_stripe_new *s)
1003 {
1004         BUG_ON(!s->idx);
1005
1006         spin_lock(&c->ec_stripes_new_lock);
1007         hlist_del_init(&s->hash);
1008         spin_unlock(&c->ec_stripes_new_lock);
1009
1010         s->idx = 0;
1011 }
1012
1013 /* Heap of all existing stripes, ordered by blocks_nonempty */
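/*
 * The heap is keyed on blocks_nonempty, so the root is always the
 * emptiest stripe; an empty stripe that isn't currently open is a
 * candidate for deletion. struct stripe's heap_idx backpointer lets
 * bch2_stripes_heap_update() sift an entry in place in O(log n).
 */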
1014
1015 static u64 stripe_idx_to_delete(struct bch_fs *c)
1016 {
1017         ec_stripes_heap *h = &c->ec_stripes_heap;
1018
1019         lockdep_assert_held(&c->ec_stripes_heap_lock);
1020
1021         if (h->nr &&
1022             h->data[0].blocks_nonempty == 0 &&
1023             !bch2_stripe_is_open(c, h->data[0].idx))
1024                 return h->data[0].idx;
1025
1026         return 0;
1027 }
1028
1029 static inline void ec_stripes_heap_set_backpointer(ec_stripes_heap *h,
1030                                                    size_t i)
1031 {
1032         struct bch_fs *c = container_of(h, struct bch_fs, ec_stripes_heap);
1033
1034         genradix_ptr(&c->stripes, h->data[i].idx)->heap_idx = i;
1035 }
1036
1037 static inline bool ec_stripes_heap_cmp(const void *l, const void *r, void __always_unused *args)
1038 {
1039         struct ec_stripe_heap_entry *_l = (struct ec_stripe_heap_entry *)l;
1040         struct ec_stripe_heap_entry *_r = (struct ec_stripe_heap_entry *)r;
1041
1042         return ((_l->blocks_nonempty > _r->blocks_nonempty) <
1043                 (_l->blocks_nonempty < _r->blocks_nonempty));
1044 }
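
/*
 * (a > b) < (a < b) evaluates to 1 iff a < b and 0 otherwise, i.e. a
 * plain "less than" written without a subtraction that could wrap.
 */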
1045
1046 static inline void ec_stripes_heap_swap(void *l, void *r, void *h)
1047 {
1048         struct ec_stripe_heap_entry *_l = (struct ec_stripe_heap_entry *)l;
1049         struct ec_stripe_heap_entry *_r = (struct ec_stripe_heap_entry *)r;
1050         ec_stripes_heap *_h = (ec_stripes_heap *)h;
1051         size_t i = _l - _h->data;
1052         size_t j = _r - _h->data;
1053
1054         swap(*_l, *_r);
1055
1056         ec_stripes_heap_set_backpointer(_h, i);
1057         ec_stripes_heap_set_backpointer(_h, j);
1058 }
1059
1060 static const struct min_heap_callbacks callbacks = {
1061         .less = ec_stripes_heap_cmp,
1062         .swp = ec_stripes_heap_swap,
1063 };
1064
1065 static void heap_verify_backpointer(struct bch_fs *c, size_t idx)
1066 {
1067         ec_stripes_heap *h = &c->ec_stripes_heap;
1068         struct stripe *m = genradix_ptr(&c->stripes, idx);
1069
1070         BUG_ON(m->heap_idx >= h->nr);
1071         BUG_ON(h->data[m->heap_idx].idx != idx);
1072 }
1073
1074 void bch2_stripes_heap_del(struct bch_fs *c,
1075                            struct stripe *m, size_t idx)
1076 {
1077         mutex_lock(&c->ec_stripes_heap_lock);
1078         heap_verify_backpointer(c, idx);
1079
1080         min_heap_del(&c->ec_stripes_heap, m->heap_idx, &callbacks, &c->ec_stripes_heap);
1081         mutex_unlock(&c->ec_stripes_heap_lock);
1082 }
1083
1084 void bch2_stripes_heap_insert(struct bch_fs *c,
1085                               struct stripe *m, size_t idx)
1086 {
1087         mutex_lock(&c->ec_stripes_heap_lock);
1088         BUG_ON(min_heap_full(&c->ec_stripes_heap));
1089
1090         genradix_ptr(&c->stripes, idx)->heap_idx = c->ec_stripes_heap.nr;
1091         min_heap_push(&c->ec_stripes_heap, &((struct ec_stripe_heap_entry) {
1092                         .idx = idx,
1093                         .blocks_nonempty = m->blocks_nonempty,
1094                 }),
1095                 &callbacks,
1096                 &c->ec_stripes_heap);
1097
1098         heap_verify_backpointer(c, idx);
1099         mutex_unlock(&c->ec_stripes_heap_lock);
1100 }
1101
1102 void bch2_stripes_heap_update(struct bch_fs *c,
1103                               struct stripe *m, size_t idx)
1104 {
1105         ec_stripes_heap *h = &c->ec_stripes_heap;
1106         bool do_deletes;
1107         size_t i;
1108
1109         mutex_lock(&c->ec_stripes_heap_lock);
1110         heap_verify_backpointer(c, idx);
1111
1112         h->data[m->heap_idx].blocks_nonempty = m->blocks_nonempty;
1113
1114         i = m->heap_idx;
1115         min_heap_sift_up(h, i, &callbacks, &c->ec_stripes_heap);
1116         min_heap_sift_down(h, i, &callbacks, &c->ec_stripes_heap);
1117
1118         heap_verify_backpointer(c, idx);
1119
1120         do_deletes = stripe_idx_to_delete(c) != 0;
1121         mutex_unlock(&c->ec_stripes_heap_lock);
1122
1123         if (do_deletes)
1124                 bch2_do_stripe_deletes(c);
1125 }
1126
1127 /* stripe deletion */
1128
1129 static int ec_stripe_delete(struct btree_trans *trans, u64 idx)
1130 {
1131         struct bch_fs *c = trans->c;
1132         struct btree_iter iter;
1133         struct bkey_s_c k;
1134         struct bkey_s_c_stripe s;
1135         int ret;
1136
1137         k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes, POS(0, idx),
1138                                BTREE_ITER_intent);
1139         ret = bkey_err(k);
1140         if (ret)
1141                 goto err;
1142
1143         if (k.k->type != KEY_TYPE_stripe) {
1144                 bch2_fs_inconsistent(c, "attempting to delete nonexistent stripe %llu", idx);
1145                 ret = -EINVAL;
1146                 goto err;
1147         }
1148
1149         s = bkey_s_c_to_stripe(k);
1150         for (unsigned i = 0; i < s.v->nr_blocks; i++)
1151                 if (stripe_blockcount_get(s.v, i)) {
1152                         struct printbuf buf = PRINTBUF;
1153
1154                         bch2_bkey_val_to_text(&buf, c, k);
1155                         bch2_fs_inconsistent(c, "attempting to delete nonempty stripe %s", buf.buf);
1156                         printbuf_exit(&buf);
1157                         ret = -EINVAL;
1158                         goto err;
1159                 }
1160
1161         ret = bch2_btree_delete_at(trans, &iter, 0);
1162 err:
1163         bch2_trans_iter_exit(trans, &iter);
1164         return ret;
1165 }
1166
1167 static void ec_stripe_delete_work(struct work_struct *work)
1168 {
1169         struct bch_fs *c =
1170                 container_of(work, struct bch_fs, ec_stripe_delete_work);
1171
1172         while (1) {
1173                 mutex_lock(&c->ec_stripes_heap_lock);
1174                 u64 idx = stripe_idx_to_delete(c);
1175                 mutex_unlock(&c->ec_stripes_heap_lock);
1176
1177                 if (!idx)
1178                         break;
1179
1180                 int ret = bch2_trans_commit_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
1181                                         ec_stripe_delete(trans, idx));
1182                 bch_err_fn(c, ret);
1183                 if (ret)
1184                         break;
1185         }
1186
1187         bch2_write_ref_put(c, BCH_WRITE_REF_stripe_delete);
1188 }
1189
1190 void bch2_do_stripe_deletes(struct bch_fs *c)
1191 {
1192         if (bch2_write_ref_tryget(c, BCH_WRITE_REF_stripe_delete) &&
1193             !queue_work(c->write_ref_wq, &c->ec_stripe_delete_work))
1194                 bch2_write_ref_put(c, BCH_WRITE_REF_stripe_delete);
1195 }
1196
1197 /* stripe creation: */
1198
1199 static int ec_stripe_key_update(struct btree_trans *trans,
1200                                 struct bkey_i_stripe *old,
1201                                 struct bkey_i_stripe *new)
1202 {
1203         struct bch_fs *c = trans->c;
1204         bool create = !old;
1205
1206         struct btree_iter iter;
1207         struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes,
1208                                                new->k.p, BTREE_ITER_intent);
1209         int ret = bkey_err(k);
1210         if (ret)
1211                 goto err;
1212
1213         if (bch2_fs_inconsistent_on(k.k->type != (create ? KEY_TYPE_deleted : KEY_TYPE_stripe),
1214                                     c, "error %s stripe: got existing key type %s",
1215                                     create ? "creating" : "updating",
1216                                     bch2_bkey_types[k.k->type])) {
1217                 ret = -EINVAL;
1218                 goto err;
1219         }
1220
1221         if (k.k->type == KEY_TYPE_stripe) {
1222                 const struct bch_stripe *v = bkey_s_c_to_stripe(k).v;
1223
1224                 BUG_ON(old->v.nr_blocks != new->v.nr_blocks);
1225                 BUG_ON(old->v.nr_blocks != v->nr_blocks);
1226
1227                 for (unsigned i = 0; i < new->v.nr_blocks; i++) {
1228                         unsigned sectors = stripe_blockcount_get(v, i);
1229
1230                         if (!bch2_extent_ptr_eq(old->v.ptrs[i], new->v.ptrs[i]) && sectors) {
1231                                 struct printbuf buf = PRINTBUF;
1232
1233                                 prt_printf(&buf, "stripe changed nonempty block %u", i);
1234                                 prt_str(&buf, "\nold: ");
1235                                 bch2_bkey_val_to_text(&buf, c, k);
1236                                 prt_str(&buf, "\nnew: ");
1237                                 bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&new->k_i));
1238                                 bch2_fs_inconsistent(c, "%s", buf.buf);
1239                                 printbuf_exit(&buf);
1240                                 ret = -EINVAL;
1241                                 goto err;
1242                         }
1243
1244                         /*
1245                          * If the stripe ptr changed underneath us, it must have
1246                          * been dev_remove_stripes() -> invalidate_stripe_to_dev()
1247                          */
1248                         if (!bch2_extent_ptr_eq(old->v.ptrs[i], v->ptrs[i])) {
1249                                 BUG_ON(v->ptrs[i].dev != BCH_SB_MEMBER_INVALID);
1250
1251                                 if (bch2_extent_ptr_eq(old->v.ptrs[i], new->v.ptrs[i]))
1252                                         new->v.ptrs[i].dev = BCH_SB_MEMBER_INVALID;
1253                         }
1254
1255                         stripe_blockcount_set(&new->v, i, sectors);
1256                 }
1257         }
1258
1259         ret = bch2_trans_update(trans, &iter, &new->k_i, 0);
1260 err:
1261         bch2_trans_iter_exit(trans, &iter);
1262         return ret;
1263 }
1264
1265 static int ec_stripe_update_extent(struct btree_trans *trans,
1266                                    struct bch_dev *ca,
1267                                    struct bpos bucket, u8 gen,
1268                                    struct ec_stripe_buf *s,
1269                                    struct bpos *bp_pos)
1270 {
1271         struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v;
1272         struct bch_fs *c = trans->c;
1273         struct bch_backpointer bp;
1274         struct btree_iter iter;
1275         struct bkey_s_c k;
1276         const struct bch_extent_ptr *ptr_c;
1277         struct bch_extent_ptr *ec_ptr = NULL;
1278         struct bch_extent_stripe_ptr stripe_ptr;
1279         struct bkey_i *n;
1280         unsigned block;
1281         int ret, dev;
1281
1282         ret = bch2_get_next_backpointer(trans, ca, bucket, gen,
1283                                 bp_pos, &bp, BTREE_ITER_cached);
1284         if (ret)
1285                 return ret;
1286         if (bpos_eq(*bp_pos, SPOS_MAX))
1287                 return 0;
1288
1289         if (bp.level) {
1290                 struct printbuf buf = PRINTBUF;
1291                 struct btree_iter node_iter;
1292                 struct btree *b;
1293
1294                 b = bch2_backpointer_get_node(trans, &node_iter, *bp_pos, bp);
1295                 bch2_trans_iter_exit(trans, &node_iter);
1296
1297                 if (!b)
1298                         return 0;
1299
1300                 prt_printf(&buf, "found btree node in erasure coded bucket: b=%px\n", b);
1301                 bch2_backpointer_to_text(&buf, &bp);
1302
1303                 bch2_fs_inconsistent(c, "%s", buf.buf);
1304                 printbuf_exit(&buf);
1305                 return -EIO;
1306         }
1307
1308         k = bch2_backpointer_get_key(trans, &iter, *bp_pos, bp, BTREE_ITER_intent);
1309         ret = bkey_err(k);
1310         if (ret)
1311                 return ret;
1312         if (!k.k) {
1313                 /*
1314                  * extent no longer exists - we could flush the btree
1315                  * write buffer and retry to verify, but no need:
1316                  */
1317                 return 0;
1318         }
1319
1320         if (extent_has_stripe_ptr(k, s->key.k.p.offset))
1321                 goto out;
1322
1323         ptr_c = bkey_matches_stripe(v, k, &block);
1324         /*
1325          * It doesn't generally make sense to erasure code cached ptrs:
1326          * XXX: should we be incrementing a counter?
1327          */
1328         if (!ptr_c || ptr_c->cached)
1329                 goto out;
1330
1331         dev = v->ptrs[block].dev;
1332
1333         n = bch2_trans_kmalloc(trans, bkey_bytes(k.k) + sizeof(stripe_ptr));
1334         ret = PTR_ERR_OR_ZERO(n);
1335         if (ret)
1336                 goto out;
1337
1338         bkey_reassemble(n, k);
1339
1340         bch2_bkey_drop_ptrs_noerror(bkey_i_to_s(n), ptr, ptr->dev != dev);
1341         ec_ptr = bch2_bkey_has_device(bkey_i_to_s(n), dev);
1342         BUG_ON(!ec_ptr);
1343
1344         stripe_ptr = (struct bch_extent_stripe_ptr) {
1345                 .type = 1 << BCH_EXTENT_ENTRY_stripe_ptr,
1346                 .block          = block,
1347                 .redundancy     = v->nr_redundant,
1348                 .idx            = s->key.k.p.offset,
1349         };
1350
1351         __extent_entry_insert(n,
1352                         (union bch_extent_entry *) ec_ptr,
1353                         (union bch_extent_entry *) &stripe_ptr);
1354
1355         ret = bch2_trans_update(trans, &iter, n, 0);
1356 out:
1357         bch2_trans_iter_exit(trans, &iter);
1358         return ret;
1359 }
1360
1361 static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_buf *s,
1362                                    unsigned block)
1363 {
1364         struct bch_fs *c = trans->c;
1365         struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v;
1366         struct bch_extent_ptr ptr = v->ptrs[block];
1367         struct bpos bp_pos = POS_MIN;
1368         int ret = 0;
1369
1370         struct bch_dev *ca = bch2_dev_tryget(c, ptr.dev);
1371         if (!ca)
1372                 return -EIO;
1373
1374         struct bpos bucket_pos = PTR_BUCKET_POS(ca, &ptr);
1375
1376         while (1) {
1377                 ret = commit_do(trans, NULL, NULL,
1378                                 BCH_TRANS_COMMIT_no_check_rw|
1379                                 BCH_TRANS_COMMIT_no_enospc,
1380                         ec_stripe_update_extent(trans, ca, bucket_pos, ptr.gen, s, &bp_pos));
1381                 if (ret)
1382                         break;
1383                 if (bkey_eq(bp_pos, POS_MAX))
1384                         break;
1385
1386                 bp_pos = bpos_nosnap_successor(bp_pos);
1387         }
1388
1389         bch2_dev_put(ca);
1390         return ret;
1391 }
1392
1393 static int ec_stripe_update_extents(struct bch_fs *c, struct ec_stripe_buf *s)
1394 {
1395         struct btree_trans *trans = bch2_trans_get(c);
1396         struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v;
1397         unsigned i, nr_data = v->nr_blocks - v->nr_redundant;
1398         int ret = 0;
1399
1400         ret = bch2_btree_write_buffer_flush_sync(trans);
1401         if (ret)
1402                 goto err;
1403
1404         for (i = 0; i < nr_data; i++) {
1405                 ret = ec_stripe_update_bucket(trans, s, i);
1406                 if (ret)
1407                         break;
1408         }
1409 err:
1410         bch2_trans_put(trans);
1411
1412         return ret;
1413 }
1414
1415 static void zero_out_rest_of_ec_bucket(struct bch_fs *c,
1416                                        struct ec_stripe_new *s,
1417                                        unsigned block,
1418                                        struct open_bucket *ob)
1419 {
1420         struct bch_dev *ca = bch2_dev_get_ioref(c, ob->dev, WRITE);
1421         if (!ca) {
1422                 s->err = -BCH_ERR_erofs_no_writes;
1423                 return;
1424         }
1425
1426         unsigned offset = ca->mi.bucket_size - ob->sectors_free;
1427         memset(s->new_stripe.data[block] + (offset << 9),
1428                0,
1429                ob->sectors_free << 9);
1430
1431         int ret = blkdev_issue_zeroout(ca->disk_sb.bdev,
1432                         ob->bucket * ca->mi.bucket_size + offset,
1433                         ob->sectors_free,
1434                         GFP_KERNEL, 0);
1435
1436         percpu_ref_put(&ca->io_ref);
1437
1438         if (ret)
1439                 s->err = ret;
1440 }
1441
1442 void bch2_ec_stripe_new_free(struct bch_fs *c, struct ec_stripe_new *s)
1443 {
1444         if (s->idx)
1445                 bch2_stripe_close(c, s);
1446         kfree(s);
1447 }
1448
1449 /*
1450  * data buckets of new stripe all written: create the stripe
1451  */
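/*
 * This runs from the stripe create workqueue, after all writes to the
 * data buckets have completed: zero out their unwritten tails, recover
 * any blocks being reused from an existing stripe, generate parity and
 * checksums, write out the p/q buckets, insert the stripe key, and
 * finally point existing extents at the stripe.
 */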
1452 static void ec_stripe_create(struct ec_stripe_new *s)
1453 {
1454         struct bch_fs *c = s->c;
1455         struct open_bucket *ob;
1456         struct bch_stripe *v = &bkey_i_to_stripe(&s->new_stripe.key)->v;
1457         unsigned i, nr_data = v->nr_blocks - v->nr_redundant;
1458         int ret;
1459
1460         BUG_ON(s->h->s == s);
1461
1462         closure_sync(&s->iodone);
1463
1464         if (!s->err) {
1465                 for (i = 0; i < nr_data; i++)
1466                         if (s->blocks[i]) {
1467                                 ob = c->open_buckets + s->blocks[i];
1468
1469                                 if (ob->sectors_free)
1470                                         zero_out_rest_of_ec_bucket(c, s, i, ob);
1471                         }
1472         }
1473
1474         if (s->err) {
1475                 if (!bch2_err_matches(s->err, EROFS))
1476                         bch_err(c, "error creating stripe: error writing data buckets");
1477                 goto err;
1478         }
1479
1480         if (s->have_existing_stripe) {
1481                 ec_validate_checksums(c, &s->existing_stripe);
1482
1483                 if (ec_do_recov(c, &s->existing_stripe)) {
1484                         bch_err(c, "error creating stripe: error reading existing stripe");
1485                         goto err;
1486                 }
1487
1488                 for (i = 0; i < nr_data; i++)
1489                         if (stripe_blockcount_get(&bkey_i_to_stripe(&s->existing_stripe.key)->v, i))
1490                                 swap(s->new_stripe.data[i],
1491                                      s->existing_stripe.data[i]);
1492
1493                 ec_stripe_buf_exit(&s->existing_stripe);
1494         }
1495
1496         BUG_ON(!s->allocated);
1497         BUG_ON(!s->idx);
1498
1499         ec_generate_ec(&s->new_stripe);
1500
1501         ec_generate_checksums(&s->new_stripe);
1502
1503         /* write p/q: */
1504         for (i = nr_data; i < v->nr_blocks; i++)
1505                 ec_block_io(c, &s->new_stripe, REQ_OP_WRITE, i, &s->iodone);
1506         closure_sync(&s->iodone);
1507
1508         if (ec_nr_failed(&s->new_stripe)) {
1509                 bch_err(c, "error creating stripe: error writing redundancy buckets");
1510                 goto err;
1511         }
1512
1513         ret = bch2_trans_commit_do(c, &s->res, NULL,
1514                 BCH_TRANS_COMMIT_no_check_rw|
1515                 BCH_TRANS_COMMIT_no_enospc,
1516                 ec_stripe_key_update(trans,
1517                                      s->have_existing_stripe
1518                                      ? bkey_i_to_stripe(&s->existing_stripe.key)
1519                                      : NULL,
1520                                      bkey_i_to_stripe(&s->new_stripe.key)));
1521         bch_err_msg(c, ret, "creating stripe key");
1522         if (ret)
1523                 goto err;
1525
1526         ret = ec_stripe_update_extents(c, &s->new_stripe);
1527         bch_err_msg(c, ret, "updating extents");
1528         if (ret)
1529                 goto err;
1530 err:
1531         bch2_disk_reservation_put(c, &s->res);
1532
1533         for (i = 0; i < v->nr_blocks; i++)
1534                 if (s->blocks[i]) {
1535                         ob = c->open_buckets + s->blocks[i];
1536
1537                         if (i < nr_data) {
1538                                 ob->ec = NULL;
1539                                 __bch2_open_bucket_put(c, ob);
1540                         } else {
1541                                 bch2_open_bucket_put(c, ob);
1542                         }
1543                 }
1544
1545         mutex_lock(&c->ec_stripe_new_lock);
1546         list_del(&s->list);
1547         mutex_unlock(&c->ec_stripe_new_lock);
1548         wake_up(&c->ec_stripe_new_wait);
1549
1550         ec_stripe_buf_exit(&s->existing_stripe);
1551         ec_stripe_buf_exit(&s->new_stripe);
1552         closure_debug_destroy(&s->iodone);
1553
1554         ec_stripe_new_put(c, s, STRIPE_REF_stripe);
1555 }
1556
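/*
 * Find a stripe on the new-stripe list whose IO has completed (io ref
 * dropped to zero) and is thus ready for ec_stripe_create():
 */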
1557 static struct ec_stripe_new *get_pending_stripe(struct bch_fs *c)
1558 {
1559         struct ec_stripe_new *s;
1560
1561         mutex_lock(&c->ec_stripe_new_lock);
1562         list_for_each_entry(s, &c->ec_stripe_new_list, list)
1563                 if (!atomic_read(&s->ref[STRIPE_REF_io]))
1564                         goto out;
1565         s = NULL;
1566 out:
1567         mutex_unlock(&c->ec_stripe_new_lock);
1568
1569         return s;
1570 }
1571
1572 static void ec_stripe_create_work(struct work_struct *work)
1573 {
1574         struct bch_fs *c = container_of(work,
1575                 struct bch_fs, ec_stripe_create_work);
1576         struct ec_stripe_new *s;
1577
1578         while ((s = get_pending_stripe(c)))
1579                 ec_stripe_create(s);
1580
1581         bch2_write_ref_put(c, BCH_WRITE_REF_stripe_create);
1582 }
1583
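/*
 * Kick off stripe creation from a workqueue: the write ref taken here
 * is dropped by the work item, or immediately if the work was already
 * queued.
 */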
1584 void bch2_ec_do_stripe_creates(struct bch_fs *c)
1585 {
1586         bch2_write_ref_get(c, BCH_WRITE_REF_stripe_create);
1587
1588         if (!queue_work(system_long_wq, &c->ec_stripe_create_work))
1589                 bch2_write_ref_put(c, BCH_WRITE_REF_stripe_create);
1590 }
1591
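/*
 * Detach the stripe from its head and move it to the pending list;
 * dropping the io ref here lets stripe creation proceed once all
 * in-flight writes to the stripe's buckets have completed.
 */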
1592 static void ec_stripe_new_set_pending(struct bch_fs *c, struct ec_stripe_head *h)
1593 {
1594         struct ec_stripe_new *s = h->s;
1595
1596         lockdep_assert_held(&h->lock);
1597
1598         BUG_ON(!s->allocated && !s->err);
1599
1600         h->s            = NULL;
1601         s->pending      = true;
1602
1603         mutex_lock(&c->ec_stripe_new_lock);
1604         list_add(&s->list, &c->ec_stripe_new_list);
1605         mutex_unlock(&c->ec_stripe_new_lock);
1606
1607         ec_stripe_new_put(c, s, STRIPE_REF_io);
1608 }
1609
1610 static void ec_stripe_new_cancel(struct bch_fs *c, struct ec_stripe_head *h, int err)
1611 {
1612         h->s->err = err;
1613         ec_stripe_new_set_pending(c, h);
1614 }
1615
1616 void bch2_ec_bucket_cancel(struct bch_fs *c, struct open_bucket *ob)
1617 {
1618         struct ec_stripe_new *s = ob->ec;
1619
1620         s->err = -EIO;
1621 }
1622
1623 void *bch2_writepoint_ec_buf(struct bch_fs *c, struct write_point *wp)
1624 {
1625         struct open_bucket *ob = ec_open_bucket(c, &wp->ptrs);
1626         if (!ob)
1627                 return NULL;
1628
1629         BUG_ON(!ob->ec->new_stripe.data[ob->ec_idx]);
1630
1631         struct bch_dev *ca      = ob_dev(c, ob);
1632         unsigned offset         = ca->mi.bucket_size - ob->sectors_free;
1633
1634         return ob->ec->new_stripe.data[ob->ec_idx] + (offset << 9);
1635 }
1636
1637 static int unsigned_cmp(const void *_l, const void *_r)
1638 {
1639         unsigned l = *((const unsigned *) _l);
1640         unsigned r = *((const unsigned *) _r);
1641
1642         return cmp_int(l, r);
1643 }
1644
1645 /* pick most common bucket size: */
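/*
 * e.g. member devices with bucket sizes { 1024, 1024, 2048 } sort to
 * [1024, 1024, 2048], and the longest run - 1024, shared by two
 * devices - wins:
 */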
1646 static unsigned pick_blocksize(struct bch_fs *c,
1647                                struct bch_devs_mask *devs)
1648 {
1649         unsigned nr = 0, sizes[BCH_SB_MEMBERS_MAX];
1650         struct {
1651                 unsigned nr, size;
1652         } cur = { 0, 0 }, best = { 0, 0 };
1653
1654         for_each_member_device_rcu(c, ca, devs)
1655                 sizes[nr++] = ca->mi.bucket_size;
1656
1657         sort(sizes, nr, sizeof(unsigned), unsigned_cmp, NULL);
1658
1659         for (unsigned i = 0; i < nr; i++) {
1660                 if (sizes[i] != cur.size) {
1661                         if (cur.nr > best.nr)
1662                                 best = cur;
1663
1664                         cur.nr = 0;
1665                         cur.size = sizes[i];
1666                 }
1667
1668                 cur.nr++;
1669         }
1670
1671         if (cur.nr > best.nr)
1672                 best = cur;
1673
1674         return best.size;
1675 }
1676
1677 static bool may_create_new_stripe(struct bch_fs *c)
1678 {
1679         return false;
1680 }
1681
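/*
 * Initialize a new stripe key: checksum granularity starts at
 * encoded_extent_max and is doubled until the value, which carries one
 * checksum per granule per block, fits in the maximum bkey value size.
 */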
1682 static void ec_stripe_key_init(struct bch_fs *c,
1683                                struct bkey_i *k,
1684                                unsigned nr_data,
1685                                unsigned nr_parity,
1686                                unsigned stripe_size,
1687                                unsigned disk_label)
1688 {
1689         struct bkey_i_stripe *s = bkey_stripe_init(k);
1690         unsigned u64s;
1691
1692         s->v.sectors                    = cpu_to_le16(stripe_size);
1693         s->v.algorithm                  = 0;
1694         s->v.nr_blocks                  = nr_data + nr_parity;
1695         s->v.nr_redundant               = nr_parity;
1696         s->v.csum_granularity_bits      = ilog2(c->opts.encoded_extent_max >> 9);
1697         s->v.csum_type                  = BCH_CSUM_crc32c;
1698         s->v.disk_label                 = disk_label;
1699
1700         while ((u64s = stripe_val_u64s(&s->v)) > BKEY_VAL_U64s_MAX) {
1701                 BUG_ON(1 << s->v.csum_granularity_bits >=
1702                        le16_to_cpu(s->v.sectors) ||
1703                        s->v.csum_granularity_bits == U8_MAX);
1704                 s->v.csum_granularity_bits++;
1705         }
1706
1707         set_bkey_val_u64s(&s->k, u64s);
1708 }
1709
1710 static int ec_new_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h)
1711 {
1712         struct ec_stripe_new *s;
1713
1714         lockdep_assert_held(&h->lock);
1715
1716         s = kzalloc(sizeof(*s), GFP_KERNEL);
1717         if (!s)
1718                 return -BCH_ERR_ENOMEM_ec_new_stripe_alloc;
1719
1720         mutex_init(&s->lock);
1721         closure_init(&s->iodone, NULL);
1722         atomic_set(&s->ref[STRIPE_REF_stripe], 1);
1723         atomic_set(&s->ref[STRIPE_REF_io], 1);
1724         s->c            = c;
1725         s->h            = h;
1726         s->nr_data      = min_t(unsigned, h->nr_active_devs,
1727                                 BCH_BKEY_PTRS_MAX) - h->redundancy;
1728         s->nr_parity    = h->redundancy;
1729
1730         ec_stripe_key_init(c, &s->new_stripe.key,
1731                            s->nr_data, s->nr_parity,
1732                            h->blocksize, h->disk_label);
1733
1734         h->s = s;
1735         h->nr_created++;
1736         return 0;
1737 }
1738
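/*
 * Recompute the set of devices this stripe head allocates from: start
 * from the rw devices in the head's target, drop devices with
 * durability 0, then restrict to devices with the most common bucket
 * size. If usable devices went away and the current stripe hasn't been
 * allocated yet, cancel it.
 */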
1739 static void ec_stripe_head_devs_update(struct bch_fs *c, struct ec_stripe_head *h)
1740 {
1741         struct bch_devs_mask devs = h->devs;
1742
1743         rcu_read_lock();
1744         h->devs = target_rw_devs(c, BCH_DATA_user, h->disk_label
1745                                  ? group_to_target(h->disk_label - 1)
1746                                  : 0);
1747         unsigned nr_devs = dev_mask_nr(&h->devs);
1748
1749         for_each_member_device_rcu(c, ca, &h->devs)
1750                 if (!ca->mi.durability)
1751                         __clear_bit(ca->dev_idx, h->devs.d);
1752         unsigned nr_devs_with_durability = dev_mask_nr(&h->devs);
1753
1754         h->blocksize = pick_blocksize(c, &h->devs);
1755
1756         h->nr_active_devs = 0;
1757         for_each_member_device_rcu(c, ca, &h->devs)
1758                 if (ca->mi.bucket_size == h->blocksize)
1759                         h->nr_active_devs++;
1760
1761         rcu_read_unlock();
1762
1763         /*
1764          * If we only have redundancy + 1 devices, we're better off with just
1765          * replication:
1766          */
1767         h->insufficient_devs = h->nr_active_devs < h->redundancy + 2;
1768
1769         if (h->insufficient_devs) {
1770                 const char *err;
1771
1772                 if (nr_devs < h->redundancy + 2)
1773                         err = NULL;
1774                 else if (nr_devs_with_durability < h->redundancy + 2)
1775                         err = "cannot use durability=0 devices";
1776                 else
1777                         err = "mismatched bucket sizes";
1778
1779                 if (err)
1780                         bch_err(c, "insufficient devices available to create stripe (have %u, need %u): %s",
1781                                 h->nr_active_devs, h->redundancy + 2, err);
1782         }
1783
1784         struct bch_devs_mask devs_leaving;
1785         bitmap_andnot(devs_leaving.d, devs.d, h->devs.d, BCH_SB_MEMBERS_MAX);
1786
1787         if (h->s && !h->s->allocated && dev_mask_nr(&devs_leaving))
1788                 ec_stripe_new_cancel(c, h, -EINTR);
1789
1790         h->rw_devs_change_count = c->rw_devs_change_count;
1791 }
1792
1793 static struct ec_stripe_head *
1794 ec_new_stripe_head_alloc(struct bch_fs *c, unsigned disk_label,
1795                          unsigned algo, unsigned redundancy,
1796                          enum bch_watermark watermark)
1797 {
1798         struct ec_stripe_head *h;
1799
1800         h = kzalloc(sizeof(*h), GFP_KERNEL);
1801         if (!h)
1802                 return NULL;
1803
1804         mutex_init(&h->lock);
1805         BUG_ON(!mutex_trylock(&h->lock));
1806
1807         h->disk_label   = disk_label;
1808         h->algo         = algo;
1809         h->redundancy   = redundancy;
1810         h->watermark    = watermark;
1811
1812         list_add(&h->list, &c->ec_stripe_head_list);
1813         return h;
1814 }
1815
1816 void bch2_ec_stripe_head_put(struct bch_fs *c, struct ec_stripe_head *h)
1817 {
1818         if (h->s &&
1819             h->s->allocated &&
1820             bitmap_weight(h->s->blocks_allocated,
1821                           h->s->nr_data) == h->s->nr_data)
1822                 ec_stripe_new_set_pending(c, h);
1823
1824         mutex_unlock(&h->lock);
1825 }
1826
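/*
 * Find or create the stripe head matching this disk label, algorithm,
 * redundancy and watermark, returned with its lock held; returns NULL
 * if we don't have enough suitable devices for erasure coding.
 */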
1827 static struct ec_stripe_head *
1828 __bch2_ec_stripe_head_get(struct btree_trans *trans,
1829                           unsigned disk_label,
1830                           unsigned algo,
1831                           unsigned redundancy,
1832                           enum bch_watermark watermark)
1833 {
1834         struct bch_fs *c = trans->c;
1835         struct ec_stripe_head *h;
1836         int ret;
1837
1838         if (!redundancy)
1839                 return NULL;
1840
1841         ret = bch2_trans_mutex_lock(trans, &c->ec_stripe_head_lock);
1842         if (ret)
1843                 return ERR_PTR(ret);
1844
1845         if (test_bit(BCH_FS_going_ro, &c->flags)) {
1846                 h = ERR_PTR(-BCH_ERR_erofs_no_writes);
1847                 goto err;
1848         }
1849
1850         list_for_each_entry(h, &c->ec_stripe_head_list, list)
1851                 if (h->disk_label       == disk_label &&
1852                     h->algo             == algo &&
1853                     h->redundancy       == redundancy &&
1854                     h->watermark        == watermark) {
1855                         ret = bch2_trans_mutex_lock(trans, &h->lock);
1856                         if (ret) {
1857                                 h = ERR_PTR(ret);
1858                                 goto err;
1859                         }
1860                         goto found;
1861                 }
1862
1863         h = ec_new_stripe_head_alloc(c, disk_label, algo, redundancy, watermark);
1864         if (!h) {
1865                 h = ERR_PTR(-BCH_ERR_ENOMEM_stripe_head_alloc);
1866                 goto err;
1867         }
1868 found:
1869         if (h->rw_devs_change_count != c->rw_devs_change_count)
1870                 ec_stripe_head_devs_update(c, h);
1871
1872         if (h->insufficient_devs) {
1873                 mutex_unlock(&h->lock);
1874                 h = NULL;
1875         }
1876 err:
1877         mutex_unlock(&c->ec_stripe_head_lock);
1878         return h;
1879 }
1880
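/*
 * Allocate the buckets the stripe under construction is still missing,
 * parity first and then data, skipping devices that already back one
 * of the stripe's blocks:
 */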
1881 static int new_stripe_alloc_buckets(struct btree_trans *trans, struct ec_stripe_head *h,
1882                                     enum bch_watermark watermark, struct closure *cl)
1883 {
1884         struct bch_fs *c = trans->c;
1885         struct bch_devs_mask devs = h->devs;
1886         struct open_bucket *ob;
1887         struct open_buckets buckets;
1888         struct bch_stripe *v = &bkey_i_to_stripe(&h->s->new_stripe.key)->v;
1889         unsigned i, j, nr_have_parity = 0, nr_have_data = 0;
1890         bool have_cache = true;
1891         int ret = 0;
1892
1893         BUG_ON(v->nr_blocks     != h->s->nr_data + h->s->nr_parity);
1894         BUG_ON(v->nr_redundant  != h->s->nr_parity);
1895
1896         /* We bypass the sector allocator which normally does this: */
1897         bitmap_and(devs.d, devs.d, c->rw_devs[BCH_DATA_user].d, BCH_SB_MEMBERS_MAX);
1898
1899         for_each_set_bit(i, h->s->blocks_gotten, v->nr_blocks) {
1900                 /*
1901                  * Note: we don't yet repair invalid blocks (failed/removed
1902                  * devices) when reusing stripes - we still need a codepath to
1903                  * walk backpointers and update all extents that point to that
1904                  * block when updating the stripe
1905                  */
1906                 if (v->ptrs[i].dev != BCH_SB_MEMBER_INVALID)
1907                         __clear_bit(v->ptrs[i].dev, devs.d);
1908
1909                 if (i < h->s->nr_data)
1910                         nr_have_data++;
1911                 else
1912                         nr_have_parity++;
1913         }
1914
1915         BUG_ON(nr_have_data     > h->s->nr_data);
1916         BUG_ON(nr_have_parity   > h->s->nr_parity);
1917
1918         buckets.nr = 0;
1919         if (nr_have_parity < h->s->nr_parity) {
1920                 ret = bch2_bucket_alloc_set_trans(trans, &buckets,
1921                                             &h->parity_stripe,
1922                                             &devs,
1923                                             h->s->nr_parity,
1924                                             &nr_have_parity,
1925                                             &have_cache, 0,
1926                                             BCH_DATA_parity,
1927                                             watermark,
1928                                             cl);
1929
1930                 open_bucket_for_each(c, &buckets, ob, i) {
1931                         j = find_next_zero_bit(h->s->blocks_gotten,
1932                                                h->s->nr_data + h->s->nr_parity,
1933                                                h->s->nr_data);
1934                         BUG_ON(j >= h->s->nr_data + h->s->nr_parity);
1935
1936                         h->s->blocks[j] = buckets.v[i];
1937                         v->ptrs[j] = bch2_ob_ptr(c, ob);
1938                         __set_bit(j, h->s->blocks_gotten);
1939                 }
1940
1941                 if (ret)
1942                         return ret;
1943         }
1944
1945         buckets.nr = 0;
1946         if (nr_have_data < h->s->nr_data) {
1947                 ret = bch2_bucket_alloc_set_trans(trans, &buckets,
1948                                             &h->block_stripe,
1949                                             &devs,
1950                                             h->s->nr_data,
1951                                             &nr_have_data,
1952                                             &have_cache, 0,
1953                                             BCH_DATA_user,
1954                                             watermark,
1955                                             cl);
1956
1957                 open_bucket_for_each(c, &buckets, ob, i) {
1958                         j = find_next_zero_bit(h->s->blocks_gotten,
1959                                                h->s->nr_data, 0);
1960                         BUG_ON(j >= h->s->nr_data);
1961
1962                         h->s->blocks[j] = buckets.v[i];
1963                         v->ptrs[j] = bch2_ob_ptr(c, ob);
1964                         __set_bit(j, h->s->blocks_gotten);
1965                 }
1966
1967                 if (ret)
1968                         return ret;
1969         }
1970
1971         return 0;
1972 }
1973
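/*
 * Scan the stripes heap for an existing stripe with empty blocks to
 * reuse: it must match this head's disk label, algorithm, redundancy
 * and blocksize, and we must be able to open it.
 */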
1974 static s64 get_existing_stripe(struct bch_fs *c,
1975                                struct ec_stripe_head *head)
1976 {
1977         ec_stripes_heap *h = &c->ec_stripes_heap;
1978         struct stripe *m;
1979         size_t heap_idx;
1980         u64 stripe_idx;
1981         s64 ret = -1;
1982
1983         if (may_create_new_stripe(c))
1984                 return -1;
1985
1986         mutex_lock(&c->ec_stripes_heap_lock);
1987         for (heap_idx = 0; heap_idx < h->nr; heap_idx++) {
1988                 /* No blocks worth reusing, stripe will just be deleted: */
1989                 if (!h->data[heap_idx].blocks_nonempty)
1990                         continue;
1991
1992                 stripe_idx = h->data[heap_idx].idx;
1993
1994                 m = genradix_ptr(&c->stripes, stripe_idx);
1995
1996                 if (m->disk_label       == head->disk_label &&
1997                     m->algorithm        == head->algo &&
1998                     m->nr_redundant     == head->redundancy &&
1999                     m->sectors          == head->blocksize &&
2000                     m->blocks_nonempty  < m->nr_blocks - m->nr_redundant &&
2001                     bch2_try_open_stripe(c, head->s, stripe_idx)) {
2002                         ret = stripe_idx;
2003                         break;
2004                 }
2005         }
2006         mutex_unlock(&c->ec_stripes_heap_lock);
2007         return ret;
2008 }
2009
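/*
 * Take over an existing stripe: read in its blocks so surviving data
 * can be recovered, release the buckets we'd already allocated (they
 * might conflict with the blocks we inherit), and mark the nonempty
 * blocks as allocated.
 */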
2010 static int __bch2_ec_stripe_head_reuse(struct btree_trans *trans, struct ec_stripe_head *h)
2011 {
2012         struct bch_fs *c = trans->c;
2013         struct bch_stripe *new_v = &bkey_i_to_stripe(&h->s->new_stripe.key)->v;
2014         struct bch_stripe *existing_v;
2015         unsigned i;
2016         s64 idx;
2017         int ret;
2018
2019         /*
2020          * If we can't allocate a new stripe, and there are no stripes with empty
2021          * blocks for us to reuse, that means we have to wait on copygc:
2022          */
2023         idx = get_existing_stripe(c, h);
2024         if (idx < 0)
2025                 return -BCH_ERR_stripe_alloc_blocked;
2026
2027         ret = get_stripe_key_trans(trans, idx, &h->s->existing_stripe);
2028         bch2_fs_fatal_err_on(ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart), c,
2029                              "reading stripe key: %s", bch2_err_str(ret));
2030         if (ret) {
2031                 bch2_stripe_close(c, h->s);
2032                 return ret;
2033         }
2034
2035         existing_v = &bkey_i_to_stripe(&h->s->existing_stripe.key)->v;
2036
2037         BUG_ON(existing_v->nr_redundant != h->s->nr_parity);
2038         h->s->nr_data = existing_v->nr_blocks -
2039                 existing_v->nr_redundant;
2040
2041         ret = ec_stripe_buf_init(&h->s->existing_stripe, 0, h->blocksize);
2042         if (ret) {
2043                 bch2_stripe_close(c, h->s);
2044                 return ret;
2045         }
2046
2047         BUG_ON(h->s->existing_stripe.size != h->blocksize);
2048         BUG_ON(h->s->existing_stripe.size != le16_to_cpu(existing_v->sectors));
2049
2050         /*
2051          * Free buckets we initially allocated - they might conflict with
2052          * blocks from the stripe we're reusing:
2053          */
2054         for_each_set_bit(i, h->s->blocks_gotten, new_v->nr_blocks) {
2055                 bch2_open_bucket_put(c, c->open_buckets + h->s->blocks[i]);
2056                 h->s->blocks[i] = 0;
2057         }
2058         memset(h->s->blocks_gotten, 0, sizeof(h->s->blocks_gotten));
2059         memset(h->s->blocks_allocated, 0, sizeof(h->s->blocks_allocated));
2060
2061         for (i = 0; i < existing_v->nr_blocks; i++) {
2062                 if (stripe_blockcount_get(existing_v, i)) {
2063                         __set_bit(i, h->s->blocks_gotten);
2064                         __set_bit(i, h->s->blocks_allocated);
2065                 }
2066
2067                 ec_block_io(c, &h->s->existing_stripe, READ, i, &h->s->iodone);
2068         }
2069
2070         bkey_copy(&h->s->new_stripe.key, &h->s->existing_stripe.key);
2071         h->s->have_existing_stripe = true;
2072
2073         return 0;
2074 }
2075
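/*
 * Reserve disk space for the stripe and find an unused slot for its
 * key in the stripes btree, scanning upwards from ec_stripe_hint and
 * wrapping around to the start once before giving up:
 */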
2076 static int __bch2_ec_stripe_head_reserve(struct btree_trans *trans, struct ec_stripe_head *h)
2077 {
2078         struct bch_fs *c = trans->c;
2079         struct btree_iter iter;
2080         struct bkey_s_c k;
2081         struct bpos min_pos = POS(0, 1);
2082         struct bpos start_pos = bpos_max(min_pos, POS(0, c->ec_stripe_hint));
2083         int ret;
2084
2085         if (!h->s->res.sectors) {
2086                 ret = bch2_disk_reservation_get(c, &h->s->res,
2087                                         h->blocksize,
2088                                         h->s->nr_parity,
2089                                         BCH_DISK_RESERVATION_NOFAIL);
2090                 if (ret)
2091                         return ret;
2092         }
2093
2094         for_each_btree_key_norestart(trans, iter, BTREE_ID_stripes, start_pos,
2095                            BTREE_ITER_slots|BTREE_ITER_intent, k, ret) {
2096                 if (bkey_gt(k.k->p, POS(0, U32_MAX))) {
2097                         if (start_pos.offset) {
2098                                 start_pos = min_pos;
2099                                 bch2_btree_iter_set_pos(&iter, start_pos);
2100                                 continue;
2101                         }
2102
2103                         ret = -BCH_ERR_ENOSPC_stripe_create;
2104                         break;
2105                 }
2106
2107                 if (bkey_deleted(k.k) &&
2108                     bch2_try_open_stripe(c, h->s, k.k->p.offset))
2109                         break;
2110         }
2111
2112         c->ec_stripe_hint = iter.pos.offset;
2113
2114         if (ret)
2115                 goto err;
2116
2117         ret = ec_stripe_mem_alloc(trans, &iter);
2118         if (ret) {
2119                 bch2_stripe_close(c, h->s);
2120                 goto err;
2121         }
2122
2123         h->s->new_stripe.key.k.p = iter.pos;
2124 out:
2125         bch2_trans_iter_exit(trans, &iter);
2126         return ret;
2127 err:
2128         bch2_disk_reservation_put(c, &h->s->res);
2129         goto out;
2130 }
2131
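/*
 * Return a stripe head with a stripe ready for allocation: first try
 * to allocate a brand new full stripe; if there isn't room, fall back
 * to reusing an existing stripe with empty blocks, blocking on copygc
 * via @cl when nothing is reusable yet.
 */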
2132 struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans,
2133                                                unsigned target,
2134                                                unsigned algo,
2135                                                unsigned redundancy,
2136                                                enum bch_watermark watermark,
2137                                                struct closure *cl)
2138 {
2139         struct bch_fs *c = trans->c;
2140         struct ec_stripe_head *h;
2141         bool waiting = false;
2142         unsigned disk_label = 0;
2143         struct target t = target_decode(target);
2144         int ret;
2145
2146         if (t.type == TARGET_GROUP) {
2147                 if (t.group > U8_MAX) {
2148                         bch_err(c, "cannot create a stripe when disk_label > U8_MAX");
2149                         return NULL;
2150                 }
2151                 disk_label = t.group + 1; /* 0 == no label */
2152         }
2153
2154         h = __bch2_ec_stripe_head_get(trans, disk_label, algo, redundancy, watermark);
2155         if (IS_ERR_OR_NULL(h))
2156                 return h;
2157
2158         if (!h->s) {
2159                 ret = ec_new_stripe_alloc(c, h);
2160                 if (ret) {
2161                         bch_err(c, "failed to allocate new stripe");
2162                         goto err;
2163                 }
2164         }
2165
2166         if (h->s->allocated)
2167                 goto allocated;
2168
2169         if (h->s->have_existing_stripe)
2170                 goto alloc_existing;
2171
2172         /* First, try to allocate a full stripe: */
2173         ret =   new_stripe_alloc_buckets(trans, h, BCH_WATERMARK_stripe, NULL) ?:
2174                 __bch2_ec_stripe_head_reserve(trans, h);
2175         if (!ret)
2176                 goto allocate_buf;
2177         if (bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
2178             bch2_err_matches(ret, ENOMEM))
2179                 goto err;
2180
2181         /*
2182          * Not enough buckets available for a full stripe: we must reuse an
2183          * existing stripe:
2184          */
2185         while (1) {
2186                 ret = __bch2_ec_stripe_head_reuse(trans, h);
2187                 if (!ret)
2188                         break;
2189                 if (waiting || !cl || ret != -BCH_ERR_stripe_alloc_blocked)
2190                         goto err;
2191
2192                 if (watermark == BCH_WATERMARK_copygc) {
2193                         ret =   new_stripe_alloc_buckets(trans, h, watermark, NULL) ?:
2194                                 __bch2_ec_stripe_head_reserve(trans, h);
2195                         if (ret)
2196                                 goto err;
2197                         goto allocate_buf;
2198                 }
2199
2200                 /* XXX freelist_wait? */
2201                 closure_wait(&c->freelist_wait, cl);
2202                 waiting = true;
2203         }
2204
2205         if (waiting)
2206                 closure_wake_up(&c->freelist_wait);
2207 alloc_existing:
2208         /*
2209          * Retry allocating buckets, with the watermark for this
2210          * particular write:
2211          */
2212         ret = new_stripe_alloc_buckets(trans, h, watermark, cl);
2213         if (ret)
2214                 goto err;
2215
2216 allocate_buf:
2217         ret = ec_stripe_buf_init(&h->s->new_stripe, 0, h->blocksize);
2218         if (ret)
2219                 goto err;
2220
2221         h->s->allocated = true;
2222 allocated:
2223         BUG_ON(!h->s->idx);
2224         BUG_ON(!h->s->new_stripe.data[0]);
2225         BUG_ON(trans->restarted);
2226         return h;
2227 err:
2228         bch2_ec_stripe_head_put(c, h);
2229         return ERR_PTR(ret);
2230 }
2231
2232 /* device removal */
2233
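/*
 * Given a bucket on the device being removed, mark its block in the
 * owning stripe as invalid, moving the stripe's sector counts from the
 * old replicas entry to the one without this device.
 */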
2234 static int bch2_invalidate_stripe_to_dev(struct btree_trans *trans, struct bkey_s_c k_a)
2235 {
2236         struct bch_alloc_v4 a_convert;
2237         const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k_a, &a_convert);
2238
2239         if (!a->stripe)
2240                 return 0;
2241
2242         if (a->stripe_sectors) {
2243                 bch_err(trans->c, "trying to invalidate device in stripe when bucket has stripe data");
2244                 return -BCH_ERR_invalidate_stripe_to_dev;
2245         }
2246
2247         struct btree_iter iter;
2248         struct bkey_i_stripe *s =
2249                 bch2_bkey_get_mut_typed(trans, &iter, BTREE_ID_stripes, POS(0, a->stripe),
2250                                         BTREE_ITER_slots, stripe);
2251         int ret = PTR_ERR_OR_ZERO(s);
2252         if (ret)
2253                 return ret;
2254
2255         struct disk_accounting_pos acc = {
2256                 .type = BCH_DISK_ACCOUNTING_replicas,
2257         };
2258
2259         s64 sectors = 0;
2260         for (unsigned i = 0; i < s->v.nr_blocks; i++)
2261                 sectors -= stripe_blockcount_get(&s->v, i);
2262
2263         bch2_bkey_to_replicas(&acc.replicas, bkey_i_to_s_c(&s->k_i));
2264         acc.replicas.data_type = BCH_DATA_user;
2265         ret = bch2_disk_accounting_mod(trans, &acc, &sectors, 1, false);
2266         if (ret)
2267                 goto err;
2268
2269         struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(&s->k_i));
2270         bkey_for_each_ptr(ptrs, ptr)
2271                 if (ptr->dev == k_a.k->p.inode)
2272                         ptr->dev = BCH_SB_MEMBER_INVALID;
2273
2274         sectors = -sectors;
2275
2276         bch2_bkey_to_replicas(&acc.replicas, bkey_i_to_s_c(&s->k_i));
2277         acc.replicas.data_type = BCH_DATA_user;
2278         ret = bch2_disk_accounting_mod(trans, &acc, &sectors, 1, false);
2279         if (ret)
2280                 goto err;
2281 err:
2282         bch2_trans_iter_exit(trans, &iter);
2283         return ret;
2284 }
2285
2286 int bch2_dev_remove_stripes(struct bch_fs *c, unsigned dev_idx)
2287 {
2288         return bch2_trans_run(c,
2289                 for_each_btree_key_upto_commit(trans, iter,
2290                                   BTREE_ID_alloc, POS(dev_idx, 0), POS(dev_idx, U64_MAX),
2291                                   BTREE_ITER_intent, k,
2292                                   NULL, NULL, 0, ({
2293                         bch2_invalidate_stripe_to_dev(trans, k);
2294         })));
2295 }
2296
2297 /* startup/shutdown */
2298
2299 static void __bch2_ec_stop(struct bch_fs *c, struct bch_dev *ca)
2300 {
2301         struct ec_stripe_head *h;
2302         struct open_bucket *ob;
2303         unsigned i;
2304
2305         mutex_lock(&c->ec_stripe_head_lock);
2306         list_for_each_entry(h, &c->ec_stripe_head_list, list) {
2307                 mutex_lock(&h->lock);
2308                 if (!h->s)
2309                         goto unlock;
2310
2311                 if (!ca)
2312                         goto found;
2313
2314                 for (i = 0; i < bkey_i_to_stripe(&h->s->new_stripe.key)->v.nr_blocks; i++) {
2315                         if (!h->s->blocks[i])
2316                                 continue;
2317
2318                         ob = c->open_buckets + h->s->blocks[i];
2319                         if (ob->dev == ca->dev_idx)
2320                                 goto found;
2321                 }
2322                 goto unlock;
2323 found:
2324                 ec_stripe_new_cancel(c, h, -BCH_ERR_erofs_no_writes);
2325 unlock:
2326                 mutex_unlock(&h->lock);
2327         }
2328         mutex_unlock(&c->ec_stripe_head_lock);
2329 }
2330
2331 void bch2_ec_stop_dev(struct bch_fs *c, struct bch_dev *ca)
2332 {
2333         __bch2_ec_stop(c, ca);
2334 }
2335
2336 void bch2_fs_ec_stop(struct bch_fs *c)
2337 {
2338         __bch2_ec_stop(c, NULL);
2339 }
2340
2341 static bool bch2_fs_ec_flush_done(struct bch_fs *c)
2342 {
2343         bool ret;
2344
2345         mutex_lock(&c->ec_stripe_new_lock);
2346         ret = list_empty(&c->ec_stripe_new_list);
2347         mutex_unlock(&c->ec_stripe_new_lock);
2348
2349         return ret;
2350 }
2351
2352 void bch2_fs_ec_flush(struct bch_fs *c)
2353 {
2354         wait_event(c->ec_stripe_new_wait, bch2_fs_ec_flush_done(c));
2355 }
2356
2357 int bch2_stripes_read(struct bch_fs *c)
2358 {
2359         int ret = bch2_trans_run(c,
2360                 for_each_btree_key(trans, iter, BTREE_ID_stripes, POS_MIN,
2361                                    BTREE_ITER_prefetch, k, ({
2362                         if (k.k->type != KEY_TYPE_stripe)
2363                                 continue;
2364
2365                         ret = __ec_stripe_mem_alloc(c, k.k->p.offset, GFP_KERNEL);
2366                         if (ret)
2367                                 break;
2368
2369                         struct stripe *m = genradix_ptr(&c->stripes, k.k->p.offset);
2370
2371                         stripe_to_mem(m, bkey_s_c_to_stripe(k).v);
2372
2373                         bch2_stripes_heap_insert(c, m, k.k->p.offset);
2374                         0;
2375                 })));
2376         bch_err_fn(c, ret);
2377         return ret;
2378 }
2379
2380 void bch2_stripes_heap_to_text(struct printbuf *out, struct bch_fs *c)
2381 {
2382         ec_stripes_heap *h = &c->ec_stripes_heap;
2383         struct stripe *m;
2384         size_t i;
2385
2386         mutex_lock(&c->ec_stripes_heap_lock);
2387         for (i = 0; i < min_t(size_t, h->nr, 50); i++) {
2388                 m = genradix_ptr(&c->stripes, h->data[i].idx);
2389
2390                 prt_printf(out, "%zu %u/%u+%u", h->data[i].idx,
2391                        h->data[i].blocks_nonempty,
2392                        m->nr_blocks - m->nr_redundant,
2393                        m->nr_redundant);
2394                 if (bch2_stripe_is_open(c, h->data[i].idx))
2395                         prt_str(out, " open");
2396                 prt_newline(out);
2397         }
2398         mutex_unlock(&c->ec_stripes_heap_lock);
2399 }
2400
2401 static void bch2_new_stripe_to_text(struct printbuf *out, struct bch_fs *c,
2402                                     struct ec_stripe_new *s)
2403 {
2404         prt_printf(out, "\tidx %llu blocks %u+%u allocated %u ref %u %u %s obs",
2405                    s->idx, s->nr_data, s->nr_parity,
2406                    bitmap_weight(s->blocks_allocated, s->nr_data),
2407                    atomic_read(&s->ref[STRIPE_REF_io]),
2408                    atomic_read(&s->ref[STRIPE_REF_stripe]),
2409                    bch2_watermarks[s->h->watermark]);
2410
2411         struct bch_stripe *v = &bkey_i_to_stripe(&s->new_stripe.key)->v;
2412         unsigned i;
2413         for_each_set_bit(i, s->blocks_gotten, v->nr_blocks)
2414                 prt_printf(out, " %u", s->blocks[i]);
2415         prt_newline(out);
2416         bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&s->new_stripe.key));
2417         prt_newline(out);
2418 }
2419
2420 void bch2_new_stripes_to_text(struct printbuf *out, struct bch_fs *c)
2421 {
2422         struct ec_stripe_head *h;
2423         struct ec_stripe_new *s;
2424
2425         mutex_lock(&c->ec_stripe_head_lock);
2426         list_for_each_entry(h, &c->ec_stripe_head_list, list) {
2427                 prt_printf(out, "disk label %u algo %u redundancy %u %s nr created %llu:\n",
2428                        h->disk_label, h->algo, h->redundancy,
2429                        bch2_watermarks[h->watermark],
2430                        h->nr_created);
2431
2432                 if (h->s)
2433                         bch2_new_stripe_to_text(out, c, h->s);
2434         }
2435         mutex_unlock(&c->ec_stripe_head_lock);
2436
2437         prt_printf(out, "in flight:\n");
2438
2439         mutex_lock(&c->ec_stripe_new_lock);
2440         list_for_each_entry(s, &c->ec_stripe_new_list, list)
2441                 bch2_new_stripe_to_text(out, c, s);
2442         mutex_unlock(&c->ec_stripe_new_lock);
2443 }
2444
2445 void bch2_fs_ec_exit(struct bch_fs *c)
2446 {
2447         struct ec_stripe_head *h;
2448         unsigned i;
2449
2450         while (1) {
2451                 mutex_lock(&c->ec_stripe_head_lock);
2452                 h = list_first_entry_or_null(&c->ec_stripe_head_list,
2453                                              struct ec_stripe_head, list);
2454                 if (h)
2455                         list_del(&h->list);
2456                 mutex_unlock(&c->ec_stripe_head_lock);
2457                 if (!h)
2458                         break;
2459
2460                 if (h->s) {
2461                         for (i = 0; i < bkey_i_to_stripe(&h->s->new_stripe.key)->v.nr_blocks; i++)
2462                                 BUG_ON(h->s->blocks[i]);
2463
2464                         kfree(h->s);
2465                 }
2466                 kfree(h);
2467         }
2468
2469         BUG_ON(!list_empty(&c->ec_stripe_new_list));
2470
2471         free_heap(&c->ec_stripes_heap);
2472         genradix_free(&c->stripes);
2473         bioset_exit(&c->ec_bioset);
2474 }
2475
2476 void bch2_fs_ec_init_early(struct bch_fs *c)
2477 {
2478         spin_lock_init(&c->ec_stripes_new_lock);
2479         mutex_init(&c->ec_stripes_heap_lock);
2480
2481         INIT_LIST_HEAD(&c->ec_stripe_head_list);
2482         mutex_init(&c->ec_stripe_head_lock);
2483
2484         INIT_LIST_HEAD(&c->ec_stripe_new_list);
2485         mutex_init(&c->ec_stripe_new_lock);
2486         init_waitqueue_head(&c->ec_stripe_new_wait);
2487
2488         INIT_WORK(&c->ec_stripe_create_work, ec_stripe_create_work);
2489         INIT_WORK(&c->ec_stripe_delete_work, ec_stripe_delete_work);
2490 }
2491
2492 int bch2_fs_ec_init(struct bch_fs *c)
2493 {
2494         return bioset_init(&c->ec_bioset, 1, offsetof(struct ec_bio, bio),
2495                            BIOSET_NEED_BVECS);
2496 }