// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "alloc_background.h"
#include "alloc_foreground.h"
#include "backpointers.h"
#include "bkey_buf.h"
#include "btree_gc.h"
#include "btree_io.h"
#include "btree_update.h"
#include "btree_update_interior.h"
#include "btree_write_buffer.h"
#include "compress.h"
#include "disk_groups.h"
#include "ec.h"
#include "errcode.h"
#include "error.h"
#include "inode.h"
#include "io_read.h"
#include "io_write.h"
#include "journal_reclaim.h"
#include "keylist.h"
#include "move.h"
#include "replicas.h"
#include "snapshot.h"
#include "super-io.h"
#include "trace.h"

#include <linux/ioprio.h>
#include <linux/kthread.h>

const char * const bch2_data_ops_strs[] = {
#define x(t, n, ...) [n] = #t,
        BCH_DATA_OPS()
#undef x
        NULL
};

static void bch2_data_update_opts_to_text(struct printbuf *out, struct bch_fs *c,
                                          struct bch_io_opts *io_opts,
                                          struct data_update_opts *data_opts)
{
        printbuf_tabstop_push(out, 20);
        prt_str(out, "rewrite ptrs:\t");
        bch2_prt_u64_base2(out, data_opts->rewrite_ptrs);
        prt_newline(out);

        prt_str(out, "kill ptrs:\t");
        bch2_prt_u64_base2(out, data_opts->kill_ptrs);
        prt_newline(out);

        prt_str(out, "target:\t");
        bch2_target_to_text(out, c, data_opts->target);
        prt_newline(out);

        prt_str(out, "compression:\t");
        bch2_compression_opt_to_text(out, background_compression(*io_opts));
        prt_newline(out);

        prt_str(out, "extra replicas:\t");
        prt_u64(out, data_opts->extra_replicas);
}

static void trace_move_extent2(struct bch_fs *c, struct bkey_s_c k,
                               struct bch_io_opts *io_opts,
                               struct data_update_opts *data_opts)
{
        if (trace_move_extent_enabled()) {
                struct printbuf buf = PRINTBUF;

                bch2_bkey_val_to_text(&buf, c, k);
                prt_newline(&buf);
                bch2_data_update_opts_to_text(&buf, c, io_opts, data_opts);
                trace_move_extent(c, buf.buf);
                printbuf_exit(&buf);
        }
}

static void trace_move_extent_read2(struct bch_fs *c, struct bkey_s_c k)
{
        if (trace_move_extent_read_enabled()) {
                struct printbuf buf = PRINTBUF;

                bch2_bkey_val_to_text(&buf, c, k);
                trace_move_extent_read(c, buf.buf);
                printbuf_exit(&buf);
        }
}

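/*
 * Tracks one extent being moved: the read into bi_inline_vecs and the data
 * update (write) that is issued once the read completes.
 */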
struct moving_io {
        struct list_head                read_list;
        struct list_head                io_list;
        struct move_bucket_in_flight    *b;
        struct closure                  cl;
        bool                            read_completed;

        unsigned                        read_sectors;
        unsigned                        write_sectors;

        struct bch_read_bio             rbio;

        struct data_update              write;
        /* Must be last since it is variable size */
        struct bio_vec                  bi_inline_vecs[];
};

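/*
 * Tear down a moving_io: drop the in flight bucket count, release the data
 * update and remove it from the context's io list.
 */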
static void move_free(struct moving_io *io)
{
        struct moving_context *ctxt = io->write.ctxt;

        if (io->b)
                atomic_dec(&io->b->count);

        bch2_data_update_exit(&io->write);

        mutex_lock(&ctxt->lock);
        list_del(&io->io_list);
        wake_up(&ctxt->wait);
        mutex_unlock(&ctxt->lock);

        kfree(io);
}

static void move_write_done(struct bch_write_op *op)
{
        struct moving_io *io = container_of(op, struct moving_io, write.op);
        struct moving_context *ctxt = io->write.ctxt;

        if (io->write.op.error)
                ctxt->write_error = true;

        atomic_sub(io->write_sectors, &io->write.ctxt->write_sectors);
        atomic_dec(&io->write.ctxt->write_ios);
        move_free(io);
        closure_put(&ctxt->cl);
}

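/*
 * Start the write half of a move once the read has completed; reads that
 * errored or hit a hole are freed without being written.
 */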
static void move_write(struct moving_io *io)
{
        if (unlikely(io->rbio.bio.bi_status || io->rbio.hole)) {
                move_free(io);
                return;
        }

        if (trace_move_extent_write_enabled()) {
                struct bch_fs *c = io->write.op.c;
                struct printbuf buf = PRINTBUF;

                bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(io->write.k.k));
                trace_move_extent_write(c, buf.buf);
                printbuf_exit(&buf);
        }

        closure_get(&io->write.ctxt->cl);
        atomic_add(io->write_sectors, &io->write.ctxt->write_sectors);
        atomic_inc(&io->write.ctxt->write_ios);

        bch2_data_update_read_done(&io->write, io->rbio.pick.crc);
}

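/*
 * Return the oldest queued read if its data has arrived and it is ready to be
 * written, NULL otherwise.
 */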
struct moving_io *bch2_moving_ctxt_next_pending_write(struct moving_context *ctxt)
{
        struct moving_io *io =
                list_first_entry_or_null(&ctxt->reads, struct moving_io, read_list);

        return io && io->read_completed ? io : NULL;
}

static void move_read_endio(struct bio *bio)
{
        struct moving_io *io = container_of(bio, struct moving_io, rbio.bio);
        struct moving_context *ctxt = io->write.ctxt;

        atomic_sub(io->read_sectors, &ctxt->read_sectors);
        atomic_dec(&ctxt->read_ios);
        io->read_completed = true;

        wake_up(&ctxt->wait);
        closure_put(&ctxt->cl);
}

void bch2_moving_ctxt_do_pending_writes(struct moving_context *ctxt)
{
        struct moving_io *io;

        while ((io = bch2_moving_ctxt_next_pending_write(ctxt))) {
                bch2_trans_unlock_long(ctxt->trans);
                list_del(&io->read_list);
                move_write(io);
        }
}

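/*
 * Wait for in flight writes to make progress: returns once write_sectors
 * drops to zero or changes from the value sampled on entry.
 */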
void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt)
{
        unsigned sectors_pending = atomic_read(&ctxt->write_sectors);

        move_ctxt_wait_event(ctxt,
                !atomic_read(&ctxt->write_sectors) ||
                atomic_read(&ctxt->write_sectors) != sectors_pending);
}

void bch2_moving_ctxt_flush_all(struct moving_context *ctxt)
{
        move_ctxt_wait_event(ctxt, list_empty(&ctxt->reads));
        bch2_trans_unlock_long(ctxt->trans);
        closure_sync(&ctxt->cl);
}

void bch2_moving_ctxt_exit(struct moving_context *ctxt)
{
        struct bch_fs *c = ctxt->trans->c;

        bch2_moving_ctxt_flush_all(ctxt);

        EBUG_ON(atomic_read(&ctxt->write_sectors));
        EBUG_ON(atomic_read(&ctxt->write_ios));
        EBUG_ON(atomic_read(&ctxt->read_sectors));
        EBUG_ON(atomic_read(&ctxt->read_ios));

        mutex_lock(&c->moving_context_lock);
        list_del(&ctxt->list);
        mutex_unlock(&c->moving_context_lock);

        bch2_trans_put(ctxt->trans);
        memset(ctxt, 0, sizeof(*ctxt));
}

void bch2_moving_ctxt_init(struct moving_context *ctxt,
                           struct bch_fs *c,
                           struct bch_ratelimit *rate,
                           struct bch_move_stats *stats,
                           struct write_point_specifier wp,
                           bool wait_on_copygc)
{
        memset(ctxt, 0, sizeof(*ctxt));

        ctxt->trans     = bch2_trans_get(c);
        ctxt->fn        = (void *) _RET_IP_;
        ctxt->rate      = rate;
        ctxt->stats     = stats;
        ctxt->wp        = wp;
        ctxt->wait_on_copygc = wait_on_copygc;

        closure_init_stack(&ctxt->cl);

        mutex_init(&ctxt->lock);
        INIT_LIST_HEAD(&ctxt->reads);
        INIT_LIST_HEAD(&ctxt->ios);
        init_waitqueue_head(&ctxt->wait);

        mutex_lock(&c->moving_context_lock);
        list_add(&ctxt->list, &c->moving_context_list);
        mutex_unlock(&c->moving_context_lock);
}

void bch2_move_stats_exit(struct bch_move_stats *stats, struct bch_fs *c)
{
        trace_move_data(c, stats);
}

void bch2_move_stats_init(struct bch_move_stats *stats, const char *name)
{
        memset(stats, 0, sizeof(*stats));
        stats->data_type = BCH_DATA_user;
        scnprintf(stats->name, sizeof(stats->name), "%s", name);
}

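/*
 * Queue one extent for moving: allocate a moving_io, set up the read and the
 * data update that will rewrite the pointers selected in @data_opts, then
 * issue the read. The write is started later, from
 * bch2_moving_ctxt_do_pending_writes().
 */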
int bch2_move_extent(struct moving_context *ctxt,
                     struct move_bucket_in_flight *bucket_in_flight,
                     struct btree_iter *iter,
                     struct bkey_s_c k,
                     struct bch_io_opts io_opts,
                     struct data_update_opts data_opts)
{
        struct btree_trans *trans = ctxt->trans;
        struct bch_fs *c = trans->c;
        struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
        struct moving_io *io;
        const union bch_extent_entry *entry;
        struct extent_ptr_decoded p;
        unsigned sectors = k.k->size, pages;
        int ret = -ENOMEM;

        trace_move_extent2(c, k, &io_opts, &data_opts);

        if (ctxt->stats)
                ctxt->stats->pos = BBPOS(iter->btree_id, iter->pos);

        bch2_data_update_opts_normalize(k, &data_opts);

        if (!data_opts.rewrite_ptrs &&
            !data_opts.extra_replicas) {
                if (data_opts.kill_ptrs)
                        return bch2_extent_drop_ptrs(trans, iter, k, data_opts);
                return 0;
        }

        /*
         * Before memory allocations & taking nocow locks in
         * bch2_data_update_init():
         */
        bch2_trans_unlock(trans);

        /* write path might have to decompress data: */
        bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
                sectors = max_t(unsigned, sectors, p.crc.uncompressed_size);

        pages = DIV_ROUND_UP(sectors, PAGE_SECTORS);
        io = kzalloc(sizeof(struct moving_io) +
                     sizeof(struct bio_vec) * pages, GFP_KERNEL);
        if (!io)
                goto err;

        INIT_LIST_HEAD(&io->io_list);
        io->write.ctxt          = ctxt;
        io->read_sectors        = k.k->size;
        io->write_sectors       = k.k->size;

        bio_init(&io->write.op.wbio.bio, NULL, io->bi_inline_vecs, pages, 0);
        bio_set_prio(&io->write.op.wbio.bio,
                     IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));

        if (bch2_bio_alloc_pages(&io->write.op.wbio.bio, sectors << 9,
                                 GFP_KERNEL))
                goto err_free;

        io->rbio.c              = c;
        io->rbio.opts           = io_opts;
        bio_init(&io->rbio.bio, NULL, io->bi_inline_vecs, pages, 0);
        io->rbio.bio.bi_vcnt = pages;
        bio_set_prio(&io->rbio.bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
        io->rbio.bio.bi_iter.bi_size = sectors << 9;

        io->rbio.bio.bi_opf             = REQ_OP_READ;
        io->rbio.bio.bi_iter.bi_sector  = bkey_start_offset(k.k);
        io->rbio.bio.bi_end_io          = move_read_endio;

        ret = bch2_data_update_init(trans, iter, ctxt, &io->write, ctxt->wp,
                                    io_opts, data_opts, iter->btree_id, k);
        if (ret)
                goto err_free_pages;

        io->write.op.end_io = move_write_done;

        if (ctxt->rate)
                bch2_ratelimit_increment(ctxt->rate, k.k->size);

        if (ctxt->stats) {
                atomic64_inc(&ctxt->stats->keys_moved);
                atomic64_add(k.k->size, &ctxt->stats->sectors_moved);
        }

        if (bucket_in_flight) {
                io->b = bucket_in_flight;
                atomic_inc(&io->b->count);
        }

        this_cpu_add(c->counters[BCH_COUNTER_io_move], k.k->size);
        this_cpu_add(c->counters[BCH_COUNTER_move_extent_read], k.k->size);
        trace_move_extent_read2(c, k);

        mutex_lock(&ctxt->lock);
        atomic_add(io->read_sectors, &ctxt->read_sectors);
        atomic_inc(&ctxt->read_ios);

        list_add_tail(&io->read_list, &ctxt->reads);
        list_add_tail(&io->io_list, &ctxt->ios);
        mutex_unlock(&ctxt->lock);

        /*
         * dropped by move_read_endio() - guards against use after free of
         * ctxt when doing wakeup
         */
        closure_get(&ctxt->cl);
        bch2_read_extent(trans, &io->rbio,
                         bkey_start_pos(k.k),
                         iter->btree_id, k, 0,
                         BCH_READ_NODECODE|
                         BCH_READ_LAST_FRAGMENT);
        return 0;
err_free_pages:
        bio_free_pages(&io->write.op.wbio.bio);
err_free:
        kfree(io);
err:
        if (ret == -BCH_ERR_data_update_done)
                return 0;

        if (bch2_err_matches(ret, EROFS) ||
            bch2_err_matches(ret, BCH_ERR_transaction_restart))
                return ret;

        count_event(c, move_extent_start_fail);

        if (trace_move_extent_start_fail_enabled()) {
                struct printbuf buf = PRINTBUF;

                bch2_bkey_val_to_text(&buf, c, k);
                prt_str(&buf, ": ");
                prt_str(&buf, bch2_err_str(ret));
                trace_move_extent_start_fail(c, buf.buf);
                printbuf_exit(&buf);
        }
        return ret;
}

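/*
 * Look up the IO options that apply to @extent_k, caching the options for
 * every snapshot of the extent's inode so that repeated lookups within one
 * inode are cheap; falls back to the filesystem defaults if no inode matches.
 */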
struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *trans,
                          struct per_snapshot_io_opts *io_opts,
                          struct bkey_s_c extent_k)
{
        struct bch_fs *c = trans->c;
        u32 restart_count = trans->restart_count;
        int ret = 0;

        if (io_opts->cur_inum != extent_k.k->p.inode) {
                io_opts->d.nr = 0;

                ret = for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, extent_k.k->p.inode),
                                         BTREE_ITER_all_snapshots, k, ({
                        if (k.k->p.offset != extent_k.k->p.inode)
                                break;

                        if (!bkey_is_inode(k.k))
                                continue;

                        struct bch_inode_unpacked inode;
                        BUG_ON(bch2_inode_unpack(k, &inode));

                        struct snapshot_io_opts_entry e = { .snapshot = k.k->p.snapshot };
                        bch2_inode_opts_get(&e.io_opts, trans->c, &inode);

                        darray_push(&io_opts->d, e);
                }));
                io_opts->cur_inum = extent_k.k->p.inode;
        }

        ret = ret ?: trans_was_restarted(trans, restart_count);
        if (ret)
                return ERR_PTR(ret);

        if (extent_k.k->p.snapshot)
                darray_for_each(io_opts->d, i)
                        if (bch2_snapshot_is_ancestor(c, extent_k.k->p.snapshot, i->snapshot))
                                return &i->io_opts;

        return &io_opts->fs_io_opts;
}

int bch2_move_get_io_opts_one(struct btree_trans *trans,
                              struct bch_io_opts *io_opts,
                              struct bkey_s_c extent_k)
{
        struct btree_iter iter;
        struct bkey_s_c k;
        int ret;

        /* reflink btree? */
        if (!extent_k.k->p.inode) {
                *io_opts = bch2_opts_to_inode_opts(trans->c->opts);
                return 0;
        }

        k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
                               SPOS(0, extent_k.k->p.inode, extent_k.k->p.snapshot),
                               BTREE_ITER_cached);
        ret = bkey_err(k);
        if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                return ret;

        if (!ret && bkey_is_inode(k.k)) {
                struct bch_inode_unpacked inode;
                bch2_inode_unpack(k, &inode);
                bch2_inode_opts_get(io_opts, trans->c, &inode);
        } else {
                *io_opts = bch2_opts_to_inode_opts(trans->c->opts);
        }

        bch2_trans_iter_exit(trans, &iter);
        return 0;
}

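/*
 * Throttle the move path: optionally wait for copygc, honour the configured
 * rate limit, and block until in flight reads and writes are below the
 * move_bytes_in_flight/move_ios_in_flight limits. Returns nonzero if the
 * kthread has been asked to stop.
 */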
int bch2_move_ratelimit(struct moving_context *ctxt)
{
        struct bch_fs *c = ctxt->trans->c;
        bool is_kthread = current->flags & PF_KTHREAD;
        u64 delay;

        if (ctxt->wait_on_copygc && c->copygc_running) {
                bch2_moving_ctxt_flush_all(ctxt);
                wait_event_killable(c->copygc_running_wq,
                                    !c->copygc_running ||
                                    (is_kthread && kthread_should_stop()));
        }

        do {
                delay = ctxt->rate ? bch2_ratelimit_delay(ctxt->rate) : 0;

                if (is_kthread && kthread_should_stop())
                        return 1;

                if (delay)
                        move_ctxt_wait_event_timeout(ctxt,
                                        freezing(current) ||
                                        (is_kthread && kthread_should_stop()),
                                        delay);

                if (unlikely(freezing(current))) {
                        bch2_moving_ctxt_flush_all(ctxt);
                        try_to_freeze();
                }
        } while (delay);

        /*
         * XXX: these limits really ought to be per device, SSDs and hard drives
         * will want different limits
         */
        move_ctxt_wait_event(ctxt,
                atomic_read(&ctxt->write_sectors) < c->opts.move_bytes_in_flight >> 9 &&
                atomic_read(&ctxt->read_sectors) < c->opts.move_bytes_in_flight >> 9 &&
                atomic_read(&ctxt->write_ios) < c->opts.move_ios_in_flight &&
                atomic_read(&ctxt->read_ios) < c->opts.move_ios_in_flight);

        return 0;
}

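/*
 * Walk extents in @btree_id from @start to @end, calling @pred on each key to
 * decide whether, and how, it should be moved.
 */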
static int bch2_move_data_btree(struct moving_context *ctxt,
                                struct bpos start,
                                struct bpos end,
                                move_pred_fn pred, void *arg,
                                enum btree_id btree_id)
{
        struct btree_trans *trans = ctxt->trans;
        struct bch_fs *c = trans->c;
        struct per_snapshot_io_opts snapshot_io_opts;
        struct bch_io_opts *io_opts;
        struct bkey_buf sk;
        struct btree_iter iter;
        struct bkey_s_c k;
        struct data_update_opts data_opts;
        int ret = 0, ret2;

        per_snapshot_io_opts_init(&snapshot_io_opts, c);
        bch2_bkey_buf_init(&sk);

        if (ctxt->stats) {
                ctxt->stats->data_type  = BCH_DATA_user;
                ctxt->stats->pos        = BBPOS(btree_id, start);
        }

        bch2_trans_iter_init(trans, &iter, btree_id, start,
                             BTREE_ITER_prefetch|
                             BTREE_ITER_all_snapshots);

        if (ctxt->rate)
                bch2_ratelimit_reset(ctxt->rate);

        while (!bch2_move_ratelimit(ctxt)) {
                bch2_trans_begin(trans);

                k = bch2_btree_iter_peek(&iter);
                if (!k.k)
                        break;

                ret = bkey_err(k);
                if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                        continue;
                if (ret)
                        break;

                if (bkey_ge(bkey_start_pos(k.k), end))
                        break;

                if (ctxt->stats)
                        ctxt->stats->pos = BBPOS(iter.btree_id, iter.pos);

                if (!bkey_extent_is_direct_data(k.k))
                        goto next_nondata;

                io_opts = bch2_move_get_io_opts(trans, &snapshot_io_opts, k);
                ret = PTR_ERR_OR_ZERO(io_opts);
                if (ret)
                        continue;

                memset(&data_opts, 0, sizeof(data_opts));
                if (!pred(c, arg, k, io_opts, &data_opts))
                        goto next;

                /*
                 * The iterator gets unlocked by __bch2_read_extent - need to
                 * save a copy of @k elsewhere:
                 */
                bch2_bkey_buf_reassemble(&sk, c, k);
                k = bkey_i_to_s_c(sk.k);

                ret2 = bch2_move_extent(ctxt, NULL, &iter, k, *io_opts, data_opts);
                if (ret2) {
                        if (bch2_err_matches(ret2, BCH_ERR_transaction_restart))
                                continue;

                        if (ret2 == -ENOMEM) {
                                /* memory allocation failure, wait for some IO to finish */
                                bch2_move_ctxt_wait_for_io(ctxt);
                                continue;
                        }

                        /* XXX signal failure */
                        goto next;
                }
next:
                if (ctxt->stats)
                        atomic64_add(k.k->size, &ctxt->stats->sectors_seen);
next_nondata:
                bch2_btree_iter_advance(&iter);
        }

        bch2_trans_iter_exit(trans, &iter);
        bch2_bkey_buf_exit(&sk, c);
        per_snapshot_io_opts_exit(&snapshot_io_opts);

        return ret;
}

int __bch2_move_data(struct moving_context *ctxt,
                     struct bbpos start,
                     struct bbpos end,
                     move_pred_fn pred, void *arg)
{
        struct bch_fs *c = ctxt->trans->c;
        enum btree_id id;
        int ret = 0;

        for (id = start.btree;
             id <= min_t(unsigned, end.btree, btree_id_nr_alive(c) - 1);
             id++) {
                ctxt->stats->pos = BBPOS(id, POS_MIN);

                if (!btree_type_has_ptrs(id) ||
                    !bch2_btree_id_root(c, id)->b)
                        continue;

                ret = bch2_move_data_btree(ctxt,
                                       id == start.btree ? start.pos : POS_MIN,
                                       id == end.btree   ? end.pos   : POS_MAX,
                                       pred, arg, id);
                if (ret)
                        break;
        }

        return ret;
}

int bch2_move_data(struct bch_fs *c,
                   struct bbpos start,
                   struct bbpos end,
                   struct bch_ratelimit *rate,
                   struct bch_move_stats *stats,
                   struct write_point_specifier wp,
                   bool wait_on_copygc,
                   move_pred_fn pred, void *arg)
{

        struct moving_context ctxt;
        int ret;

        bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc);
        ret = __bch2_move_data(&ctxt, start, end, pred, arg);
        bch2_moving_ctxt_exit(&ctxt);

        return ret;
}

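/*
 * Move all data out of @bucket: walk its backpointers to find the extents and
 * btree nodes that still point into it and rewrite them elsewhere.
 */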
int bch2_evacuate_bucket(struct moving_context *ctxt,
                           struct move_bucket_in_flight *bucket_in_flight,
                           struct bpos bucket, int gen,
                           struct data_update_opts _data_opts)
{
        struct btree_trans *trans = ctxt->trans;
        struct bch_fs *c = trans->c;
        bool is_kthread = current->flags & PF_KTHREAD;
        struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
        struct btree_iter iter;
        struct bkey_buf sk;
        struct bch_backpointer bp;
        struct bch_alloc_v4 a_convert;
        const struct bch_alloc_v4 *a;
        struct bkey_s_c k;
        struct data_update_opts data_opts;
        unsigned dirty_sectors, bucket_size;
        u64 fragmentation;
        struct bpos bp_pos = POS_MIN;
        int ret = 0;

        struct bch_dev *ca = bch2_dev_tryget(c, bucket.inode);
        if (!ca)
                return 0;

        trace_bucket_evacuate(c, &bucket);

        bch2_bkey_buf_init(&sk);

        /*
         * We're not run in a context that handles transaction restarts:
         */
        bch2_trans_begin(trans);

        bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
                             bucket, BTREE_ITER_cached);
        ret = lockrestart_do(trans,
                        bkey_err(k = bch2_btree_iter_peek_slot(&iter)));
        bch2_trans_iter_exit(trans, &iter);

        bch_err_msg(c, ret, "looking up alloc key");
        if (ret)
                goto err;

        a = bch2_alloc_to_v4(k, &a_convert);
        dirty_sectors = bch2_bucket_sectors_dirty(*a);
        bucket_size = ca->mi.bucket_size;
        fragmentation = a->fragmentation_lru;

        ret = bch2_btree_write_buffer_tryflush(trans);
        bch_err_msg(c, ret, "flushing btree write buffer");
        if (ret)
                goto err;

        while (!(ret = bch2_move_ratelimit(ctxt))) {
                if (is_kthread && kthread_should_stop())
                        break;

                bch2_trans_begin(trans);

                ret = bch2_get_next_backpointer(trans, ca, bucket, gen,
                                                &bp_pos, &bp,
                                                BTREE_ITER_cached);
                if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                        continue;
                if (ret)
                        goto err;
                if (bkey_eq(bp_pos, POS_MAX))
                        break;

                if (!bp.level) {
                        k = bch2_backpointer_get_key(trans, &iter, bp_pos, bp, 0);
                        ret = bkey_err(k);
                        if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                                continue;
                        if (ret)
                                goto err;
                        if (!k.k)
                                goto next;

                        bch2_bkey_buf_reassemble(&sk, c, k);
                        k = bkey_i_to_s_c(sk.k);

                        ret = bch2_move_get_io_opts_one(trans, &io_opts, k);
                        if (ret) {
                                bch2_trans_iter_exit(trans, &iter);
                                continue;
                        }

                        data_opts = _data_opts;
                        data_opts.target        = io_opts.background_target;
                        data_opts.rewrite_ptrs = 0;

                        unsigned i = 0;
                        bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr) {
                                if (ptr->dev == bucket.inode) {
                                        data_opts.rewrite_ptrs |= 1U << i;
                                        if (ptr->cached) {
                                                bch2_trans_iter_exit(trans, &iter);
                                                goto next;
                                        }
                                }
                                i++;
                        }

                        ret = bch2_move_extent(ctxt, bucket_in_flight,
                                               &iter, k, io_opts, data_opts);
                        bch2_trans_iter_exit(trans, &iter);

                        if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                                continue;
                        if (ret == -ENOMEM) {
                                /* memory allocation failure, wait for some IO to finish */
                                bch2_move_ctxt_wait_for_io(ctxt);
                                continue;
                        }
                        if (ret)
                                goto err;

                        if (ctxt->stats)
                                atomic64_add(k.k->size, &ctxt->stats->sectors_seen);
                } else {
                        struct btree *b;

                        b = bch2_backpointer_get_node(trans, &iter, bp_pos, bp);
                        ret = PTR_ERR_OR_ZERO(b);
                        if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node)
                                continue;
                        if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                                continue;
                        if (ret)
                                goto err;
                        if (!b)
                                goto next;

                        unsigned sectors = btree_ptr_sectors_written(&b->key);

                        ret = bch2_btree_node_rewrite(trans, &iter, b, 0);
                        bch2_trans_iter_exit(trans, &iter);

                        if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                                continue;
                        if (ret)
                                goto err;

                        if (ctxt->rate)
                                bch2_ratelimit_increment(ctxt->rate, sectors);
                        if (ctxt->stats) {
                                atomic64_add(sectors, &ctxt->stats->sectors_seen);
                                atomic64_add(sectors, &ctxt->stats->sectors_moved);
                        }
                }
next:
                bp_pos = bpos_nosnap_successor(bp_pos);
        }

        trace_evacuate_bucket(c, &bucket, dirty_sectors, bucket_size, fragmentation, ret);
err:
        bch2_dev_put(ca);
        bch2_bkey_buf_exit(&sk, c);
        return ret;
}

typedef bool (*move_btree_pred)(struct bch_fs *, void *,
                                struct btree *, struct bch_io_opts *,
                                struct data_update_opts *);

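/*
 * Walk btree nodes between @start and @end and rewrite those for which @pred
 * returns true.
 */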
static int bch2_move_btree(struct bch_fs *c,
                           struct bbpos start,
                           struct bbpos end,
                           move_btree_pred pred, void *arg,
                           struct bch_move_stats *stats)
{
        bool kthread = (current->flags & PF_KTHREAD) != 0;
        struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
        struct moving_context ctxt;
        struct btree_trans *trans;
        struct btree_iter iter;
        struct btree *b;
        enum btree_id btree;
        struct data_update_opts data_opts;
        int ret = 0;

        bch2_moving_ctxt_init(&ctxt, c, NULL, stats,
                              writepoint_ptr(&c->btree_write_point),
                              true);
        trans = ctxt.trans;

        stats->data_type = BCH_DATA_btree;

        for (btree = start.btree;
             btree <= min_t(unsigned, end.btree, btree_id_nr_alive(c) - 1);
             btree ++) {
                stats->pos = BBPOS(btree, POS_MIN);

                if (!bch2_btree_id_root(c, btree)->b)
                        continue;

                bch2_trans_node_iter_init(trans, &iter, btree, POS_MIN, 0, 0,
                                          BTREE_ITER_prefetch);
retry:
                ret = 0;
                while (bch2_trans_begin(trans),
                       (b = bch2_btree_iter_peek_node(&iter)) &&
                       !(ret = PTR_ERR_OR_ZERO(b))) {
                        if (kthread && kthread_should_stop())
                                break;

                        if ((cmp_int(btree, end.btree) ?:
                             bpos_cmp(b->key.k.p, end.pos)) > 0)
                                break;

                        stats->pos = BBPOS(iter.btree_id, iter.pos);

                        if (!pred(c, arg, b, &io_opts, &data_opts))
                                goto next;

                        ret = bch2_btree_node_rewrite(trans, &iter, b, 0) ?: ret;
                        if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                                continue;
                        if (ret)
                                break;
next:
                        bch2_btree_iter_next_node(&iter);
                }
                if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                        goto retry;

                bch2_trans_iter_exit(trans, &iter);

                if (kthread && kthread_should_stop())
                        break;
        }

        bch_err_fn(c, ret);
        bch2_moving_ctxt_exit(&ctxt);
        bch2_btree_interior_updates_flush(c);

        return ret;
}

static bool rereplicate_pred(struct bch_fs *c, void *arg,
                             struct bkey_s_c k,
                             struct bch_io_opts *io_opts,
                             struct data_update_opts *data_opts)
{
        unsigned nr_good = bch2_bkey_durability(c, k);
        unsigned replicas = bkey_is_btree_ptr(k.k)
                ? c->opts.metadata_replicas
                : io_opts->data_replicas;

        if (!nr_good || nr_good >= replicas)
                return false;

        data_opts->target               = 0;
        data_opts->extra_replicas       = replicas - nr_good;
        data_opts->btree_insert_flags   = 0;
        return true;
}

static bool migrate_pred(struct bch_fs *c, void *arg,
                         struct bkey_s_c k,
                         struct bch_io_opts *io_opts,
                         struct data_update_opts *data_opts)
{
        struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
        struct bch_ioctl_data *op = arg;
        unsigned i = 0;

        data_opts->rewrite_ptrs         = 0;
        data_opts->target               = 0;
        data_opts->extra_replicas       = 0;
        data_opts->btree_insert_flags   = 0;

        bkey_for_each_ptr(ptrs, ptr) {
                if (ptr->dev == op->migrate.dev)
                        data_opts->rewrite_ptrs |= 1U << i;
                i++;
        }

        return data_opts->rewrite_ptrs != 0;
}

static bool rereplicate_btree_pred(struct bch_fs *c, void *arg,
                                   struct btree *b,
                                   struct bch_io_opts *io_opts,
                                   struct data_update_opts *data_opts)
{
        return rereplicate_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts);
}

static bool migrate_btree_pred(struct bch_fs *c, void *arg,
                               struct btree *b,
                               struct bch_io_opts *io_opts,
                               struct data_update_opts *data_opts)
{
        return migrate_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts);
}

/*
 * Ancient versions of bcachefs produced packed formats which could represent
 * keys that the in memory format cannot represent; this checks for those
 * formats so we can get rid of them.
 */
static bool bformat_needs_redo(struct bkey_format *f)
{
        for (unsigned i = 0; i < f->nr_fields; i++)
                if (bch2_bkey_format_field_overflows(f, i))
                        return true;

        return false;
}

static bool rewrite_old_nodes_pred(struct bch_fs *c, void *arg,
                                   struct btree *b,
                                   struct bch_io_opts *io_opts,
                                   struct data_update_opts *data_opts)
{
        if (b->version_ondisk != c->sb.version ||
            btree_node_need_rewrite(b) ||
            bformat_needs_redo(&b->format)) {
                data_opts->target               = 0;
                data_opts->extra_replicas       = 0;
                data_opts->btree_insert_flags   = 0;
                return true;
        }

        return false;
}

int bch2_scan_old_btree_nodes(struct bch_fs *c, struct bch_move_stats *stats)
{
        int ret;

        ret = bch2_move_btree(c,
                              BBPOS_MIN,
                              BBPOS_MAX,
                              rewrite_old_nodes_pred, c, stats);
        if (!ret) {
                mutex_lock(&c->sb_lock);
                c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_extents_above_btree_updates_done);
                c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_bformat_overflow_done);
                c->disk_sb.sb->version_min = c->disk_sb.sb->version;
                bch2_write_super(c);
                mutex_unlock(&c->sb_lock);
        }

        bch_err_fn(c, ret);
        return ret;
}

static bool drop_extra_replicas_pred(struct bch_fs *c, void *arg,
                             struct bkey_s_c k,
                             struct bch_io_opts *io_opts,
                             struct data_update_opts *data_opts)
{
        unsigned durability = bch2_bkey_durability(c, k);
        unsigned replicas = bkey_is_btree_ptr(k.k)
                ? c->opts.metadata_replicas
                : io_opts->data_replicas;
        const union bch_extent_entry *entry;
        struct extent_ptr_decoded p;
        unsigned i = 0;

        rcu_read_lock();
        bkey_for_each_ptr_decode(k.k, bch2_bkey_ptrs_c(k), p, entry) {
                unsigned d = bch2_extent_ptr_durability(c, &p);

                if (d && durability - d >= replicas) {
                        data_opts->kill_ptrs |= BIT(i);
                        durability -= d;
                }

                i++;
        }
        rcu_read_unlock();

        return data_opts->kill_ptrs != 0;
}

static bool drop_extra_replicas_btree_pred(struct bch_fs *c, void *arg,
                                   struct btree *b,
                                   struct bch_io_opts *io_opts,
                                   struct data_update_opts *data_opts)
{
        return drop_extra_replicas_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts);
}

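/*
 * Run one user requested data job (struct bch_ioctl_data): dispatch to
 * rereplicate, migrate, rewrite_old_nodes or drop_extra_replicas and report
 * progress through @stats.
 */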
int bch2_data_job(struct bch_fs *c,
                  struct bch_move_stats *stats,
                  struct bch_ioctl_data op)
{
        struct bbpos start      = BBPOS(op.start_btree, op.start_pos);
        struct bbpos end        = BBPOS(op.end_btree, op.end_pos);
        int ret = 0;

        if (op.op >= BCH_DATA_OP_NR)
                return -EINVAL;

        bch2_move_stats_init(stats, bch2_data_ops_strs[op.op]);

        switch (op.op) {
        case BCH_DATA_OP_rereplicate:
                stats->data_type = BCH_DATA_journal;
                ret = bch2_journal_flush_device_pins(&c->journal, -1);
                ret = bch2_move_btree(c, start, end,
                                      rereplicate_btree_pred, c, stats) ?: ret;
                ret = bch2_move_data(c, start, end,
                                     NULL,
                                     stats,
                                     writepoint_hashed((unsigned long) current),
                                     true,
                                     rereplicate_pred, c) ?: ret;
                ret = bch2_replicas_gc2(c) ?: ret;
                break;
        case BCH_DATA_OP_migrate:
                if (op.migrate.dev >= c->sb.nr_devices)
                        return -EINVAL;

                stats->data_type = BCH_DATA_journal;
                ret = bch2_journal_flush_device_pins(&c->journal, op.migrate.dev);
                ret = bch2_move_btree(c, start, end,
                                      migrate_btree_pred, &op, stats) ?: ret;
                ret = bch2_move_data(c, start, end,
                                     NULL,
                                     stats,
                                     writepoint_hashed((unsigned long) current),
                                     true,
                                     migrate_pred, &op) ?: ret;
                ret = bch2_replicas_gc2(c) ?: ret;
                break;
        case BCH_DATA_OP_rewrite_old_nodes:
                ret = bch2_scan_old_btree_nodes(c, stats);
                break;
        case BCH_DATA_OP_drop_extra_replicas:
                ret = bch2_move_btree(c, start, end,
                                drop_extra_replicas_btree_pred, c, stats) ?: ret;
                ret = bch2_move_data(c, start, end, NULL, stats,
                                writepoint_hashed((unsigned long) current),
                                true,
                                drop_extra_replicas_pred, c) ?: ret;
                ret = bch2_replicas_gc2(c) ?: ret;
                break;
        default:
                ret = -EINVAL;
        }

        bch2_move_stats_exit(stats, c);
        return ret;
}

void bch2_move_stats_to_text(struct printbuf *out, struct bch_move_stats *stats)
{
        prt_printf(out, "%s: data type==", stats->name);
        bch2_prt_data_type(out, stats->data_type);
        prt_str(out, " pos=");
        bch2_bbpos_to_text(out, stats->pos);
        prt_newline(out);
        printbuf_indent_add(out, 2);

        prt_printf(out, "keys moved:  %llu\n",  atomic64_read(&stats->keys_moved));
        prt_printf(out, "keys raced:  %llu\n",  atomic64_read(&stats->keys_raced));
        prt_printf(out, "bytes seen:  ");
        prt_human_readable_u64(out, atomic64_read(&stats->sectors_seen) << 9);
        prt_newline(out);

        prt_printf(out, "bytes moved: ");
        prt_human_readable_u64(out, atomic64_read(&stats->sectors_moved) << 9);
        prt_newline(out);

        prt_printf(out, "bytes raced: ");
        prt_human_readable_u64(out, atomic64_read(&stats->sectors_raced) << 9);
        prt_newline(out);

        printbuf_indent_sub(out, 2);
}

static void bch2_moving_ctxt_to_text(struct printbuf *out, struct bch_fs *c, struct moving_context *ctxt)
{
        struct moving_io *io;

        bch2_move_stats_to_text(out, ctxt->stats);
        printbuf_indent_add(out, 2);

        prt_printf(out, "reads: ios %u/%u sectors %u/%u\n",
                   atomic_read(&ctxt->read_ios),
                   c->opts.move_ios_in_flight,
                   atomic_read(&ctxt->read_sectors),
                   c->opts.move_bytes_in_flight >> 9);

        prt_printf(out, "writes: ios %u/%u sectors %u/%u\n",
                   atomic_read(&ctxt->write_ios),
                   c->opts.move_ios_in_flight,
                   atomic_read(&ctxt->write_sectors),
                   c->opts.move_bytes_in_flight >> 9);

        printbuf_indent_add(out, 2);

        mutex_lock(&ctxt->lock);
        list_for_each_entry(io, &ctxt->ios, io_list)
                bch2_write_op_to_text(out, &io->write.op);
        mutex_unlock(&ctxt->lock);

        printbuf_indent_sub(out, 4);
}

void bch2_fs_moving_ctxts_to_text(struct printbuf *out, struct bch_fs *c)
{
        struct moving_context *ctxt;

        mutex_lock(&c->moving_context_lock);
        list_for_each_entry(ctxt, &c->moving_context_list, list)
                bch2_moving_ctxt_to_text(out, c, ctxt);
        mutex_unlock(&c->moving_context_lock);
}

void bch2_fs_move_init(struct bch_fs *c)
{
        INIT_LIST_HEAD(&c->moving_context_list);
        mutex_init(&c->moving_context_lock);
}