// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle. All rights reserved.
 * Copyright (C) 2022 Christoph Hellwig.
 */

#include <linux/bio.h>
#include "bio.h"
#include "ctree.h"
#include "volumes.h"
#include "raid56.h"
#include "async-thread.h"
#include "dev-replace.h"
#include "rcu-string.h"
#include "zoned.h"
#include "file-item.h"
#include "raid-stripe-tree.h"

static struct bio_set btrfs_bioset;
static struct bio_set btrfs_clone_bioset;
static struct bio_set btrfs_repair_bioset;
static mempool_t btrfs_failed_bio_pool;

struct btrfs_failed_bio {
	struct btrfs_bio *bbio;
	int num_copies;
	atomic_t repair_count;
};

/* Is this a data path I/O that needs storage layer checksum and repair? */
static inline bool is_data_bbio(struct btrfs_bio *bbio)
{
	return bbio->inode && is_data_inode(&bbio->inode->vfs_inode);
}

static bool bbio_has_ordered_extent(struct btrfs_bio *bbio)
{
	return is_data_bbio(bbio) && btrfs_op(&bbio->bio) == BTRFS_MAP_WRITE;
}

/*
 * Initialize a btrfs_bio structure. This skips the embedded bio itself as it
 * is already initialized by the block layer.
 */
void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_fs_info *fs_info,
		    btrfs_bio_end_io_t end_io, void *private)
{
	memset(bbio, 0, offsetof(struct btrfs_bio, bio));
	bbio->fs_info = fs_info;
	bbio->end_io = end_io;
	bbio->private = private;
	atomic_set(&bbio->pending_ios, 1);
}

/*
 * Allocate a btrfs_bio structure. The btrfs_bio is the main I/O container for
 * btrfs, and is used for all I/O submitted through btrfs_submit_bio.
 *
 * Just like the underlying bio_alloc_bioset it will not fail as it is backed by
 * a mempool.
 */
struct btrfs_bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf,
				  struct btrfs_fs_info *fs_info,
				  btrfs_bio_end_io_t end_io, void *private)
{
	struct btrfs_bio *bbio;
	struct bio *bio;

	bio = bio_alloc_bioset(NULL, nr_vecs, opf, GFP_NOFS, &btrfs_bioset);
	bbio = btrfs_bio(bio);
	btrfs_bio_init(bbio, fs_info, end_io, private);
	return bbio;
}
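
/*
 * Illustrative sketch (not part of the original file): a typical submitter
 * allocates a bbio, fills the embedded bio, and hands it off.  The names
 * my_end_io, my_private and the page/offset variables are hypothetical:
 *
 *	bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, fs_info,
 *			       my_end_io, my_private);
 *	bbio->inode = inode;
 *	bbio->file_offset = file_offset;
 *	bbio->bio.bi_iter.bi_sector = logical >> SECTOR_SHIFT;
 *	__bio_add_page(&bbio->bio, page, len, offset);
 *	btrfs_submit_bio(bbio, 0);
 */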

static struct btrfs_bio *btrfs_split_bio(struct btrfs_fs_info *fs_info,
					 struct btrfs_bio *orig_bbio,
					 u64 map_length, bool use_append)
{
	struct btrfs_bio *bbio;
	struct bio *bio;

	if (use_append) {
		unsigned int nr_segs;

		bio = bio_split_rw(&orig_bbio->bio, &fs_info->limits, &nr_segs,
				   &btrfs_clone_bioset, map_length);
	} else {
		bio = bio_split(&orig_bbio->bio, map_length >> SECTOR_SHIFT,
				GFP_NOFS, &btrfs_clone_bioset);
	}
	bbio = btrfs_bio(bio);
	btrfs_bio_init(bbio, fs_info, NULL, orig_bbio);
	bbio->inode = orig_bbio->inode;
	bbio->file_offset = orig_bbio->file_offset;
	orig_bbio->file_offset += map_length;
	if (bbio_has_ordered_extent(bbio)) {
		refcount_inc(&orig_bbio->ordered->refs);
		bbio->ordered = orig_bbio->ordered;
	}
	atomic_inc(&orig_bbio->pending_ios);
	return bbio;
}

/* Free a bio that was never submitted to the underlying device. */
static void btrfs_cleanup_bio(struct btrfs_bio *bbio)
{
	if (bbio_has_ordered_extent(bbio))
		btrfs_put_ordered_extent(bbio->ordered);
	bio_put(&bbio->bio);
}

static void __btrfs_bio_end_io(struct btrfs_bio *bbio)
{
	if (bbio_has_ordered_extent(bbio)) {
		struct btrfs_ordered_extent *ordered = bbio->ordered;

		bbio->end_io(bbio);
		btrfs_put_ordered_extent(ordered);
	} else {
		bbio->end_io(bbio);
	}
}

void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status)
{
	bbio->bio.bi_status = status;
	__btrfs_bio_end_io(bbio);
}

static void btrfs_orig_write_end_io(struct bio *bio);

static void btrfs_bbio_propagate_error(struct btrfs_bio *bbio,
				       struct btrfs_bio *orig_bbio)
{
	/*
	 * For writes we tolerate nr_mirrors - 1 write failures, so we can't
	 * just blindly propagate a write failure here. Instead increment the
	 * error count in the original I/O context so that it is guaranteed to
	 * be larger than the error tolerance.
	 */
	if (bbio->bio.bi_end_io == &btrfs_orig_write_end_io) {
		struct btrfs_io_stripe *orig_stripe = orig_bbio->bio.bi_private;
		struct btrfs_io_context *orig_bioc = orig_stripe->bioc;

		atomic_add(orig_bioc->max_errors, &orig_bioc->error);
	} else {
		orig_bbio->bio.bi_status = bbio->bio.bi_status;
	}
}

static void btrfs_orig_bbio_end_io(struct btrfs_bio *bbio)
{
	if (bbio->bio.bi_pool == &btrfs_clone_bioset) {
		struct btrfs_bio *orig_bbio = bbio->private;

		if (bbio->bio.bi_status)
			btrfs_bbio_propagate_error(bbio, orig_bbio);
		btrfs_cleanup_bio(bbio);
		bbio = orig_bbio;
	}

	if (atomic_dec_and_test(&bbio->pending_ios))
		__btrfs_bio_end_io(bbio);
}
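
/*
 * Note on the lifetime handled above: btrfs_bio_init() starts pending_ios
 * at 1 for the submission itself, and btrfs_split_bio() takes one more
 * reference per split chunk.  The original end_io callback therefore runs
 * exactly once, after the last outstanding chunk completes.
 */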

static int next_repair_mirror(struct btrfs_failed_bio *fbio, int cur_mirror)
{
	if (cur_mirror == fbio->num_copies)
		return cur_mirror + 1 - fbio->num_copies;
	return cur_mirror + 1;
}

static int prev_repair_mirror(struct btrfs_failed_bio *fbio, int cur_mirror)
{
	if (cur_mirror == 1)
		return fbio->num_copies;
	return cur_mirror - 1;
}
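
/*
 * Mirror numbers are 1-based, so the two helpers above walk the copies as
 * a ring: with num_copies == 3, next_repair_mirror() yields 1 -> 2 -> 3 -> 1
 * and prev_repair_mirror() the reverse.  Repair can thus try every other
 * copy exactly once before arriving back at the mirror that failed.
 */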

static void btrfs_repair_done(struct btrfs_failed_bio *fbio)
{
	if (atomic_dec_and_test(&fbio->repair_count)) {
		btrfs_orig_bbio_end_io(fbio->bbio);
		mempool_free(fbio, &btrfs_failed_bio_pool);
	}
}

static void btrfs_end_repair_bio(struct btrfs_bio *repair_bbio,
				 struct btrfs_device *dev)
{
	struct btrfs_failed_bio *fbio = repair_bbio->private;
	struct btrfs_inode *inode = repair_bbio->inode;
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct bio_vec *bv = bio_first_bvec_all(&repair_bbio->bio);
	int mirror = repair_bbio->mirror_num;

	/*
	 * We can only trigger this for data bios, which don't support large
	 * folios yet.
	 */
	ASSERT(folio_order(page_folio(bv->bv_page)) == 0);

	if (repair_bbio->bio.bi_status ||
	    !btrfs_data_csum_ok(repair_bbio, dev, 0, bv)) {
		bio_reset(&repair_bbio->bio, NULL, REQ_OP_READ);
		repair_bbio->bio.bi_iter = repair_bbio->saved_iter;

		mirror = next_repair_mirror(fbio, mirror);
		if (mirror == fbio->bbio->mirror_num) {
			btrfs_debug(fs_info, "no mirror left");
			fbio->bbio->bio.bi_status = BLK_STS_IOERR;
			goto done;
		}

		btrfs_submit_bio(repair_bbio, mirror);
		return;
	}

	do {
		mirror = prev_repair_mirror(fbio, mirror);
		btrfs_repair_io_failure(fs_info, btrfs_ino(inode),
				repair_bbio->file_offset, fs_info->sectorsize,
				repair_bbio->saved_iter.bi_sector << SECTOR_SHIFT,
				page_folio(bv->bv_page), bv->bv_offset, mirror);
	} while (mirror != fbio->bbio->mirror_num);

done:
	btrfs_repair_done(fbio);
	bio_put(&repair_bbio->bio);
}

/*
 * Try to kick off a repair read to the next available mirror for a bad sector.
 *
 * This primarily tries to recover good data to serve the actual read request,
 * but when a repair read succeeds it also writes the good data back to the
 * bad mirror(s) to restore the redundancy.
 */
static struct btrfs_failed_bio *repair_one_sector(struct btrfs_bio *failed_bbio,
						  u32 bio_offset,
						  struct bio_vec *bv,
						  struct btrfs_failed_bio *fbio)
{
	struct btrfs_inode *inode = failed_bbio->inode;
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	const u32 sectorsize = fs_info->sectorsize;
	const u64 logical = (failed_bbio->saved_iter.bi_sector << SECTOR_SHIFT);
	struct btrfs_bio *repair_bbio;
	struct bio *repair_bio;
	int num_copies;
	int mirror;

	btrfs_debug(fs_info, "repair read error: read error at %llu",
		    failed_bbio->file_offset + bio_offset);

	num_copies = btrfs_num_copies(fs_info, logical, sectorsize);
	if (num_copies == 1) {
		btrfs_debug(fs_info, "no copy to repair from");
		failed_bbio->bio.bi_status = BLK_STS_IOERR;
		return fbio;
	}

	if (!fbio) {
		fbio = mempool_alloc(&btrfs_failed_bio_pool, GFP_NOFS);
		fbio->bbio = failed_bbio;
		fbio->num_copies = num_copies;
		atomic_set(&fbio->repair_count, 1);
	}

	atomic_inc(&fbio->repair_count);

	repair_bio = bio_alloc_bioset(NULL, 1, REQ_OP_READ, GFP_NOFS,
				      &btrfs_repair_bioset);
	repair_bio->bi_iter.bi_sector = failed_bbio->saved_iter.bi_sector;
	__bio_add_page(repair_bio, bv->bv_page, bv->bv_len, bv->bv_offset);

	repair_bbio = btrfs_bio(repair_bio);
	btrfs_bio_init(repair_bbio, fs_info, NULL, fbio);
	repair_bbio->inode = failed_bbio->inode;
	repair_bbio->file_offset = failed_bbio->file_offset + bio_offset;

	mirror = next_repair_mirror(fbio, failed_bbio->mirror_num);
	btrfs_debug(fs_info, "submitting repair read to mirror %d", mirror);
	btrfs_submit_bio(repair_bbio, mirror);
	return fbio;
}

static void btrfs_check_read_bio(struct btrfs_bio *bbio, struct btrfs_device *dev)
{
	struct btrfs_inode *inode = bbio->inode;
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	u32 sectorsize = fs_info->sectorsize;
	struct bvec_iter *iter = &bbio->saved_iter;
	blk_status_t status = bbio->bio.bi_status;
	struct btrfs_failed_bio *fbio = NULL;
	u32 offset = 0;

	/* Read-repair requires the inode field to be set by the submitter. */
	ASSERT(inode);

	/*
	 * Hand off repair bios to the repair code as there is no upper level
	 * submitter for them.
	 */
	if (bbio->bio.bi_pool == &btrfs_repair_bioset) {
		btrfs_end_repair_bio(bbio, dev);
		return;
	}

	/* Clear the I/O error. A failed repair will reset it. */
	bbio->bio.bi_status = BLK_STS_OK;

	while (iter->bi_size) {
		struct bio_vec bv = bio_iter_iovec(&bbio->bio, *iter);

		bv.bv_len = min(bv.bv_len, sectorsize);
		if (status || !btrfs_data_csum_ok(bbio, dev, offset, &bv))
			fbio = repair_one_sector(bbio, offset, &bv, fbio);

		bio_advance_iter_single(&bbio->bio, iter, sectorsize);
		offset += sectorsize;
	}

	if (bbio->csum != bbio->csum_inline)
		kfree(bbio->csum);

	if (fbio)
		btrfs_repair_done(fbio);
	else
		btrfs_orig_bbio_end_io(bbio);
}
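
/*
 * The loop above deliberately walks bbio->saved_iter (captured at submit
 * time) rather than bio->bi_iter, as the block layer may have consumed
 * bi_iter while the I/O was in flight.  Each sectorsize chunk is verified
 * separately, so a single bad sector only triggers repair for itself.
 */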

static void btrfs_log_dev_io_error(struct bio *bio, struct btrfs_device *dev)
{
	if (!dev || !dev->bdev)
		return;
	if (bio->bi_status != BLK_STS_IOERR && bio->bi_status != BLK_STS_TARGET)
		return;

	if (btrfs_op(bio) == BTRFS_MAP_WRITE)
		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
	else if (!(bio->bi_opf & REQ_RAHEAD))
		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
	if (bio->bi_opf & REQ_PREFLUSH)
		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_FLUSH_ERRS);
}

static struct workqueue_struct *btrfs_end_io_wq(struct btrfs_fs_info *fs_info,
						struct bio *bio)
{
	if (bio->bi_opf & REQ_META)
		return fs_info->endio_meta_workers;
	return fs_info->endio_workers;
}

static void btrfs_end_bio_work(struct work_struct *work)
{
	struct btrfs_bio *bbio = container_of(work, struct btrfs_bio, end_io_work);

	/* Metadata reads are checked and repaired by the submitter. */
	if (is_data_bbio(bbio))
		btrfs_check_read_bio(bbio, bbio->bio.bi_private);
	else
		btrfs_orig_bbio_end_io(bbio);
}

static void btrfs_simple_end_io(struct bio *bio)
{
	struct btrfs_bio *bbio = btrfs_bio(bio);
	struct btrfs_device *dev = bio->bi_private;
	struct btrfs_fs_info *fs_info = bbio->fs_info;

	btrfs_bio_counter_dec(fs_info);

	if (bio->bi_status)
		btrfs_log_dev_io_error(bio, dev);

	if (bio_op(bio) == REQ_OP_READ) {
		INIT_WORK(&bbio->end_io_work, btrfs_end_bio_work);
		queue_work(btrfs_end_io_wq(fs_info, bio), &bbio->end_io_work);
	} else {
		if (bio_op(bio) == REQ_OP_ZONE_APPEND && !bio->bi_status)
			btrfs_record_physical_zoned(bbio);
		btrfs_orig_bbio_end_io(bbio);
	}
}

static void btrfs_raid56_end_io(struct bio *bio)
{
	struct btrfs_io_context *bioc = bio->bi_private;
	struct btrfs_bio *bbio = btrfs_bio(bio);

	btrfs_bio_counter_dec(bioc->fs_info);
	bbio->mirror_num = bioc->mirror_num;
	if (bio_op(bio) == REQ_OP_READ && is_data_bbio(bbio))
		btrfs_check_read_bio(bbio, NULL);
	else
		btrfs_orig_bbio_end_io(bbio);

	btrfs_put_bioc(bioc);
}

static void btrfs_orig_write_end_io(struct bio *bio)
{
	struct btrfs_io_stripe *stripe = bio->bi_private;
	struct btrfs_io_context *bioc = stripe->bioc;
	struct btrfs_bio *bbio = btrfs_bio(bio);

	btrfs_bio_counter_dec(bioc->fs_info);

	if (bio->bi_status) {
		atomic_inc(&bioc->error);
		btrfs_log_dev_io_error(bio, stripe->dev);
	}

	/*
	 * Only send an error to the higher layers if it is beyond the tolerance
	 * threshold.
	 */
	if (atomic_read(&bioc->error) > bioc->max_errors)
		bio->bi_status = BLK_STS_IOERR;
	else
		bio->bi_status = BLK_STS_OK;

	if (bio_op(bio) == REQ_OP_ZONE_APPEND && !bio->bi_status)
		stripe->physical = bio->bi_iter.bi_sector << SECTOR_SHIFT;

	btrfs_orig_bbio_end_io(bbio);
	btrfs_put_bioc(bioc);
}

static void btrfs_clone_write_end_io(struct bio *bio)
{
	struct btrfs_io_stripe *stripe = bio->bi_private;

	if (bio->bi_status) {
		atomic_inc(&stripe->bioc->error);
		btrfs_log_dev_io_error(bio, stripe->dev);
	} else if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
		stripe->physical = bio->bi_iter.bi_sector << SECTOR_SHIFT;
	}

	/* Pass on control to the original bio this one was cloned from */
	bio_endio(stripe->bioc->orig_bio);
	bio_put(bio);
}

static void btrfs_submit_dev_bio(struct btrfs_device *dev, struct bio *bio)
{
	if (!dev || !dev->bdev ||
	    test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) ||
	    (btrfs_op(bio) == BTRFS_MAP_WRITE &&
	     !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) {
		bio_io_error(bio);
		return;
	}

	bio_set_dev(bio, dev->bdev);

	/*
	 * For zone append writing, bi_sector must point to the beginning of
	 * the zone.
	 */
	if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
		u64 physical = bio->bi_iter.bi_sector << SECTOR_SHIFT;
		u64 zone_start = round_down(physical, dev->fs_info->zone_size);

		ASSERT(btrfs_dev_is_sequential(dev, physical));
		bio->bi_iter.bi_sector = zone_start >> SECTOR_SHIFT;
	}
	btrfs_debug_in_rcu(dev->fs_info,
	"%s: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u",
		__func__, bio_op(bio), bio->bi_opf, bio->bi_iter.bi_sector,
		(unsigned long)dev->bdev->bd_dev, btrfs_dev_name(dev),
		dev->devid, bio->bi_iter.bi_size);

	if (bio->bi_opf & REQ_BTRFS_CGROUP_PUNT)
		blkcg_punt_bio_submit(bio);
	else
		submit_bio(bio);
}

static void btrfs_submit_mirrored_bio(struct btrfs_io_context *bioc, int dev_nr)
{
	struct bio *orig_bio = bioc->orig_bio, *bio;

	ASSERT(bio_op(orig_bio) != REQ_OP_READ);

	/* Reuse the bio embedded into the btrfs_bio for the last mirror */
	if (dev_nr == bioc->num_stripes - 1) {
		bio = orig_bio;
		bio->bi_end_io = btrfs_orig_write_end_io;
	} else {
		bio = bio_alloc_clone(NULL, orig_bio, GFP_NOFS, &fs_bio_set);
		bio_inc_remaining(orig_bio);
		bio->bi_end_io = btrfs_clone_write_end_io;
	}

	bio->bi_private = &bioc->stripes[dev_nr];
	bio->bi_iter.bi_sector = bioc->stripes[dev_nr].physical >> SECTOR_SHIFT;
	bioc->stripes[dev_nr].bioc = bioc;
	bioc->size = bio->bi_iter.bi_size;
	btrfs_submit_dev_bio(bioc->stripes[dev_nr].dev, bio);
}

static void __btrfs_submit_bio(struct bio *bio, struct btrfs_io_context *bioc,
			       struct btrfs_io_stripe *smap, int mirror_num)
{
	if (!bioc) {
		/* Single mirror read/write fast path. */
		btrfs_bio(bio)->mirror_num = mirror_num;
		bio->bi_iter.bi_sector = smap->physical >> SECTOR_SHIFT;
		if (bio_op(bio) != REQ_OP_READ)
			btrfs_bio(bio)->orig_physical = smap->physical;
		bio->bi_private = smap->dev;
		bio->bi_end_io = btrfs_simple_end_io;
		btrfs_submit_dev_bio(smap->dev, bio);
	} else if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
		/* Parity RAID write or read recovery. */
		bio->bi_private = bioc;
		bio->bi_end_io = btrfs_raid56_end_io;
		if (bio_op(bio) == REQ_OP_READ)
			raid56_parity_recover(bio, bioc, mirror_num);
		else
			raid56_parity_write(bio, bioc);
	} else {
		/* Write to multiple mirrors. */
		int total_devs = bioc->num_stripes;

		bioc->orig_bio = bio;
		for (int dev_nr = 0; dev_nr < total_devs; dev_nr++)
			btrfs_submit_mirrored_bio(bioc, dev_nr);
	}
}
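
/*
 * Dispatch summary for __btrfs_submit_bio(): a NULL bioc means the logical
 * address mapped to a single stripe (smap) and the bio goes straight to
 * that device; RAID5/6 profiles are routed through the parity code; any
 * other multi-stripe profile fans the write out to all mirrors.
 */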

static blk_status_t btrfs_bio_csum(struct btrfs_bio *bbio)
{
	if (bbio->bio.bi_opf & REQ_META)
		return btree_csum_one_bio(bbio);
	return btrfs_csum_one_bio(bbio);
}

/*
 * Async submit bios are used to offload expensive checksumming onto the worker
 * threads.
 */
struct async_submit_bio {
	struct btrfs_bio *bbio;
	struct btrfs_io_context *bioc;
	struct btrfs_io_stripe smap;
	int mirror_num;
	struct btrfs_work work;
};

/*
 * In order to insert checksums into the metadata in large chunks, we wait
 * until bio submission time. All the pages in the bio are checksummed and
 * sums are attached onto the ordered extent record.
 *
 * At IO completion time the csums attached on the ordered extent record are
 * inserted into the btree.
 */
static void run_one_async_start(struct btrfs_work *work)
{
	struct async_submit_bio *async =
		container_of(work, struct async_submit_bio, work);
	blk_status_t ret;

	ret = btrfs_bio_csum(async->bbio);
	if (ret)
		async->bbio->bio.bi_status = ret;
}

/*
 * In order to insert checksums into the metadata in large chunks, we wait
 * until bio submission time. All the pages in the bio are checksummed and
 * sums are attached onto the ordered extent record.
 *
 * At IO completion time the csums attached on the ordered extent record are
 * inserted into the tree.
 *
 * If called with @do_free == true, then it will free the work struct.
 */
static void run_one_async_done(struct btrfs_work *work, bool do_free)
{
	struct async_submit_bio *async =
		container_of(work, struct async_submit_bio, work);
	struct bio *bio = &async->bbio->bio;

	if (do_free) {
		kfree(container_of(work, struct async_submit_bio, work));
		return;
	}

	/* If an error occurred we just want to clean up the bio and move on. */
	if (bio->bi_status) {
		btrfs_orig_bbio_end_io(async->bbio);
		return;
	}

	/*
	 * All of the bios that pass through here are from async helpers.
	 * Use REQ_BTRFS_CGROUP_PUNT to issue them from the owning cgroup's
	 * context. This changes nothing when cgroups aren't in use.
	 */
	bio->bi_opf |= REQ_BTRFS_CGROUP_PUNT;
	__btrfs_submit_bio(bio, async->bioc, &async->smap, async->mirror_num);
}

static bool should_async_write(struct btrfs_bio *bbio)
{
	/* Submit synchronously if the checksum implementation is fast. */
	if (test_bit(BTRFS_FS_CSUM_IMPL_FAST, &bbio->fs_info->flags))
		return false;

	/*
	 * Try to defer the submission to a workqueue to parallelize the
	 * checksum calculation unless the I/O is issued synchronously.
	 */
	if (op_is_sync(bbio->bio.bi_opf))
		return false;

	/* Zoned devices require I/O to be submitted in order. */
	if ((bbio->bio.bi_opf & REQ_META) && btrfs_is_zoned(bbio->fs_info))
		return false;

	return true;
}

/*
 * Submit bio to an async queue.
 *
 * Return true if the work has been successfully submitted, else false.
 */
static bool btrfs_wq_submit_bio(struct btrfs_bio *bbio,
				struct btrfs_io_context *bioc,
				struct btrfs_io_stripe *smap, int mirror_num)
{
	struct btrfs_fs_info *fs_info = bbio->fs_info;
	struct async_submit_bio *async;

	async = kmalloc(sizeof(*async), GFP_NOFS);
	if (!async)
		return false;

	async->bbio = bbio;
	async->bioc = bioc;
	async->smap = *smap;
	async->mirror_num = mirror_num;

	btrfs_init_work(&async->work, run_one_async_start, run_one_async_done);
	btrfs_queue_work(fs_info->workers, &async->work);
	return true;
}

static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num)
{
	struct btrfs_inode *inode = bbio->inode;
	struct btrfs_fs_info *fs_info = bbio->fs_info;
	struct btrfs_bio *orig_bbio = bbio;
	struct bio *bio = &bbio->bio;
	u64 logical = bio->bi_iter.bi_sector << SECTOR_SHIFT;
	u64 length = bio->bi_iter.bi_size;
	u64 map_length = length;
	bool use_append = btrfs_use_zone_append(bbio);
	struct btrfs_io_context *bioc = NULL;
	struct btrfs_io_stripe smap;
	blk_status_t ret;
	int error;

	smap.is_scrub = !bbio->inode;

	btrfs_bio_counter_inc_blocked(fs_info);
	error = btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length,
				&bioc, &smap, &mirror_num);
	if (error) {
		ret = errno_to_blk_status(error);
		goto fail;
	}

	map_length = min(map_length, length);
	if (use_append)
		map_length = min(map_length, fs_info->max_zone_append_size);

	if (map_length < length) {
		bbio = btrfs_split_bio(fs_info, bbio, map_length, use_append);
		bio = &bbio->bio;
	}

	/*
	 * Save the iter for the end_io handler and preload the checksums for
	 * data reads.
	 */
	if (bio_op(bio) == REQ_OP_READ && is_data_bbio(bbio)) {
		bbio->saved_iter = bio->bi_iter;
		ret = btrfs_lookup_bio_sums(bbio);
		if (ret)
			goto fail_put_bio;
	}

	if (btrfs_op(bio) == BTRFS_MAP_WRITE) {
		if (use_append) {
			bio->bi_opf &= ~REQ_OP_WRITE;
			bio->bi_opf |= REQ_OP_ZONE_APPEND;
		}

		if (is_data_bbio(bbio) && bioc &&
		    btrfs_need_stripe_tree_update(bioc->fs_info, bioc->map_type)) {
			/*
			 * No locking for the list update, as we only add to
			 * the list in the I/O submission path, and list
			 * iteration only happens in the completion path, which
			 * can't happen until after the last submission.
			 */
			btrfs_get_bioc(bioc);
			list_add_tail(&bioc->rst_ordered_entry, &bbio->ordered->bioc_list);
		}

		/*
		 * Csum items for reloc roots have already been cloned at this
		 * point, so they are handled as part of the no-checksum case.
		 */
		if (inode && !(inode->flags & BTRFS_INODE_NODATASUM) &&
		    !test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state) &&
		    !btrfs_is_data_reloc_root(inode->root)) {
			if (should_async_write(bbio) &&
			    btrfs_wq_submit_bio(bbio, bioc, &smap, mirror_num))
				goto done;

			ret = btrfs_bio_csum(bbio);
			if (ret)
				goto fail_put_bio;
		} else if (use_append) {
			ret = btrfs_alloc_dummy_sum(bbio);
			if (ret)
				goto fail_put_bio;
		}
	}

	__btrfs_submit_bio(bio, bioc, &smap, mirror_num);
done:
	return map_length == length;

fail_put_bio:
	if (map_length < length)
		btrfs_cleanup_bio(bbio);
fail:
	btrfs_bio_counter_dec(fs_info);
	btrfs_bio_end_io(orig_bbio, ret);
	/* Do not submit another chunk */
	return true;
}

void btrfs_submit_bio(struct btrfs_bio *bbio, int mirror_num)
{
	/* If bbio->inode is not populated, its file_offset must be 0. */
	ASSERT(bbio->inode || bbio->file_offset == 0);

	while (!btrfs_submit_chunk(bbio, mirror_num))
		;
}
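
/*
 * btrfs_submit_chunk() returns false whenever it had to split, i.e. the
 * mapped length was shorter than the bio.  The loop above then resubmits
 * the remainder (whose file_offset btrfs_split_bio() already advanced)
 * until the whole range has been issued.
 */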

/*
 * Submit a repair write.
 *
 * This bypasses btrfs_submit_bio deliberately, as that writes all copies in a
 * RAID setup. Here we only want to write the one bad copy, so we do the
 * mapping ourselves and submit the bio directly.
 *
 * The I/O is issued synchronously to block the repair read completion from
 * freeing the bio.
 */
int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
			    u64 length, u64 logical, struct folio *folio,
			    unsigned int folio_offset, int mirror_num)
{
	struct btrfs_io_stripe smap = { 0 };
	struct bio_vec bvec;
	struct bio bio;
	int ret = 0;

	ASSERT(!(fs_info->sb->s_flags & SB_RDONLY));
	BUG_ON(!mirror_num);

	if (btrfs_repair_one_zone(fs_info, logical))
		return 0;

	/*
	 * Avoid races with device replace and make sure our bioc has devices
	 * associated to its stripes that don't go away while we are doing the
	 * read repair operation.
	 */
	btrfs_bio_counter_inc_blocked(fs_info);
	ret = btrfs_map_repair_block(fs_info, &smap, logical, length, mirror_num);
	if (ret < 0)
		goto out_counter_dec;

	if (!smap.dev->bdev ||
	    !test_bit(BTRFS_DEV_STATE_WRITEABLE, &smap.dev->dev_state)) {
		ret = -EIO;
		goto out_counter_dec;
	}

	bio_init(&bio, smap.dev->bdev, &bvec, 1, REQ_OP_WRITE | REQ_SYNC);
	bio.bi_iter.bi_sector = smap.physical >> SECTOR_SHIFT;
	ret = bio_add_folio(&bio, folio, length, folio_offset);
	ASSERT(ret);
	ret = submit_bio_wait(&bio);
	if (ret) {
		/* try to remap that extent elsewhere? */
		btrfs_dev_stat_inc_and_print(smap.dev, BTRFS_DEV_STAT_WRITE_ERRS);
		goto out_bio_uninit;
	}

	btrfs_info_rl_in_rcu(fs_info,
		"read error corrected: ino %llu off %llu (dev %s sector %llu)",
		ino, start, btrfs_dev_name(smap.dev),
		smap.physical >> SECTOR_SHIFT);
	ret = 0;

out_bio_uninit:
	bio_uninit(&bio);
out_counter_dec:
	btrfs_bio_counter_dec(fs_info);
	return ret;
}

/*
 * Submit a btrfs_bio based repair write.
 *
 * If @dev_replace is true, the write is submitted to the dev-replace target
 * device instead of the source device.
 */
void btrfs_submit_repair_write(struct btrfs_bio *bbio, int mirror_num, bool dev_replace)
{
	struct btrfs_fs_info *fs_info = bbio->fs_info;
	u64 logical = bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT;
	u64 length = bbio->bio.bi_iter.bi_size;
	struct btrfs_io_stripe smap = { 0 };
	int ret;

	ASSERT(fs_info);
	ASSERT(mirror_num > 0);
	ASSERT(btrfs_op(&bbio->bio) == BTRFS_MAP_WRITE);
	ASSERT(!bbio->inode);

	btrfs_bio_counter_inc_blocked(fs_info);
	ret = btrfs_map_repair_block(fs_info, &smap, logical, length, mirror_num);
	if (ret < 0)
		goto fail;

	if (dev_replace) {
		ASSERT(smap.dev == fs_info->dev_replace.srcdev);
		smap.dev = fs_info->dev_replace.tgtdev;
	}
	__btrfs_submit_bio(&bbio->bio, NULL, &smap, mirror_num);
	return;

fail:
	btrfs_bio_counter_dec(fs_info);
	btrfs_bio_end_io(bbio, errno_to_blk_status(ret));
}

int __init btrfs_bioset_init(void)
{
	if (bioset_init(&btrfs_bioset, BIO_POOL_SIZE,
			offsetof(struct btrfs_bio, bio),
			BIOSET_NEED_BVECS))
		return -ENOMEM;
	if (bioset_init(&btrfs_clone_bioset, BIO_POOL_SIZE,
			offsetof(struct btrfs_bio, bio), 0))
		goto out_free_bioset;
	if (bioset_init(&btrfs_repair_bioset, BIO_POOL_SIZE,
			offsetof(struct btrfs_bio, bio),
			BIOSET_NEED_BVECS))
		goto out_free_clone_bioset;
	if (mempool_init_kmalloc_pool(&btrfs_failed_bio_pool, BIO_POOL_SIZE,
				      sizeof(struct btrfs_failed_bio)))
		goto out_free_repair_bioset;
	return 0;

out_free_repair_bioset:
	bioset_exit(&btrfs_repair_bioset);
out_free_clone_bioset:
	bioset_exit(&btrfs_clone_bioset);
out_free_bioset:
	bioset_exit(&btrfs_bioset);
	return -ENOMEM;
}

void __cold btrfs_bioset_exit(void)
{
	mempool_exit(&btrfs_failed_bio_pool);
	bioset_exit(&btrfs_repair_bioset);
	bioset_exit(&btrfs_clone_bioset);
	bioset_exit(&btrfs_bioset);
}