// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle. All rights reserved.
 * Copyright (C) 2022 Christoph Hellwig.
 */

#include <linux/bio.h>
#include "bio.h"
#include "ctree.h"
#include "volumes.h"
#include "raid56.h"
#include "async-thread.h"
#include "check-integrity.h"
#include "dev-replace.h"
#include "rcu-string.h"
#include "zoned.h"
#include "file-item.h"

static struct bio_set btrfs_bioset;
static struct bio_set btrfs_clone_bioset;
static struct bio_set btrfs_repair_bioset;
static mempool_t btrfs_failed_bio_pool;

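/*
 * Per-failed-read state shared by all sector repair attempts spawned from a
 * single failed read bio. repair_count acts as a reference count: it starts
 * at one, each in-flight repair read holds another reference, and the
 * original bbio is only completed (and this structure freed) once the last
 * reference is dropped in btrfs_repair_done().
 */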
struct btrfs_failed_bio {
        struct btrfs_bio *bbio;
        int num_copies;
        atomic_t repair_count;
};

/*
 * Initialize a btrfs_bio structure. This skips the embedded bio itself as it
 * is already initialized by the block layer.
 */
void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_inode *inode,
                    btrfs_bio_end_io_t end_io, void *private)
{
        memset(bbio, 0, offsetof(struct btrfs_bio, bio));
        bbio->inode = inode;
        bbio->end_io = end_io;
        bbio->private = private;
        atomic_set(&bbio->pending_ios, 1);
}

/*
 * Allocate a btrfs_bio structure. The btrfs_bio is the main I/O container for
 * btrfs, and is used for all I/O submitted through btrfs_submit_bio.
 *
 * Just like the underlying bio_alloc_bioset it will not fail as it is backed by
 * a mempool.
 */
struct btrfs_bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf,
                                  struct btrfs_inode *inode,
                                  btrfs_bio_end_io_t end_io, void *private)
{
        struct btrfs_bio *bbio;
        struct bio *bio;

        bio = bio_alloc_bioset(NULL, nr_vecs, opf, GFP_NOFS, &btrfs_bioset);
        bbio = btrfs_bio(bio);
        btrfs_bio_init(bbio, inode, end_io, private);
        return bbio;
}

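/*
 * Split off the first map_length bytes of a bio that is too large for a
 * single mirror/stripe. The split bio is allocated from btrfs_clone_bioset
 * and keeps a pointer to the original bbio in ->private, so that completion
 * of the fragment can be folded back into the original in
 * btrfs_orig_bbio_end_io(). Zone append bios go through bio_split_rw() so
 * the split also honors the queue limits.
 */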
static struct btrfs_bio *btrfs_split_bio(struct btrfs_fs_info *fs_info,
                                         struct btrfs_bio *orig_bbio,
                                         u64 map_length, bool use_append)
{
        struct btrfs_bio *bbio;
        struct bio *bio;

        if (use_append) {
                unsigned int nr_segs;

                bio = bio_split_rw(&orig_bbio->bio, &fs_info->limits, &nr_segs,
                                   &btrfs_clone_bioset, map_length);
        } else {
                bio = bio_split(&orig_bbio->bio, map_length >> SECTOR_SHIFT,
                                GFP_NOFS, &btrfs_clone_bioset);
        }
        bbio = btrfs_bio(bio);
        btrfs_bio_init(bbio, orig_bbio->inode, NULL, orig_bbio);

        bbio->file_offset = orig_bbio->file_offset;
        if (!(orig_bbio->bio.bi_opf & REQ_BTRFS_ONE_ORDERED))
                orig_bbio->file_offset += map_length;

        atomic_inc(&orig_bbio->pending_ios);
        return bbio;
}

static void btrfs_orig_write_end_io(struct bio *bio);

static void btrfs_bbio_propagate_error(struct btrfs_bio *bbio,
                                       struct btrfs_bio *orig_bbio)
{
        /*
         * For writes we tolerate nr_mirrors - 1 write failures, so we can't
         * just blindly propagate a write failure here. Instead increment the
         * error count in the original I/O context so that it is guaranteed to
         * be larger than the error tolerance.
         */
        if (bbio->bio.bi_end_io == &btrfs_orig_write_end_io) {
                struct btrfs_io_stripe *orig_stripe = orig_bbio->bio.bi_private;
                struct btrfs_io_context *orig_bioc = orig_stripe->bioc;

                atomic_add(orig_bioc->max_errors, &orig_bioc->error);
        } else {
                orig_bbio->bio.bi_status = bbio->bio.bi_status;
        }
}

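/*
 * End the original bbio once all split fragments, plus the submission path
 * itself (which holds the initial pending_ios reference), have completed.
 * An error on a cloned fragment is propagated to the original bbio first.
 */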
static void btrfs_orig_bbio_end_io(struct btrfs_bio *bbio)
{
        if (bbio->bio.bi_pool == &btrfs_clone_bioset) {
                struct btrfs_bio *orig_bbio = bbio->private;

                if (bbio->bio.bi_status)
                        btrfs_bbio_propagate_error(bbio, orig_bbio);
                bio_put(&bbio->bio);
                bbio = orig_bbio;
        }

        if (atomic_dec_and_test(&bbio->pending_ios))
                bbio->end_io(bbio);
}

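/*
 * Treat the mirror numbers [1, num_copies] as a ring: repair reads try each
 * mirror after the failed one in turn, and write-back of the recovered data
 * walks the ring backwards from the mirror that finally supplied good data.
 */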
static int next_repair_mirror(struct btrfs_failed_bio *fbio, int cur_mirror)
{
        if (cur_mirror == fbio->num_copies)
                return cur_mirror + 1 - fbio->num_copies;
        return cur_mirror + 1;
}

static int prev_repair_mirror(struct btrfs_failed_bio *fbio, int cur_mirror)
{
        if (cur_mirror == 1)
                return fbio->num_copies;
        return cur_mirror - 1;
}

static void btrfs_repair_done(struct btrfs_failed_bio *fbio)
{
        if (atomic_dec_and_test(&fbio->repair_count)) {
                btrfs_orig_bbio_end_io(fbio->bbio);
                mempool_free(fbio, &btrfs_failed_bio_pool);
        }
}

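/*
 * Completion handler for a single-sector repair read. If the read failed or
 * the checksum does not verify, resubmit to the next mirror in the ring.
 * Once a good copy is found, write it back to the previously tried mirrors,
 * walking backwards until the originally failed mirror has been rewritten.
 */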
static void btrfs_end_repair_bio(struct btrfs_bio *repair_bbio,
                                 struct btrfs_device *dev)
{
        struct btrfs_failed_bio *fbio = repair_bbio->private;
        struct btrfs_inode *inode = repair_bbio->inode;
        struct btrfs_fs_info *fs_info = inode->root->fs_info;
        struct bio_vec *bv = bio_first_bvec_all(&repair_bbio->bio);
        int mirror = repair_bbio->mirror_num;

        if (repair_bbio->bio.bi_status ||
            !btrfs_data_csum_ok(repair_bbio, dev, 0, bv)) {
                bio_reset(&repair_bbio->bio, NULL, REQ_OP_READ);
                repair_bbio->bio.bi_iter = repair_bbio->saved_iter;

                mirror = next_repair_mirror(fbio, mirror);
                if (mirror == fbio->bbio->mirror_num) {
                        btrfs_debug(fs_info, "no mirror left");
                        fbio->bbio->bio.bi_status = BLK_STS_IOERR;
                        goto done;
                }

                btrfs_submit_bio(repair_bbio, mirror);
                return;
        }

        do {
                mirror = prev_repair_mirror(fbio, mirror);
                btrfs_repair_io_failure(fs_info, btrfs_ino(inode),
                                repair_bbio->file_offset, fs_info->sectorsize,
                                repair_bbio->saved_iter.bi_sector << SECTOR_SHIFT,
                                bv->bv_page, bv->bv_offset, mirror);
        } while (mirror != fbio->bbio->mirror_num);

done:
        btrfs_repair_done(fbio);
        bio_put(&repair_bbio->bio);
}

/*
 * Try to kick off a repair read to the next available mirror for a bad sector.
 *
 * This primarily tries to recover good data to serve the actual read request,
 * but when a read succeeds it also writes the good data back to the bad
 * mirror(s) to restore the redundancy.
 */
static struct btrfs_failed_bio *repair_one_sector(struct btrfs_bio *failed_bbio,
                                                  u32 bio_offset,
                                                  struct bio_vec *bv,
                                                  struct btrfs_failed_bio *fbio)
{
        struct btrfs_inode *inode = failed_bbio->inode;
        struct btrfs_fs_info *fs_info = inode->root->fs_info;
        const u32 sectorsize = fs_info->sectorsize;
        const u64 logical = (failed_bbio->saved_iter.bi_sector << SECTOR_SHIFT);
        struct btrfs_bio *repair_bbio;
        struct bio *repair_bio;
        int num_copies;
        int mirror;

        btrfs_debug(fs_info, "repair read error: read error at %llu",
                    failed_bbio->file_offset + bio_offset);

        num_copies = btrfs_num_copies(fs_info, logical, sectorsize);
        if (num_copies == 1) {
                btrfs_debug(fs_info, "no copy to repair from");
                failed_bbio->bio.bi_status = BLK_STS_IOERR;
                return fbio;
        }

        if (!fbio) {
                fbio = mempool_alloc(&btrfs_failed_bio_pool, GFP_NOFS);
                fbio->bbio = failed_bbio;
                fbio->num_copies = num_copies;
                atomic_set(&fbio->repair_count, 1);
        }

        atomic_inc(&fbio->repair_count);

        repair_bio = bio_alloc_bioset(NULL, 1, REQ_OP_READ, GFP_NOFS,
                                      &btrfs_repair_bioset);
        repair_bio->bi_iter.bi_sector = failed_bbio->saved_iter.bi_sector;
        bio_add_page(repair_bio, bv->bv_page, bv->bv_len, bv->bv_offset);

        repair_bbio = btrfs_bio(repair_bio);
        btrfs_bio_init(repair_bbio, failed_bbio->inode, NULL, fbio);
        repair_bbio->file_offset = failed_bbio->file_offset + bio_offset;

        mirror = next_repair_mirror(fbio, failed_bbio->mirror_num);
        btrfs_debug(fs_info, "submitting repair read to mirror %d", mirror);
        btrfs_submit_bio(repair_bbio, mirror);
        return fbio;
}

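/*
 * Verify the checksum of every sector in a completed data read and kick off
 * repair reads for any sector that fails verification. This walks
 * saved_iter, which was captured at submission time, as the bio's own
 * iterator has been consumed by the time the I/O completes.
 */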
static void btrfs_check_read_bio(struct btrfs_bio *bbio, struct btrfs_device *dev)
{
        struct btrfs_inode *inode = bbio->inode;
        struct btrfs_fs_info *fs_info = inode->root->fs_info;
        u32 sectorsize = fs_info->sectorsize;
        struct bvec_iter *iter = &bbio->saved_iter;
        blk_status_t status = bbio->bio.bi_status;
        struct btrfs_failed_bio *fbio = NULL;
        u32 offset = 0;

        /*
         * Hand off repair bios to the repair code as there is no upper level
         * submitter for them.
         */
        if (bbio->bio.bi_pool == &btrfs_repair_bioset) {
                btrfs_end_repair_bio(bbio, dev);
                return;
        }

        /* Clear the I/O error. A failed repair will reset it. */
        bbio->bio.bi_status = BLK_STS_OK;

        while (iter->bi_size) {
                struct bio_vec bv = bio_iter_iovec(&bbio->bio, *iter);

                bv.bv_len = min(bv.bv_len, sectorsize);
                if (status || !btrfs_data_csum_ok(bbio, dev, offset, &bv))
                        fbio = repair_one_sector(bbio, offset, &bv, fbio);

                bio_advance_iter_single(&bbio->bio, iter, sectorsize);
                offset += sectorsize;
        }

        if (bbio->csum != bbio->csum_inline)
                kfree(bbio->csum);

        if (fbio)
                btrfs_repair_done(fbio);
        else
                btrfs_orig_bbio_end_io(bbio);
}

static void btrfs_log_dev_io_error(struct bio *bio, struct btrfs_device *dev)
{
        if (!dev || !dev->bdev)
                return;
        if (bio->bi_status != BLK_STS_IOERR && bio->bi_status != BLK_STS_TARGET)
                return;

        if (btrfs_op(bio) == BTRFS_MAP_WRITE)
                btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
        else if (!(bio->bi_opf & REQ_RAHEAD))
                btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
        if (bio->bi_opf & REQ_PREFLUSH)
                btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_FLUSH_ERRS);
}

static struct workqueue_struct *btrfs_end_io_wq(struct btrfs_fs_info *fs_info,
                                                struct bio *bio)
{
        if (bio->bi_opf & REQ_META)
                return fs_info->endio_meta_workers;
        return fs_info->endio_workers;
}

static void btrfs_end_bio_work(struct work_struct *work)
{
        struct btrfs_bio *bbio = container_of(work, struct btrfs_bio, end_io_work);

        /* Metadata reads are checked and repaired by the submitter. */
        if (bbio->bio.bi_opf & REQ_META)
                bbio->end_io(bbio);
        else
                btrfs_check_read_bio(bbio, bbio->bio.bi_private);
}

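/*
 * End I/O handler for the single-mirror fast path. Reads are punted to a
 * workqueue so that checksum verification and any repair submission run in
 * process context rather than in the bio completion context.
 */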
static void btrfs_simple_end_io(struct bio *bio)
{
        struct btrfs_bio *bbio = btrfs_bio(bio);
        struct btrfs_device *dev = bio->bi_private;
        struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info;

        btrfs_bio_counter_dec(fs_info);

        if (bio->bi_status)
                btrfs_log_dev_io_error(bio, dev);

        if (bio_op(bio) == REQ_OP_READ) {
                INIT_WORK(&bbio->end_io_work, btrfs_end_bio_work);
                queue_work(btrfs_end_io_wq(fs_info, bio), &bbio->end_io_work);
        } else {
                if (bio_op(bio) == REQ_OP_ZONE_APPEND)
                        btrfs_record_physical_zoned(bbio);
                btrfs_orig_bbio_end_io(bbio);
        }
}

static void btrfs_raid56_end_io(struct bio *bio)
{
        struct btrfs_io_context *bioc = bio->bi_private;
        struct btrfs_bio *bbio = btrfs_bio(bio);

        btrfs_bio_counter_dec(bioc->fs_info);
        bbio->mirror_num = bioc->mirror_num;
        if (bio_op(bio) == REQ_OP_READ && !(bbio->bio.bi_opf & REQ_META))
                btrfs_check_read_bio(bbio, NULL);
        else
                btrfs_orig_bbio_end_io(bbio);

        btrfs_put_bioc(bioc);
}

static void btrfs_orig_write_end_io(struct bio *bio)
{
        struct btrfs_io_stripe *stripe = bio->bi_private;
        struct btrfs_io_context *bioc = stripe->bioc;
        struct btrfs_bio *bbio = btrfs_bio(bio);

        btrfs_bio_counter_dec(bioc->fs_info);

        if (bio->bi_status) {
                atomic_inc(&bioc->error);
                btrfs_log_dev_io_error(bio, stripe->dev);
        }

        /*
         * Only send an error to the higher layers if it is beyond the tolerance
         * threshold.
         */
        if (atomic_read(&bioc->error) > bioc->max_errors)
                bio->bi_status = BLK_STS_IOERR;
        else
                bio->bi_status = BLK_STS_OK;

        btrfs_orig_bbio_end_io(bbio);
        btrfs_put_bioc(bioc);
}

static void btrfs_clone_write_end_io(struct bio *bio)
{
        struct btrfs_io_stripe *stripe = bio->bi_private;

        if (bio->bi_status) {
                atomic_inc(&stripe->bioc->error);
                btrfs_log_dev_io_error(bio, stripe->dev);
        }

        /* Pass on control to the original bio this one was cloned from. */
        bio_endio(stripe->bioc->orig_bio);
        bio_put(bio);
}

static void btrfs_submit_dev_bio(struct btrfs_device *dev, struct bio *bio)
{
        if (!dev || !dev->bdev ||
            test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) ||
            (btrfs_op(bio) == BTRFS_MAP_WRITE &&
             !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) {
                bio_io_error(bio);
                return;
        }

        bio_set_dev(bio, dev->bdev);

        /*
         * For zone append writing, bi_sector must point to the beginning of
         * the zone.
         */
        if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
                u64 physical = bio->bi_iter.bi_sector << SECTOR_SHIFT;
                u64 zone_start = round_down(physical, dev->fs_info->zone_size);

                ASSERT(btrfs_dev_is_sequential(dev, physical));
                bio->bi_iter.bi_sector = zone_start >> SECTOR_SHIFT;
        }
        btrfs_debug_in_rcu(dev->fs_info,
        "%s: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u",
                __func__, bio_op(bio), bio->bi_opf, bio->bi_iter.bi_sector,
                (unsigned long)dev->bdev->bd_dev, btrfs_dev_name(dev),
                dev->devid, bio->bi_iter.bi_size);

        btrfsic_check_bio(bio);
        submit_bio(bio);
}

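/*
 * Submit one copy of a mirrored write. Every mirror except the last gets a
 * clone of the original bio; the last mirror reuses the bio embedded in the
 * btrfs_bio, and its completion handler accounts the overall result against
 * the error tolerance.
 */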
static void btrfs_submit_mirrored_bio(struct btrfs_io_context *bioc, int dev_nr)
{
        struct bio *orig_bio = bioc->orig_bio, *bio;

        ASSERT(bio_op(orig_bio) != REQ_OP_READ);

        /* Reuse the bio embedded into the btrfs_bio for the last mirror. */
        if (dev_nr == bioc->num_stripes - 1) {
                bio = orig_bio;
                bio->bi_end_io = btrfs_orig_write_end_io;
        } else {
                bio = bio_alloc_clone(NULL, orig_bio, GFP_NOFS, &fs_bio_set);
                bio_inc_remaining(orig_bio);
                bio->bi_end_io = btrfs_clone_write_end_io;
        }

        bio->bi_private = &bioc->stripes[dev_nr];
        bio->bi_iter.bi_sector = bioc->stripes[dev_nr].physical >> SECTOR_SHIFT;
        bioc->stripes[dev_nr].bioc = bioc;
        btrfs_submit_dev_bio(bioc->stripes[dev_nr].dev, bio);
}

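/*
 * Dispatch a fully mapped bio: either the single-mirror fast path, the
 * parity RAID path, or a fan-out write to all mirrors of the chunk.
 */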
static void __btrfs_submit_bio(struct bio *bio, struct btrfs_io_context *bioc,
                               struct btrfs_io_stripe *smap, int mirror_num)
{
        /* Do not leak our private flag into the block layer. */
        bio->bi_opf &= ~REQ_BTRFS_ONE_ORDERED;

        if (!bioc) {
                /* Single mirror read/write fast path. */
                btrfs_bio(bio)->mirror_num = mirror_num;
                bio->bi_iter.bi_sector = smap->physical >> SECTOR_SHIFT;
                bio->bi_private = smap->dev;
                bio->bi_end_io = btrfs_simple_end_io;
                btrfs_submit_dev_bio(smap->dev, bio);
        } else if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
                /* Parity RAID write or read recovery. */
                bio->bi_private = bioc;
                bio->bi_end_io = btrfs_raid56_end_io;
                if (bio_op(bio) == REQ_OP_READ)
                        raid56_parity_recover(bio, bioc, mirror_num);
                else
                        raid56_parity_write(bio, bioc);
        } else {
                /* Write to multiple mirrors. */
                int total_devs = bioc->num_stripes;

                bioc->orig_bio = bio;
                for (int dev_nr = 0; dev_nr < total_devs; dev_nr++)
                        btrfs_submit_mirrored_bio(bioc, dev_nr);
        }
}

static blk_status_t btrfs_bio_csum(struct btrfs_bio *bbio)
{
        if (bbio->bio.bi_opf & REQ_META)
                return btree_csum_one_bio(bbio);
        return btrfs_csum_one_bio(bbio);
}

/*
 * Async submit bios are used to offload expensive checksumming onto the worker
 * threads.
 */
struct async_submit_bio {
        struct btrfs_bio *bbio;
        struct btrfs_io_context *bioc;
        struct btrfs_io_stripe smap;
        int mirror_num;
        struct btrfs_work work;
};

/*
 * In order to insert checksums into the metadata in large chunks, we wait
 * until bio submission time. All the pages in the bio are checksummed and
 * sums are attached onto the ordered extent record.
 *
 * At IO completion time the csums attached on the ordered extent record are
 * inserted into the btree.
 */
static void run_one_async_start(struct btrfs_work *work)
{
        struct async_submit_bio *async =
                container_of(work, struct async_submit_bio, work);
        blk_status_t ret;

        ret = btrfs_bio_csum(async->bbio);
        if (ret)
                async->bbio->bio.bi_status = ret;
}

/*
 * Second half of the async submit path, run after run_one_async_start has
 * computed the checksums: submit the bio to the mapped devices, or end it
 * right away if checksumming already failed.
 */
static void run_one_async_done(struct btrfs_work *work)
{
        struct async_submit_bio *async =
                container_of(work, struct async_submit_bio, work);
        struct bio *bio = &async->bbio->bio;

        /* If an error occurred we just want to clean up the bio and move on. */
        if (bio->bi_status) {
                btrfs_orig_bbio_end_io(async->bbio);
                return;
        }

        /*
         * All of the bios that pass through here are from async helpers.
         * Use REQ_CGROUP_PUNT to issue them from the owning cgroup's context.
         * This changes nothing when cgroups aren't in use.
         */
        bio->bi_opf |= REQ_CGROUP_PUNT;
        __btrfs_submit_bio(bio, async->bioc, &async->smap, async->mirror_num);
}

static void run_one_async_free(struct btrfs_work *work)
{
        kfree(container_of(work, struct async_submit_bio, work));
}

static bool should_async_write(struct btrfs_bio *bbio)
{
        /*
         * I/O issued by fsync and friends is marked by a non-zero
         * ->sync_writers count and is submitted synchronously. For everything
         * else, try to defer the submission to a workqueue to parallelize the
         * checksum calculation.
         */
        if (atomic_read(&bbio->inode->sync_writers))
                return false;

        /*
         * Submit metadata writes synchronously if the checksum implementation
         * is fast, or we are on a zoned device that wants I/O to be submitted
         * in order.
         */
        if (bbio->bio.bi_opf & REQ_META) {
                struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info;

                if (btrfs_is_zoned(fs_info))
                        return false;
                if (test_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags))
                        return false;
        }

        return true;
}

/*
 * Submit bio to an async queue.
 *
 * Return true if the work has been successfully submitted, else false.
 */
static bool btrfs_wq_submit_bio(struct btrfs_bio *bbio,
                                struct btrfs_io_context *bioc,
                                struct btrfs_io_stripe *smap, int mirror_num)
{
        struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info;
        struct async_submit_bio *async;

        async = kmalloc(sizeof(*async), GFP_NOFS);
        if (!async)
                return false;

        async->bbio = bbio;
        async->bioc = bioc;
        async->smap = *smap;
        async->mirror_num = mirror_num;

        btrfs_init_work(&async->work, run_one_async_start, run_one_async_done,
                        run_one_async_free);
        if (op_is_sync(bbio->bio.bi_opf))
                btrfs_queue_work(fs_info->hipri_workers, &async->work);
        else
                btrfs_queue_work(fs_info->workers, &async->work);
        return true;
}

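/*
 * Map and submit the part of a bbio that fits into a single mirror or stripe.
 * Returns true if the whole bio was consumed (or failed terminally), false
 * if a remainder was split off and the caller must submit another chunk.
 */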
static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num)
{
        struct btrfs_inode *inode = bbio->inode;
        struct btrfs_fs_info *fs_info = inode->root->fs_info;
        struct btrfs_bio *orig_bbio = bbio;
        struct bio *bio = &bbio->bio;
        u64 logical = bio->bi_iter.bi_sector << 9;
        u64 length = bio->bi_iter.bi_size;
        u64 map_length = length;
        bool use_append = btrfs_use_zone_append(bbio);
        struct btrfs_io_context *bioc = NULL;
        struct btrfs_io_stripe smap;
        blk_status_t ret;
        int error;

        btrfs_bio_counter_inc_blocked(fs_info);
        error = __btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length,
                                  &bioc, &smap, &mirror_num, 1);
        if (error) {
                ret = errno_to_blk_status(error);
                goto fail;
        }

        map_length = min(map_length, length);
        if (use_append)
                map_length = min(map_length, fs_info->max_zone_append_size);

        if (map_length < length) {
                bbio = btrfs_split_bio(fs_info, bbio, map_length, use_append);
                bio = &bbio->bio;
        }

        /*
         * Save the iter for the end_io handler and preload the checksums for
         * data reads.
         */
        if (bio_op(bio) == REQ_OP_READ && !(bio->bi_opf & REQ_META)) {
                bbio->saved_iter = bio->bi_iter;
                ret = btrfs_lookup_bio_sums(bbio);
                if (ret)
                        goto fail_put_bio;
        }

        if (btrfs_op(bio) == BTRFS_MAP_WRITE) {
                if (use_append) {
                        bio->bi_opf &= ~REQ_OP_WRITE;
                        bio->bi_opf |= REQ_OP_ZONE_APPEND;
                        ret = btrfs_extract_ordered_extent(bbio);
                        if (ret)
                                goto fail_put_bio;
                }

                /*
                 * Csum items for reloc roots have already been cloned at this
                 * point, so they are handled as part of the no-checksum case.
                 */
                if (!(inode->flags & BTRFS_INODE_NODATASUM) &&
                    !test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state) &&
                    !btrfs_is_data_reloc_root(inode->root)) {
                        if (should_async_write(bbio) &&
                            btrfs_wq_submit_bio(bbio, bioc, &smap, mirror_num))
                                goto done;

                        ret = btrfs_bio_csum(bbio);
                        if (ret)
                                goto fail_put_bio;
                }
        }

        __btrfs_submit_bio(bio, bioc, &smap, mirror_num);
done:
        return map_length == length;

fail_put_bio:
        if (map_length < length)
                bio_put(bio);
fail:
        btrfs_bio_counter_dec(fs_info);
        btrfs_bio_end_io(orig_bbio, ret);
        /* Do not submit another chunk. */
        return true;
}

void btrfs_submit_bio(struct btrfs_bio *bbio, int mirror_num)
{
        while (!btrfs_submit_chunk(bbio, mirror_num))
                ;
}

/*
 * Submit a repair write.
 *
 * This bypasses btrfs_submit_bio deliberately, as that writes all copies in a
 * RAID setup. Here we only want to write the one bad copy, so we do the
 * mapping ourselves and submit the bio directly.
 *
 * The I/O is issued synchronously to block the repair read completion from
 * freeing the bio.
 */
int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
                            u64 length, u64 logical, struct page *page,
                            unsigned int pg_offset, int mirror_num)
{
        struct btrfs_device *dev;
        struct bio_vec bvec;
        struct bio bio;
        u64 map_length = 0;
        u64 sector;
        struct btrfs_io_context *bioc = NULL;
        int ret = 0;

        ASSERT(!(fs_info->sb->s_flags & SB_RDONLY));
        BUG_ON(!mirror_num);

        if (btrfs_repair_one_zone(fs_info, logical))
                return 0;

        map_length = length;

        /*
         * Avoid races with device replace and make sure our bioc has devices
         * associated to its stripes that don't go away while we are doing the
         * read repair operation.
         */
        btrfs_bio_counter_inc_blocked(fs_info);
        if (btrfs_is_parity_mirror(fs_info, logical, length)) {
                /*
                 * Note that we don't use BTRFS_MAP_WRITE because it's supposed
                 * to update all raid stripes, but here we just want to correct
                 * the bad stripe, thus BTRFS_MAP_READ is abused to only get
                 * the bad stripe's dev and sector.
                 */
                ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical,
                                      &map_length, &bioc, 0);
                if (ret)
                        goto out_counter_dec;
                ASSERT(bioc->mirror_num == 1);
        } else {
                ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical,
                                      &map_length, &bioc, mirror_num);
                if (ret)
                        goto out_counter_dec;
                /*
                 * This happens when dev-replace is also running, and the
                 * mirror_num indicates the dev-replace target.
                 *
                 * In this case, we don't need to do anything, as the read
                 * error just means the replace progress hasn't reached our
                 * read range, and the replace routine will handle it later.
                 */
                if (mirror_num != bioc->mirror_num)
                        goto out_counter_dec;
        }

        sector = bioc->stripes[bioc->mirror_num - 1].physical >> 9;
        dev = bioc->stripes[bioc->mirror_num - 1].dev;
        btrfs_put_bioc(bioc);

        if (!dev || !dev->bdev ||
            !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) {
                ret = -EIO;
                goto out_counter_dec;
        }

        bio_init(&bio, dev->bdev, &bvec, 1, REQ_OP_WRITE | REQ_SYNC);
        bio.bi_iter.bi_sector = sector;
        __bio_add_page(&bio, page, length, pg_offset);

        btrfsic_check_bio(&bio);
        ret = submit_bio_wait(&bio);
        if (ret) {
                /* Try to remap that extent elsewhere? */
                btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
                goto out_bio_uninit;
        }

        btrfs_info_rl_in_rcu(fs_info,
                "read error corrected: ino %llu off %llu (dev %s sector %llu)",
                             ino, start, btrfs_dev_name(dev), sector);
        ret = 0;

out_bio_uninit:
        bio_uninit(&bio);
out_counter_dec:
        btrfs_bio_counter_dec(fs_info);
        return ret;
}

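/*
 * All three bio_sets use offsetof(struct btrfs_bio, bio) as front padding so
 * that every allocated bio sits at the tail of a btrfs_bio container. The
 * clone bioset does not need its own bvecs, as split bios reference the
 * parent's pages.
 */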
int __init btrfs_bioset_init(void)
{
        if (bioset_init(&btrfs_bioset, BIO_POOL_SIZE,
                        offsetof(struct btrfs_bio, bio),
                        BIOSET_NEED_BVECS))
                return -ENOMEM;
        if (bioset_init(&btrfs_clone_bioset, BIO_POOL_SIZE,
                        offsetof(struct btrfs_bio, bio), 0))
                goto out_free_bioset;
        if (bioset_init(&btrfs_repair_bioset, BIO_POOL_SIZE,
                        offsetof(struct btrfs_bio, bio),
                        BIOSET_NEED_BVECS))
                goto out_free_clone_bioset;
        if (mempool_init_kmalloc_pool(&btrfs_failed_bio_pool, BIO_POOL_SIZE,
                                      sizeof(struct btrfs_failed_bio)))
                goto out_free_repair_bioset;
        return 0;

out_free_repair_bioset:
        bioset_exit(&btrfs_repair_bioset);
out_free_clone_bioset:
        bioset_exit(&btrfs_clone_bioset);
out_free_bioset:
        bioset_exit(&btrfs_bioset);
        return -ENOMEM;
}

void __cold btrfs_bioset_exit(void)
{
        mempool_exit(&btrfs_failed_bio_pool);
        bioset_exit(&btrfs_repair_bioset);
        bioset_exit(&btrfs_clone_bioset);
        bioset_exit(&btrfs_bioset);
}