]>
Commit | Line | Data |
---|---|---|
103c1972 CH |
1 | // SPDX-License-Identifier: GPL-2.0 |
2 | /* | |
3 | * Copyright (C) 2007 Oracle. All rights reserved. | |
4 | * Copyright (C) 2022 Christoph Hellwig. | |
5 | */ | |
6 | ||
7 | #include <linux/bio.h> | |
8 | #include "bio.h" | |
9 | #include "ctree.h" | |
10 | #include "volumes.h" | |
11 | #include "raid56.h" | |
12 | #include "async-thread.h" | |
13 | #include "check-integrity.h" | |
14 | #include "dev-replace.h" | |
15 | #include "rcu-string.h" | |
16 | #include "zoned.h" | |
17 | ||
18 | static struct bio_set btrfs_bioset; | |
19 | ||
20 | /* | |
21 | * Initialize a btrfs_bio structure. This skips the embedded bio itself as it | |
22 | * is already initialized by the block layer. | |
23 | */ | |
24 | static inline void btrfs_bio_init(struct btrfs_bio *bbio, | |
25 | btrfs_bio_end_io_t end_io, void *private) | |
26 | { | |
27 | memset(bbio, 0, offsetof(struct btrfs_bio, bio)); | |
28 | bbio->end_io = end_io; | |
29 | bbio->private = private; | |
30 | } | |
31 | ||
32 | /* | |
33 | * Allocate a btrfs_bio structure. The btrfs_bio is the main I/O container for | |
34 | * btrfs, and is used for all I/O submitted through btrfs_submit_bio. | |
35 | * | |
36 | * Just like the underlying bio_alloc_bioset it will not fail as it is backed by | |
37 | * a mempool. | |
38 | */ | |
39 | struct bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf, | |
40 | btrfs_bio_end_io_t end_io, void *private) | |
41 | { | |
42 | struct bio *bio; | |
43 | ||
44 | bio = bio_alloc_bioset(NULL, nr_vecs, opf, GFP_NOFS, &btrfs_bioset); | |
45 | btrfs_bio_init(btrfs_bio(bio), end_io, private); | |
46 | return bio; | |
47 | } | |
48 | ||
49 | struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size, | |
50 | btrfs_bio_end_io_t end_io, void *private) | |
51 | { | |
52 | struct bio *bio; | |
53 | struct btrfs_bio *bbio; | |
54 | ||
55 | ASSERT(offset <= UINT_MAX && size <= UINT_MAX); | |
56 | ||
57 | bio = bio_alloc_clone(orig->bi_bdev, orig, GFP_NOFS, &btrfs_bioset); | |
58 | bbio = btrfs_bio(bio); | |
59 | btrfs_bio_init(bbio, end_io, private); | |
60 | ||
61 | bio_trim(bio, offset >> 9, size >> 9); | |
62 | bbio->iter = bio->bi_iter; | |
63 | return bio; | |
64 | } | |
65 | ||
66 | static void btrfs_log_dev_io_error(struct bio *bio, struct btrfs_device *dev) | |
67 | { | |
68 | if (!dev || !dev->bdev) | |
69 | return; | |
70 | if (bio->bi_status != BLK_STS_IOERR && bio->bi_status != BLK_STS_TARGET) | |
71 | return; | |
72 | ||
73 | if (btrfs_op(bio) == BTRFS_MAP_WRITE) | |
74 | btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS); | |
75 | if (!(bio->bi_opf & REQ_RAHEAD)) | |
76 | btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS); | |
77 | if (bio->bi_opf & REQ_PREFLUSH) | |
78 | btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_FLUSH_ERRS); | |
79 | } | |
80 | ||
81 | static struct workqueue_struct *btrfs_end_io_wq(struct btrfs_fs_info *fs_info, | |
82 | struct bio *bio) | |
83 | { | |
84 | if (bio->bi_opf & REQ_META) | |
85 | return fs_info->endio_meta_workers; | |
86 | return fs_info->endio_workers; | |
87 | } | |
88 | ||
89 | static void btrfs_end_bio_work(struct work_struct *work) | |
90 | { | |
91 | struct btrfs_bio *bbio = container_of(work, struct btrfs_bio, end_io_work); | |
92 | ||
93 | bbio->end_io(bbio); | |
94 | } | |
95 | ||
96 | static void btrfs_simple_end_io(struct bio *bio) | |
97 | { | |
98 | struct btrfs_fs_info *fs_info = bio->bi_private; | |
99 | struct btrfs_bio *bbio = btrfs_bio(bio); | |
100 | ||
101 | btrfs_bio_counter_dec(fs_info); | |
102 | ||
103 | if (bio->bi_status) | |
104 | btrfs_log_dev_io_error(bio, bbio->device); | |
105 | ||
106 | if (bio_op(bio) == REQ_OP_READ) { | |
107 | INIT_WORK(&bbio->end_io_work, btrfs_end_bio_work); | |
108 | queue_work(btrfs_end_io_wq(fs_info, bio), &bbio->end_io_work); | |
109 | } else { | |
110 | bbio->end_io(bbio); | |
111 | } | |
112 | } | |
113 | ||
114 | static void btrfs_raid56_end_io(struct bio *bio) | |
115 | { | |
116 | struct btrfs_io_context *bioc = bio->bi_private; | |
117 | struct btrfs_bio *bbio = btrfs_bio(bio); | |
118 | ||
119 | btrfs_bio_counter_dec(bioc->fs_info); | |
120 | bbio->mirror_num = bioc->mirror_num; | |
121 | bbio->end_io(bbio); | |
122 | ||
123 | btrfs_put_bioc(bioc); | |
124 | } | |
125 | ||
126 | static void btrfs_orig_write_end_io(struct bio *bio) | |
127 | { | |
128 | struct btrfs_io_stripe *stripe = bio->bi_private; | |
129 | struct btrfs_io_context *bioc = stripe->bioc; | |
130 | struct btrfs_bio *bbio = btrfs_bio(bio); | |
131 | ||
132 | btrfs_bio_counter_dec(bioc->fs_info); | |
133 | ||
134 | if (bio->bi_status) { | |
135 | atomic_inc(&bioc->error); | |
136 | btrfs_log_dev_io_error(bio, stripe->dev); | |
137 | } | |
138 | ||
139 | /* | |
140 | * Only send an error to the higher layers if it is beyond the tolerance | |
141 | * threshold. | |
142 | */ | |
143 | if (atomic_read(&bioc->error) > bioc->max_errors) | |
144 | bio->bi_status = BLK_STS_IOERR; | |
145 | else | |
146 | bio->bi_status = BLK_STS_OK; | |
147 | ||
148 | bbio->end_io(bbio); | |
149 | btrfs_put_bioc(bioc); | |
150 | } | |
151 | ||
152 | static void btrfs_clone_write_end_io(struct bio *bio) | |
153 | { | |
154 | struct btrfs_io_stripe *stripe = bio->bi_private; | |
155 | ||
156 | if (bio->bi_status) { | |
157 | atomic_inc(&stripe->bioc->error); | |
158 | btrfs_log_dev_io_error(bio, stripe->dev); | |
159 | } | |
160 | ||
161 | /* Pass on control to the original bio this one was cloned from */ | |
162 | bio_endio(stripe->bioc->orig_bio); | |
163 | bio_put(bio); | |
164 | } | |
165 | ||
166 | static void btrfs_submit_dev_bio(struct btrfs_device *dev, struct bio *bio) | |
167 | { | |
168 | if (!dev || !dev->bdev || | |
169 | test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) || | |
170 | (btrfs_op(bio) == BTRFS_MAP_WRITE && | |
171 | !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) { | |
172 | bio_io_error(bio); | |
173 | return; | |
174 | } | |
175 | ||
176 | bio_set_dev(bio, dev->bdev); | |
177 | ||
178 | /* | |
179 | * For zone append writing, bi_sector must point the beginning of the | |
180 | * zone | |
181 | */ | |
182 | if (bio_op(bio) == REQ_OP_ZONE_APPEND) { | |
183 | u64 physical = bio->bi_iter.bi_sector << SECTOR_SHIFT; | |
184 | ||
185 | if (btrfs_dev_is_sequential(dev, physical)) { | |
186 | u64 zone_start = round_down(physical, | |
187 | dev->fs_info->zone_size); | |
188 | ||
189 | bio->bi_iter.bi_sector = zone_start >> SECTOR_SHIFT; | |
190 | } else { | |
191 | bio->bi_opf &= ~REQ_OP_ZONE_APPEND; | |
192 | bio->bi_opf |= REQ_OP_WRITE; | |
193 | } | |
194 | } | |
195 | btrfs_debug_in_rcu(dev->fs_info, | |
196 | "%s: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u", | |
197 | __func__, bio_op(bio), bio->bi_opf, bio->bi_iter.bi_sector, | |
198 | (unsigned long)dev->bdev->bd_dev, btrfs_dev_name(dev), | |
199 | dev->devid, bio->bi_iter.bi_size); | |
200 | ||
201 | btrfsic_check_bio(bio); | |
202 | submit_bio(bio); | |
203 | } | |
204 | ||
205 | static void btrfs_submit_mirrored_bio(struct btrfs_io_context *bioc, int dev_nr) | |
206 | { | |
207 | struct bio *orig_bio = bioc->orig_bio, *bio; | |
208 | ||
209 | ASSERT(bio_op(orig_bio) != REQ_OP_READ); | |
210 | ||
211 | /* Reuse the bio embedded into the btrfs_bio for the last mirror */ | |
212 | if (dev_nr == bioc->num_stripes - 1) { | |
213 | bio = orig_bio; | |
214 | bio->bi_end_io = btrfs_orig_write_end_io; | |
215 | } else { | |
216 | bio = bio_alloc_clone(NULL, orig_bio, GFP_NOFS, &fs_bio_set); | |
217 | bio_inc_remaining(orig_bio); | |
218 | bio->bi_end_io = btrfs_clone_write_end_io; | |
219 | } | |
220 | ||
221 | bio->bi_private = &bioc->stripes[dev_nr]; | |
222 | bio->bi_iter.bi_sector = bioc->stripes[dev_nr].physical >> SECTOR_SHIFT; | |
223 | bioc->stripes[dev_nr].bioc = bioc; | |
224 | btrfs_submit_dev_bio(bioc->stripes[dev_nr].dev, bio); | |
225 | } | |
226 | ||
227 | void btrfs_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, int mirror_num) | |
228 | { | |
229 | u64 logical = bio->bi_iter.bi_sector << 9; | |
230 | u64 length = bio->bi_iter.bi_size; | |
231 | u64 map_length = length; | |
232 | struct btrfs_io_context *bioc = NULL; | |
233 | struct btrfs_io_stripe smap; | |
234 | int ret; | |
235 | ||
236 | btrfs_bio_counter_inc_blocked(fs_info); | |
237 | ret = __btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length, | |
238 | &bioc, &smap, &mirror_num, 1); | |
239 | if (ret) { | |
240 | btrfs_bio_counter_dec(fs_info); | |
241 | btrfs_bio_end_io(btrfs_bio(bio), errno_to_blk_status(ret)); | |
242 | return; | |
243 | } | |
244 | ||
245 | if (map_length < length) { | |
246 | btrfs_crit(fs_info, | |
247 | "mapping failed logical %llu bio len %llu len %llu", | |
248 | logical, length, map_length); | |
249 | BUG(); | |
250 | } | |
251 | ||
252 | if (!bioc) { | |
253 | /* Single mirror read/write fast path */ | |
254 | btrfs_bio(bio)->mirror_num = mirror_num; | |
255 | btrfs_bio(bio)->device = smap.dev; | |
256 | bio->bi_iter.bi_sector = smap.physical >> SECTOR_SHIFT; | |
257 | bio->bi_private = fs_info; | |
258 | bio->bi_end_io = btrfs_simple_end_io; | |
259 | btrfs_submit_dev_bio(smap.dev, bio); | |
260 | } else if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) { | |
261 | /* Parity RAID write or read recovery */ | |
262 | bio->bi_private = bioc; | |
263 | bio->bi_end_io = btrfs_raid56_end_io; | |
264 | if (bio_op(bio) == REQ_OP_READ) | |
265 | raid56_parity_recover(bio, bioc, mirror_num); | |
266 | else | |
267 | raid56_parity_write(bio, bioc); | |
268 | } else { | |
269 | /* Write to multiple mirrors */ | |
270 | int total_devs = bioc->num_stripes; | |
271 | int dev_nr; | |
272 | ||
273 | bioc->orig_bio = bio; | |
274 | for (dev_nr = 0; dev_nr < total_devs; dev_nr++) | |
275 | btrfs_submit_mirrored_bio(bioc, dev_nr); | |
276 | } | |
277 | } | |
278 | ||
bacf60e5 CH |
279 | /* |
280 | * Submit a repair write. | |
281 | * | |
282 | * This bypasses btrfs_submit_bio deliberately, as that writes all copies in a | |
283 | * RAID setup. Here we only want to write the one bad copy, so we do the | |
284 | * mapping ourselves and submit the bio directly. | |
285 | * | |
286 | * The I/O is issued sychronously to block the repair read completion from | |
287 | * freeing the bio. | |
288 | */ | |
289 | int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, | |
290 | u64 length, u64 logical, struct page *page, | |
291 | unsigned int pg_offset, int mirror_num) | |
292 | { | |
293 | struct btrfs_device *dev; | |
294 | struct bio_vec bvec; | |
295 | struct bio bio; | |
296 | u64 map_length = 0; | |
297 | u64 sector; | |
298 | struct btrfs_io_context *bioc = NULL; | |
299 | int ret = 0; | |
300 | ||
301 | ASSERT(!(fs_info->sb->s_flags & SB_RDONLY)); | |
302 | BUG_ON(!mirror_num); | |
303 | ||
304 | if (btrfs_repair_one_zone(fs_info, logical)) | |
305 | return 0; | |
306 | ||
307 | map_length = length; | |
308 | ||
309 | /* | |
310 | * Avoid races with device replace and make sure our bioc has devices | |
311 | * associated to its stripes that don't go away while we are doing the | |
312 | * read repair operation. | |
313 | */ | |
314 | btrfs_bio_counter_inc_blocked(fs_info); | |
315 | if (btrfs_is_parity_mirror(fs_info, logical, length)) { | |
316 | /* | |
317 | * Note that we don't use BTRFS_MAP_WRITE because it's supposed | |
318 | * to update all raid stripes, but here we just want to correct | |
319 | * bad stripe, thus BTRFS_MAP_READ is abused to only get the bad | |
320 | * stripe's dev and sector. | |
321 | */ | |
322 | ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical, | |
323 | &map_length, &bioc, 0); | |
324 | if (ret) | |
325 | goto out_counter_dec; | |
326 | ASSERT(bioc->mirror_num == 1); | |
327 | } else { | |
328 | ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical, | |
329 | &map_length, &bioc, mirror_num); | |
330 | if (ret) | |
331 | goto out_counter_dec; | |
d73a27b8 QW |
332 | /* |
333 | * This happens when dev-replace is also running, and the | |
334 | * mirror_num indicates the dev-replace target. | |
335 | * | |
336 | * In this case, we don't need to do anything, as the read | |
337 | * error just means the replace progress hasn't reached our | |
338 | * read range, and later replace routine would handle it well. | |
339 | */ | |
340 | if (mirror_num != bioc->mirror_num) | |
341 | goto out_counter_dec; | |
bacf60e5 CH |
342 | } |
343 | ||
344 | sector = bioc->stripes[bioc->mirror_num - 1].physical >> 9; | |
345 | dev = bioc->stripes[bioc->mirror_num - 1].dev; | |
346 | btrfs_put_bioc(bioc); | |
347 | ||
348 | if (!dev || !dev->bdev || | |
349 | !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) { | |
350 | ret = -EIO; | |
351 | goto out_counter_dec; | |
352 | } | |
353 | ||
354 | bio_init(&bio, dev->bdev, &bvec, 1, REQ_OP_WRITE | REQ_SYNC); | |
355 | bio.bi_iter.bi_sector = sector; | |
356 | __bio_add_page(&bio, page, length, pg_offset); | |
357 | ||
358 | btrfsic_check_bio(&bio); | |
359 | ret = submit_bio_wait(&bio); | |
360 | if (ret) { | |
361 | /* try to remap that extent elsewhere? */ | |
362 | btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS); | |
363 | goto out_bio_uninit; | |
364 | } | |
365 | ||
366 | btrfs_info_rl_in_rcu(fs_info, | |
367 | "read error corrected: ino %llu off %llu (dev %s sector %llu)", | |
368 | ino, start, btrfs_dev_name(dev), sector); | |
369 | ret = 0; | |
370 | ||
371 | out_bio_uninit: | |
372 | bio_uninit(&bio); | |
373 | out_counter_dec: | |
374 | btrfs_bio_counter_dec(fs_info); | |
375 | return ret; | |
376 | } | |
377 | ||
103c1972 CH |
378 | int __init btrfs_bioset_init(void) |
379 | { | |
380 | if (bioset_init(&btrfs_bioset, BIO_POOL_SIZE, | |
381 | offsetof(struct btrfs_bio, bio), | |
382 | BIOSET_NEED_BVECS)) | |
383 | return -ENOMEM; | |
384 | return 0; | |
385 | } | |
386 | ||
387 | void __cold btrfs_bioset_exit(void) | |
388 | { | |
389 | bioset_exit(&btrfs_bioset); | |
390 | } |